Our Project Team 4 (Banu Boopalan, Samuel Kigamba, James Mundy, Alain T Kuiete) will submit two separate RPubs documents. In this second document we perform data transformations and exploratory data analysis, visualize the corpus with word clouds and word-frequency plots, fit an SVM model, and report its confusion-matrix results. We tried to plot the fitted model with plot() but could not find a useful representation; the number of support vectors is large, so we need to dig deeper into how to visualize the model, whether through plot(), the kernlab package, or kernfit. Within the modeling pipeline we create a document-term matrix and a term-document matrix, split the data into training and test sets, run the model, and report the model summary. The accuracy the SVM reports differs for each teammate, since each of us reads our own files from a local directory. On first review, the SVM reported higher accuracy than Naive Bayes.
We collaborated via PowerPoint, GitHub, and GoToMeeting, with weekly meetings on Tuesdays and Fridays.
We use an SVM model in this Project 4 code. Our approach for this project follows:
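The attach and conflict messages below come from loading the project libraries. The exact library() calls aren't shown in this excerpt, so the list here is inferred from the conflict messages (tidyverse, tm/NLP, caret); the load order is an assumption.

# Assumed setup: package set inferred from the conflict messages below
# (NLP::annotate() implies tm/NLP is attached; caret::lift() implies caret).
library(caret)
library(tm)        # attaches NLP as a dependency
library(tidyverse)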
## -- Attaching packages --------------------------------------- tidyverse 1.2.1 --
## v tibble 2.1.3 v purrr 0.3.2
## v tidyr 0.8.3 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.4.0
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x NLP::annotate() masks ggplot2::annotate()
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
## x purrr::lift() masks caret::lift()
ham.dir="C:\\DATA607\\Project4\\spamHam\\20021010_easy_ham (1).tar\\easy_ham"
ham.file.names = list.files(ham.dir)
# List of docs
ham.docs <- ham.file.names[1]
for(i in 2:length(ham.file.names))
{
filepath<-paste0(ham.dir, "/", ham.file.names[i])
text <-readLines(filepath)
list1<- list(paste(text, collapse="\n"))
ham.docs = c(ham.docs,list1)
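The frequency plot below summarizes sender.df, whose construction isn't shown in this excerpt. A minimal sketch, assuming it mirrors the spam-side code later in the document (first sender address per message, plus its length):

# Sketch only: assumed to mirror spam.sender.df below.
ham.senders <- unlist(lapply(ham.docs, function(d)
  str_extract(d, "(?<name>[\\w.-]+)\\@(?<domain>[-\\w+\\.\\w+]+)(\\.\\w+)?")))
sender.df <- tibble(email = ham.senders, len = nchar(ham.senders))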
sender.df %>%
group_by(email) %>%
summarise(n=n())%>%
top_n(10)%>%
mutate(email = reorder(email, n)) %>%
ggplot(aes(email, n, fill = email)) +
geom_col(show.legend = FALSE) +
labs(y = "Most Frequent Senders",
x = NULL) +
coord_flip()
## Selecting by n
# Extract every e-mail address from each ham document
emails <- character(0)
for (i in seq_along(ham.docs)) {
  s <- unlist(str_extract_all(ham.docs[[i]], "(?<name>[\\w.-]+)\\@(?<domain>[-\\w+\\.\\w+]+)(\\.\\w+)?"))
  emails <- c(emails, s)
}
summary(emails)
## Length Class Mode
## 46500 character character
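The next plot is driven by ham.emails, which isn't constructed in this excerpt. A minimal sketch, assuming it mirrors the spam.emails tibble built later in the document:

# Sketch only: assumed to mirror the spam.emails tibble below.
ham.emails <- tibble(mail = seq_along(emails), emails, len = nchar(emails))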
ham.emails %>%
group_by(emails) %>%
summarise(n=n())%>%
top_n(20)%>%
mutate(emails = reorder(emails, n)) %>%
ggplot(aes(emails, n, fill = emails)) +
geom_col(show.legend = FALSE) +
labs(y = "Most Frequent emails",
x = NULL) +
coord_flip()
## Selecting by n
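The tf-idf filtering below operates on ham.block, which isn't built in this excerpt. A minimal sketch of the standard tidytext pattern, with the word/files/n column names taken from the warning that follows:

# Sketch only: one row per (file, word) with tf-idf scores; column names
# come from the bind_tf_idf() warning below.
library(tidytext)
ham.block <- tibble(files = seq_along(ham.docs), text = unlist(ham.docs)) %>%
  unnest_tokens(word, text) %>%
  count(files, word) %>%
  bind_tf_idf(word, files, n)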
## Warning in bind_tf_idf.data.frame(., word, files, n): A value for tf_idf is negative:
## Input should have exactly one row per document-term combination.
We select only words with IDF greater than 0 and remove words containing numbers:
ham.block2 <- ham.block %>%
filter(idf>0,str_detect(word,"([^\\d.+\\w.+\\.\\,.+]+?)")) %>%
arrange(desc(tf_idf))
ham.block2
## # A tibble: 4 x 6
## files word n tf idf tf_idf
## <int> <chr> <int> <dbl> <dbl> <dbl>
## 1 1795 laptop's 60 0.0167 6.46 0.108
## 2 1300 laptop's 620 0.00161 6.46 0.0104
## 3 1336 laptop's 645 0.00155 6.46 0.0100
## 4 1301 laptop's 826 0.00121 6.46 0.00782
ham.block2%>%
arrange(desc(tf_idf)) %>%
top_n(20)%>%
mutate(word = factor(word, levels = rev(unique(word)))) %>%
ggplot(aes(word, tf_idf, fill = files)) +
geom_col(show.legend = FALSE) +
labs(x = NULL, y = "tf-idf", title = "Most Relevent Words in the Body Messages") +
coord_flip()## Selecting by tf_idf
spam.dir="C:\\DATA607\\Project4\\spamHam\\spam4\\spam_2"
spam.file.names = list.files(spam.dir)
# List of docs
spam.docs <- spam.file.names[1]
for(i in 2:length(spam.file.names))
{
filepath<-paste0(spam.dir, "\\", spam.file.names[i])
text <-readLines(filepath)
l<- list(paste(text, collapse="\n"))
spam.docs = c(spam.docs,l)
}## Warning in bind_tf_idf.data.frame(., word, block, n): A value for tf_idf is negative:
## Input should have exactly one row per document-term combination.
# Extract the first sender address from each spam document
spam.senders <- character(0)
for (i in seq_along(spam.docs)) {
  s <- str_extract(spam.docs[[i]], "(?<name>[\\w.-]+)\\@(?<domain>[-\\w+\\.\\w+]+)(\\.\\w+)?")
  spam.senders <- c(spam.senders, s)
}
summary(spam.senders)
## Length Class Mode
## 1396 character character
head(spam.senders)
## [1] "lmrn@mailexcite.com" "amknight@mailexcite.com"
## [3] "jordan23@mailexcite.com" "merchantsworld2001@juno.com"
## [5] "cypherpunks-forward@ds.pro-ns.net" "sales@outsrc-em.com"
# nchar() is vectorized, so no per-element loop is needed
spam.email.len <- nchar(spam.senders)
spam.sender.df <- tibble(email = spam.senders, len = spam.email.len)
spam.sender.df
## # A tibble: 1,396 x 2
## email len
## <chr> <int>
## 1 lmrn@mailexcite.com 19
## 2 amknight@mailexcite.com 23
## 3 jordan23@mailexcite.com 23
## 4 merchantsworld2001@juno.com 27
## 5 cypherpunks-forward@ds.pro-ns.net 33
## 6 sales@outsrc-em.com 19
## 7 ormlh@imail.ru 14
## 8 spamassassin-sightings-admin@lists.sourceforge.net 50
## 9 fork-admin@xent.com 19
## 10 bduyisj36648@Email.cz 21
## # ... with 1,386 more rows
spam.sender.df %>%
group_by(email) %>%
summarise(n=n())%>%
top_n(10)%>%
mutate(email = reorder(email, n)) %>%
ggplot(aes(email, n, fill = email)) +
geom_col(show.legend = FALSE) +
labs(y = "Most Frequent Senders",
x = NULL) +
coord_flip()
## Selecting by n
# Extract every e-mail address from each spam document
spam.emails <- character(0)
for (i in seq_along(spam.docs)) {
  s <- unlist(str_extract_all(spam.docs[[i]], "(?<name>[\\w.-]+)\\@(?<domain>[-\\w+\\.\\w+]+)(\\.\\w+)?"))
  spam.emails <- c(spam.emails, s)
}
summary(spam.emails)
## Length Class Mode
## 22103 character character
# nchar() is vectorized, so no per-element loop is needed
len <- nchar(spam.emails)
spam.emails <- tibble(mail = 1:length(spam.emails), spam.emails, len)
spam.emails %>%
group_by(spam.emails) %>%
summarise(n=n())%>%
top_n(20)%>%
mutate(spam.emails = reorder(spam.emails, n)) %>%
ggplot(aes(spam.emails, n, fill = spam.emails)) +
geom_col(show.legend = FALSE) +
labs(y = "Most Frequent emails",
x = NULL) +
coord_flip()
## Selecting by n
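The plot below uses spam.block2, which isn't built in this excerpt. A minimal sketch, assuming it mirrors ham.block2 above, with the word/block/n column names taken from the warning emitted after the spam-reading loop:

# Sketch only: assumed to mirror ham.block2, keyed by block instead of files.
spam.block <- tibble(block = seq_along(spam.docs), text = unlist(spam.docs)) %>%
  unnest_tokens(word, text) %>%
  count(block, word) %>%
  bind_tf_idf(word, block, n)
spam.block2 <- spam.block %>%
  filter(idf > 0, str_detect(word, "([^\\d.+\\w.+\\.\\,.+]+?)")) %>%
  arrange(desc(tf_idf))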
spam.block2 %>%
top_n(10)%>%
mutate(word = factor(word, levels = rev(unique(word)))) %>%
mutate(block = reorder(block, tf_idf)) %>%
arrange(desc(tf_idf)) %>%
ggplot(aes(word, tf_idf, fill = block)) +
geom_col(show.legend = FALSE) +
labs(x = NULL, y = "tf-idf", title = "Most Relevent Words in the Bodies of Spam Email") +
coord_flip()## Selecting by tf_idf
create_corpus <- function(dir, label){
  corpus <- VCorpus(DirSource(dir)) %>%
    tm_map(PlainTextDocument) %>%
    tm_map(content_transformer(tolower)) %>%    # lower-case all text
    tm_map(removeWords, stopwords("SMART")) %>% # drop SMART stop words
    tm_map(removePunctuation) %>%               # strip punctuation
    tm_map(removeNumbers) %>%                   # strip digits
    tm_map(stripWhitespace) %>%                 # collapse extra whitespace
    tm_map(stemDocument)                        # reduce words to their stems
  meta(corpus, "LABEL") <- label
  return(corpus)
}
# Source directories (for reference):
#   easy_ham: C:\DATA607\Project4\spamHam\20021010_easy_ham (1).tar\easy_ham
#   spam:     C:\DATA607\Project4\spamHam\20021010_spam.tar\spam
corpus <- c(create_corpus("C:\\DATA607\\Project4\\spamHam\\spam3\\spam", "Spam"),
            create_corpus("C:\\DATA607\\Project4\\spamHam\\20021010_easy_ham (1).tar\\easy_ham", "Ham"))
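The excerpt jumps from building the corpus to removeSparseTerms() without showing the document-term matrix itself. A minimal sketch of the implied step:

# Assumed: a plain DocumentTermMatrix over the combined corpus, implied
# by the removeSparseTerms() call below.
dtm <- DocumentTermMatrix(corpus)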
# Only keep words found in at least 15 documents
min_docs <- 15
dtm <- removeSparseTerms(dtm, 1 - (min_docs / length(corpus)))
model_data <- as.matrix(dtm)
words <- rowSums(model_data)
model_data <- model_data / words
model_data <- data.frame(model_data)
model_data <- cbind(meta(corpus), model_data) %>%
  mutate(LABEL = as.factor(LABEL))
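The confusion matrix below compares predictions against testing_data$LABEL, but the train/test split and model fit aren't shown in this excerpt. A minimal sketch of the approach described in the introduction, using e1071::svm(); the 80/20 split fraction and the seed are assumptions:

# Sketch only: split fraction and seed are assumptions.
library(e1071)
set.seed(607)
train_idx     <- sample(seq_len(nrow(model_data)), size = 0.8 * nrow(model_data))
training_data <- model_data[train_idx, ]
testing_data  <- model_data[-train_idx, ]
svm_model     <- svm(LABEL ~ ., data = training_data)
summary(svm_model)
predictions   <- predict(svm_model, newdata = testing_data)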
library(kableExtra)
table(predictions, testing_data$LABEL) %>%
  kable() %>%
  kable_styling(bootstrap_options = c("striped", "hover", "responsive"))

|      | Ham | Spam |
|---|---|---|
| Ham | 636 | 15 |
| Spam | 1 | 110 |
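As a quick check on the accuracy claim in the introduction, the confusion matrix above implies:

# Overall accuracy from the confusion matrix above
(636 + 110) / (636 + 15 + 1 + 110)  # 746 / 762, about 0.979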