library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.4.2
## Warning: package 'ggplot2' was built under R version 4.4.2
## Warning: package 'tibble' was built under R version 4.4.2
## Warning: package 'stringr' was built under R version 4.4.2
## Warning: package 'lubridate' was built under R version 4.4.2
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
library(tidytext)
## Warning: package 'tidytext' was built under R version 4.4.2
library(qdap)
## Warning: package 'qdap' was built under R version 4.4.2
## Loading required package: qdapDictionaries
## Loading required package: qdapRegex
## Warning: package 'qdapRegex' was built under R version 4.4.2
##
## Attaching package: 'qdapRegex'
##
## The following object is masked from 'package:dplyr':
##
## explain
##
## The following object is masked from 'package:ggplot2':
##
## %+%
##
## Loading required package: qdapTools
## Warning: package 'qdapTools' was built under R version 4.4.2
##
## Attaching package: 'qdapTools'
##
## The following object is masked from 'package:dplyr':
##
## id
##
## Loading required package: RColorBrewer
##
## Attaching package: 'qdap'
##
## The following objects are masked from 'package:base':
##
## Filter, proportions
library("tm")
## Warning: package 'tm' was built under R version 4.4.2
## Loading required package: NLP
## Warning: package 'NLP' was built under R version 4.4.2
##
## Attaching package: 'NLP'
##
## The following object is masked from 'package:qdap':
##
## ngrams
##
## The following object is masked from 'package:ggplot2':
##
## annotate
##
##
## Attaching package: 'tm'
##
## The following objects are masked from 'package:qdap':
##
## as.DocumentTermMatrix, as.TermDocumentMatrix
library(caret)
## Warning: package 'caret' was built under R version 4.4.2
## Loading required package: lattice
##
## Attaching package: 'caret'
##
## The following object is masked from 'package:purrr':
##
## lift
library(stringi)
library(randomForest)
## Warning: package 'randomForest' was built under R version 4.4.2
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
##
## The following object is masked from 'package:dplyr':
##
## combine
##
## The following object is masked from 'package:ggplot2':
##
## margin
It can be useful to be able to classify new “test” documents using already classified “training” documents. A common example is using a corpus of labeled spam and ham (non-spam) e-mails to predict whether or not a new document is spam.
For this project, you can start with a spam/ham dataset, then predict the class of new documents (either withheld from the training dataset or from another source such as your own spam folder). One example corpus: https://spamassassin.apache.org/old/publiccorpus/

***
# Load data from spamassassin.apache.org/old/publiccorpus/: 20021010_easy_ham (ham) and 20050311_spam_2 (spam)
# ```{r}
# download.file(url = "http://spamassassin.apache.org/old/publiccorpus/20021010_easy_ham.tar.bz2", destfile = "20021010_easy_ham.tar.bz2")
# download.file(url = "http://spamassassin.apache.org/old/publiccorpus/20050311_spam_2.tar.bz2", destfile = "20050311_spam_2.tar.bz2")
# Locations of the extracted SpamAssassin corpora.
# Both folders live under one project directory, so the base path is written
# once and the two corpus folders are derived from it with file.path().
data_directory <- "C:/Users/asadn/Desktop/All/SPS/Fall-2024/DATA-607/Assignment/week14-project4"
spam_directory <- file.path(data_directory, "20050311_spam_2")
ham_directory <- file.path(data_directory, "20021010_easy_ham")
#' Read every file under a directory into a labelled data frame.
#'
#' @param path Directory that is scanned recursively for files.
#' @param tag Label written to the `tag` column of every row (e.g. "ham").
#' @return A data frame with one row per file and two columns: `email`
#'   (the whole file as a single string) and `tag`. A directory without
#'   files yields a zero-row data frame with the same two columns.
dfun <- function(path, tag) {
  # list.files() silently returns character(0) for a missing directory, which
  # used to surface later as an unrelated "replacement has 1 row" error.
  if (!dir.exists(path)) {
    stop("Directory not found: ", path, call. = FALSE)
  }

  files <- list.files(path = path, full.names = TRUE, recursive = TRUE)

  # vapply() always returns a character vector, even when `files` is empty,
  # so the data frame below keeps its shape for an empty directory.
  email <- vapply(
    files,
    function(file) read_file(file),
    character(1),
    USE.NAMES = FALSE
  )

  data.frame(
    email = email,
    tag = rep_len(tag, length(email)),
    stringsAsFactors = FALSE
  )
}
# Read both corpora with the loader, each labelled with its class.
spam_data <- dfun(path = spam_directory, tag = "spam")
ham_data <- dfun(path = ham_directory, tag = "ham")

# Stack ham on top of spam. Note that this rebinds the name `dfun` from the
# loader function to the combined data frame, which the next step reads.
dfun <- rbind(ham_data, spam_data)

# Number of messages per class.
table(dfun[["tag"]])
##
## ham spam
## 2551 1397
# Clean the raw message text and keep one row per message in column `text`.
# `dfun` is the combined ham/spam data frame with columns `email` and `tag`.
df <- dfun %>%
  mutate(
    # Strip anything that looks like a markup tag, then digits and punctuation.
    email = str_remove_all(email, pattern = "<.*?>"),
    email = str_remove_all(email, pattern = "[:digit:]"),
    email = str_remove_all(email, pattern = "[:punct:]"),
    # Replace line breaks with a space instead of deleting them; deleting
    # them glued the last word of a line to the first word of the next one.
    email = str_replace_all(email, pattern = "[\r\n]+", replacement = " "),
    email = str_to_lower(email)
  ) %>%
  # With the line breaks gone each message is a single paragraph, so this
  # yields one `text` row per message.
  unnest_tokens(
    output = text,
    input = email,
    token = "paragraphs",
    format = "text"
  )
# The former anti_join(stop_words, by = c("text" = "word")) compared whole
# messages against single stop words, so it could only ever drop a message
# consisting of exactly one stop word. Stop words are removed per word when
# the tm corpus is cleaned with removeWords().
# Turn the message text into a tm corpus and normalise it in one chain:
# lower-case, drop numbers and punctuation, collapse whitespace, remove
# English stop words, then stem what is left.
corpus <- df$text %>%
  VectorSource() %>%
  VCorpus() %>%
  tm_map(content_transformer(stringi::stri_trans_tolower)) %>%
  tm_map(removeNumbers) %>%
  tm_map(removePunctuation) %>%
  tm_map(stripWhitespace) %>%
  tm_map(removeWords, stopwords("english")) %>%
  tm_map(stemDocument)
# Fix the RNG so the shuffle, the train/test split and the forest can be
# reproduced from run to run.
set.seed(123)

# Shuffle the labelled messages. `corpus` was built from `df$text` before this
# point, so the same permutation is applied to the feature rows further down
# to keep every document lined up with its label.
shuffled <- sample(nrow(df))
df <- df[shuffled, ]
df$tag <- as.factor(df$tag)

# Build the document-term matrix from the cleaned corpus.
# NOTE(review): the corpus has already been passed through stemDocument, so
# `stemming = TRUE` is a second stemming pass -- confirm that it is intended.
dtmatrix <- DocumentTermMatrix(corpus, control = list(stemming = TRUE))
# Drop terms whose sparsity exceeds 0.999.
dtmatrix <- removeSparseTerms(dtmatrix, 0.999)

# Recode term counts as presence flags: 0 = absent, 1 = present.
convert_count_fun <- function(x) {
  y <- ifelse(x > 0, 1, 0)
  factor(y, levels = c(0, 1), labels = c(0, 1))
}

tmp <- apply(dtmatrix, 2, convert_count_fun)
df_matrix <- as.data.frame(as.matrix(tmp))
# apply() flattens the factors to character; turn every term column back into
# a factor with both levels so training and test rows share the same levels.
df_matrix[] <- lapply(df_matrix, factor, levels = c("0", "1"))
# Apply the permutation used for `df` so row i of the features belongs to
# row i of `df`.
df_matrix <- df_matrix[shuffled, , drop = FALSE]
# Attach the ham/spam label. This used to be a self-assignment, which left the
# label out of the feature table. A term column that happens to be named
# "class" is overwritten by the label.
df_matrix$class <- df$tag
str(df_matrix$class)

# Stratified 70/30 split on the label; `prediction` holds the training rows.
prediction <- createDataPartition(df_matrix$class, p = 0.7, list = FALSE, times = 1)
head(prediction)

train_rows <- prediction[, 1]
training <- df_matrix[train_rows, , drop = FALSE]
testing <- df_matrix[-train_rows, , drop = FALSE]

# Train on the term-presence features only. The label must not be among the
# predictors, otherwise the forest simply reads the answer back.
feature_cols <- setdiff(names(df_matrix), "class")
classifier <- randomForest(
  x = training[, feature_cols, drop = FALSE],
  y = training$class,
  ntree = 300
)
predicted <- predict(classifier, newdata = testing[, feature_cols, drop = FALSE])
confusionMatrix(table(predicted, testing$class))
# NOTE: earlier runs passed `tag` itself to the forest as a predictor, which
# produces a trivially perfect confusion matrix. Re-run this chunk to refresh
# any saved output.
## Confusion Matrix and Statistics
##
##
## predicted ham spam
## ham 760 0
## spam 0 423
##
## Accuracy : 1
## 95% CI : (0.9969, 1)
## No Information Rate : 0.6424
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 1
##
## Mcnemar's Test P-Value : NA
##
## Sensitivity : 1.0000
## Specificity : 1.0000
## Pos Pred Value : 1.0000
## Neg Pred Value : 1.0000
## Prevalence : 0.6424
## Detection Rate : 0.6424
## Detection Prevalence : 0.6424
## Balanced Accuracy : 1.0000
##
## 'Positive' Class : ham
##
Conclusion: In this project, I learned how to read the text of e-mails and transform it into a data frame. I also learned how to use that data frame to predict spam.