Assignment
————————————————————————————————————
Library Definition
library(tidyverse)
library(stringr)
library(knitr)
library(R.utils)
library(tm)
library(wordcloud)
library(topicmodels)
library(SnowballC)
library(e1071)
library(stats)
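If any of these packages are not yet installed, a one-time setup along these lines should work (assuming a configured CRAN mirror; stats ships with base R):
install.packages(c("tidyverse", "stringr", "knitr", "R.utils", "tm",
                   "wordcloud", "topicmodels", "SnowballC", "e1071"))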
Sources for ham and spam
url.spam <- "http://spamassassin.apache.org/old/publiccorpus/"
file.spam <- "20050311_spam_2.tar.bz2"
url.ham <- "http://spamassassin.apache.org/old/publiccorpus/"
file.ham <- "20030228_easy_ham.tar.bz2"
Function to download
downloadTAR <- function(filetype = NULL, myurl = NULL, myrootfile = NULL){
  tarfile <- paste(filetype, ".tar", sep = "")
  if(!file.exists(tarfile)){
    myfile <- paste(myurl, myrootfile, sep = "")
    bz2file <- paste(filetype, ".tar.bz2", sep = "")
    download.file(myfile, destfile = bz2file)
    # bunzip2() removes the .bz2 archive and leaves the .tar behind
    bunzip2(bz2file)
    # Extract the messages into the working directory
    untar(tarfile)
  }
  # Return the file names contained in the archive
  mycompressedfilenames <- untar(tarfile, list = TRUE)
  return(mycompressedfilenames)
}
spamFileNames <- downloadTAR("Spam", url.spam, file.spam)
hamFileNames <- downloadTAR("Ham", url.ham, file.ham)
Clean up the file names and keep only those whose names are exactly 38 characters long
spamfiles <- str_trim(str_replace_all(spamFileNames, "spam_2/", ""))
hamfiles <- str_trim(str_replace_all(hamFileNames, "easy_ham/", ""))
spamfiles <- subset(spamfiles, nchar(spamfiles) == 38)
hamfiles <- subset(hamfiles, nchar(hamfiles) == 38)
summary(spamfiles); summary(hamfiles)
##    Length     Class      Mode
##      1396 character character
##    Length     Class      Mode
##      2500 character character
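The 38-character filter keeps the individual message files, which appear to be named as a numeric index, a dot, and a 32-character checksum, while dropping directory entries and other archive artifacts. A quick peek verifies the pattern (a sketch; output will vary with the corpus):
# Inspect a few of the retained names and their lengths
head(spamfiles, 3)
nchar(head(spamfiles, 3))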
Read the 2500 ham files and 1396 spam files
readFileContents <- function(importtype = NULL, filenames = NULL){
  # Paths are relative to the working directory where the archives were untarred
  if (importtype == "Spam") {
    filecon <- file.path("spam_2", filenames)
  }
  if (importtype == "Ham") {
    filecon <- file.path("easy_ham", filenames)
  }
  mydata <- vector("list", length(filenames))
  for(i in seq_along(filenames)){
    conn <- file(filecon[i], "r", blocking = FALSE)
    # Collapse each message into a single string
    mydata[[i]] <- str_c(readLines(conn, warn = FALSE), collapse = "")
    close(conn)
  }
  return(mydata)
}
spams <- readFileContents("Spam", spamfiles)
hams <- readFileContents("Ham", hamfiles)
Create two data frames, spams_df and hams_df
# Flatten the lists of message strings into character vectors
finalspams <- unlist(spams)
finalhams <- unlist(hams)
spams_df <- data.frame(finalspams, stringsAsFactors = FALSE)
hams_df <- data.frame(finalhams, stringsAsFactors = FALSE)
spams_df$type <- "Spams"
hams_df$type <- "Hams"
spams_df$file <- spamfiles
hams_df$file <- hamfiles
#reorder by column index
spams_df <- spams_df[c(2,3,1)]
hams_df <- hams_df[c(2,3,1)]
names(spams_df) <- c("type","file","Content")
names(hams_df) <- c("type","file","Content")
Combining the two data frames into one
emails_df <- bind_rows(spams_df, hams_df)
# Count the emails of each type
finalspamsTotalEmails <- dim(spams_df)[1]
finalhamsTotalEmails <- dim(hams_df)[1]
Some results
The total number of known spams is 1396.
The total number of known hams is 2500.
Grand total of emails: 3896.
Analysis
Length of Emails
spamsLength <- nchar(spams_df$Content)
hamsLength <- nchar(hams_df$Content)
Spams Statistics
summary(spamsLength)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
##     725    2458    4004    6183    7020   89213
Distribution
hist(spamsLength, main="Spams Length Frequency", xlab="Length of Emails", breaks = 100)
Hams Statistics
Summary
summary(hamsLength)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
##     355    1644    3081    3364    4039   88593
Distribution
hist(hamsLength, main="Hams Length Frequency", xlab="Length of Emails", breaks = 100)
Median Length
spamsMedian <- median(spamsLength)
hamsMedian <- median(hamsLength)
medianDiff <- spamsMedian - hamsMedian
medianPercentile <- round(((spamsMedian / hamsMedian) - 1) * 100, 2)
This analysis shows that, in our pool of labeled emails, spam messages tend to have a longer median length than ham messages:
Median length of spams: 4004.
Median length of hams: 3081.
Difference of medians: 923.
Percentage difference: 29.96%.
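As a quick visual check of this gap, the two length distributions can be compared side by side (a minimal sketch using the spamsLength and hamsLength vectors computed above; a log scale tames the long right tails seen in the histograms):
# Side-by-side length distributions on a log scale
boxplot(list(Spams = spamsLength, Hams = hamsLength),
        main = "Email Length by Type", ylab = "Characters", log = "y")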
Using the tm Package: remove numbers, punctuation, stop words, extra white space, etc.
sms_corpus <- Corpus(VectorSource(emails_df$Content))
#translate all letters to lower case
clean_corpus<- tm_map(sms_corpus, content_transformer(tolower))
# remove numbers
clean_corpus <- tm_map(clean_corpus, removeNumbers)
#inspect(clean_corpus[1:3])
# remove punctuation
clean_corpus <- tm_map(clean_corpus, removePunctuation)
# remove stop words
clean_corpus <- tm_map(clean_corpus, removeWords, stopwords())
# remove extra white spaces
clean_corpus <- tm_map(clean_corpus, stripWhitespace)
# Stem
release_corpus <- tm_map(clean_corpus, content_transformer(stemDocument))
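To verify that the transformations behaved as expected, it helps to compare one message before and after cleaning (a quick sketch; the first document is an arbitrary choice):
# Raw vs. cleaned-and-stemmed text for the first message
substr(as.character(sms_corpus[[1]]), 1, 200)
substr(as.character(release_corpus[[1]]), 1, 200)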
# Indices
spam_indices <- which(emails_df$type == "Spams")
ham_indices <- which(emails_df$type == "Hams")
Wordclouds
Spam
# Word Cloud
suppressMessages(suppressWarnings(wordcloud(clean_corpus[spam_indices], min.freq=250)))
Ham
# Word Cloud
suppressMessages(suppressWarnings(wordcloud(clean_corpus[ham_indices], min.freq=250)))
Training data
Divide corpus into training and test data
Use 75% training and 25% test.
# Randomize the email order; reuse the same permutation for the corpus
# so that documents and labels stay aligned (set.seed added for reproducibility)
set.seed(123)
shuffle <- sample(nrow(emails_df))
random_emails <- emails_df[shuffle,]
NEmails <- dim(random_emails)[1]
NEmailsQ <- floor(NEmails / 4 * 3)
random_emails_train <- random_emails[1:NEmailsQ,]
random_emails_test <- random_emails[(NEmailsQ + 1):NEmails,]
# Document-term matrix and clean corpus, shuffled the same way
shuffled_corpus <- clean_corpus[shuffle]
emails_corpus_train <- shuffled_corpus[1:NEmailsQ]
emails_corpus_test <- shuffled_corpus[(NEmailsQ + 1):NEmails]
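As a sanity check on the split, the spam/ham proportions should come out roughly equal in both halves (a quick sketch using the frames created above):
# Compare class proportions in the training and test sets
prop.table(table(random_emails_train$type))
prop.table(table(random_emails_test$type))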
# Text to Matrix in order to Tokenize the corpus
emails_dtm_train <- DocumentTermMatrix(emails_corpus_train)
emails_dtm_train <- removeSparseTerms(emails_dtm_train, 1-(10/length(release_corpus)))
emails_dtm_test <- DocumentTermMatrix(emails_corpus_test)
emails_dtm_test <- removeSparseTerms(emails_dtm_test, 1-(10/length(release_corpus)))
emails_tdm_train <- TermDocumentMatrix(emails_corpus_train)
emails_tdm_train <- removeSparseTerms(emails_tdm_train, 1-(10/length(release_corpus)))
emails_tdm_test <- TermDocumentMatrix(emails_corpus_test)
emails_tdm_test <- removeSparseTerms(emails_tdm_test, 1-(10/length(release_corpus)))
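Before filtering by frequency, it is worth confirming the size of the resulting matrices (a quick sketch; the exact term counts depend on the sparsity threshold above):
dim(emails_dtm_train)  # documents x retained terms
dim(emails_dtm_test)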
five_times_words <- findFreqTerms(emails_dtm_train, 5)
Create document-term matrices using frequent words
emails_train <- DocumentTermMatrix(emails_corpus_train, control=list(dictionary = five_times_words))
emails_test <- DocumentTermMatrix(emails_corpus_test, control=list(dictionary = five_times_words))
Convert count information to "Yes", "No"
Naive Bayes classification needs presence/absence information for each word in a message, but we have counts of occurrences, so we convert the document-term matrices.
convert_count <- function(x) {
  y <- ifelse(x > 0, 1, 0)
  y <- factor(y, levels = c(0, 1), labels = c("No", "Yes"))
  y
}
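For example, on a toy count vector the function maps zero to "No" and anything positive to "Yes" (a quick illustration, not part of the pipeline):
convert_count(c(0, 3, 1, 0))
## [1] No  Yes Yes No
## Levels: No Yes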
emails_train <- apply(emails_train, 2, convert_count)
emails_test <- apply(emails_test, 2, convert_count)
The Naive Bayes function
We’ll use the Naive Bayes classifier provided by the e1071 package.
emails_classifier <- naiveBayes(emails_train, factor(random_emails_train$type))
emails_test_pred <- predict(emails_classifier, newdata=emails_test)
summary(emails_test_pred)
##  Hams Spams
##   298  3598
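A natural next step is to compare the predictions against the true labels with a confusion matrix (a minimal sketch, assuming random_emails_test$type holds the actual classes for the test rows):
# Cross-tabulate predicted vs. actual classes on the test set
table(Predicted = emails_test_pred, Actual = random_emails_test$type)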