Fill a corpus with the data from each folder
library(tm)       # corpus handling and term-document matrices
library(plyr)     # ddply() and count()
library(stringr)  # str_replace()

# Read every file in each directory into its own volatile corpus
Spam_Dir <- DirSource("spam_2")
Spam_c <- VCorpus(Spam_Dir)
meta(Spam_c, "type") <- "Spam"
Ham_Dir <- DirSource("easy_ham")
Ham_c <- VCorpus(Ham_Dir)
meta(Ham_c, "type") <- "Ham"
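As a quick sanity check (a minimal sketch), confirm how many messages landed in each corpus; these lengths also drive the sparsity cutoff used below.
length(Spam_c)  # number of spam messages read from spam_2
length(Ham_c)   # number of ham messages read from easy_ham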
Create Term Document Matrices
# Control options for both matrices; wordLengths replaces the older
# minWordLength/maxWordLength arguments in current versions of tm
tdm_options <- list(removeNumbers = TRUE, removePunctuation = TRUE,
                    tolower = TRUE, stopwords = TRUE, stripWhitespace = TRUE,
                    wordLengths = c(2, 25))
TDM_spam <- TermDocumentMatrix(Spam_c, control = tdm_options)
# Drop terms that appear in fewer than 10 documents
TDM_spam <- removeSparseTerms(TDM_spam, 1 - (10 / length(Spam_c)))
TDM_Ham <- TermDocumentMatrix(Ham_c, control = tdm_options)
TDM_Ham <- removeSparseTerms(TDM_Ham, 1 - (10 / length(Ham_c)))
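To see what survived the pruning, tm's findFreqTerms() can list terms above a frequency floor; a quick peek (the 200 cutoff here is just an illustrative choice):
dim(TDM_spam)                           # terms x documents after pruning
findFreqTerms(TDM_spam, lowfreq = 200)  # terms appearing at least 200 times overall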
# Flatten each TDM to a Term/Doc/Freq data frame; going through as.matrix()
# makes the coercion explicit, then collapse to one total count per term
term_Spam_df <- as.data.frame(as.table(as.matrix(TDM_spam)))
names(term_Spam_df) <- c("Term", "Spam_Doc", "Spam_Freq")
term_Spam_df$Spam_Doc <- "Spam"
term_Spam_df$Spam_Freq[is.na(term_Spam_df$Spam_Freq)] <- 0
term_Spam_df <- ddply(term_Spam_df, .(Term, Spam_Doc), summarize,
                      Spam_Freq = sum(as.numeric(Spam_Freq)))
term_Ham_df <- as.data.frame(as.table(as.matrix(TDM_Ham)))
names(term_Ham_df) <- c("Term", "Ham_Doc", "Ham_Freq")
term_Ham_df$Ham_Doc <- "Ham"
term_Ham_df$Ham_Freq[is.na(term_Ham_df$Ham_Freq)] <- 0
term_Ham_df <- ddply(term_Ham_df, .(Term, Ham_Doc), summarize,
                     Ham_Freq = sum(as.numeric(Ham_Freq)))
Combine the two data frames and calculate the difference between each term's frequency in the ham files and in the spam files. Limit the final scoring data frame to the terms found much more commonly in spam documents (specifically, those with more than 100 more occurrences in spam than in ham).
Combo_df <- merge(term_Ham_df, term_Spam_df, by = "Term")
# Delta < 0 means the term occurs more often in spam than in ham
Combo_df$Delta <- Combo_df$Ham_Freq - Combo_df$Spam_Freq
Scoring_df <- Combo_df[order(Combo_df$Delta), ]
Scoring_df <- Scoring_df[Scoring_df$Delta < -100, ]
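A quick look at the head of Scoring_df shows the terms whose spam counts exceed their ham counts by the widest margin (the sort above puts the most negative deltas first):
head(Scoring_df, 10)   # the ten most spam-skewed terms
nrow(Scoring_df)       # how many terms cleared the -100 cutoff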
Develop a scoring scheme to determine the likelihood of a document being SPAM
Is_It_Spam <- function(filename){
  # Split the message into lowercase words, one per row
  text <- paste(readLines(filename), collapse = " ")
  word_df <- as.data.frame(strsplit(text, "\\W+"))
  names(word_df) <- "Text"
  word_df$Text <- tolower(word_df$Text)
  nWords <- nrow(word_df)
  # Look up each word's delta in the scoring table; unknown words score 0
  word_df$Delta <- 0
  for (i in 1:nWords){
    wordRow <- which(Scoring_df$Term == word_df[i, "Text"])
    if (length(wordRow) != 0){
      word_df[i, "Delta"] <- Scoring_df[wordRow, "Delta"]
    }
  }
  # The mean delta is the document's score; more negative means more spam-like
  mean(word_df$Delta)
}
Run several tests to determine a threshold between spam and ham for this score.
Is_It_Spam("spam_2/00005.ed0aba4d386c5e62bc737cf3f0ed9589")
## [1] -99.78102
Is_It_Spam("spam_2/00011.bd8c904d9f7b161a813d222230214d50")
## [1] -108.2053
Is_It_Spam("spam_2/01237.245ac1766016b756a3ddb3b463cc9645")
## [1] -41.63551
Is_It_Spam("spam_2/00014.13574737e55e51fe6737a475b88b5052")
## [1] -24.72703
Is_It_Spam("spam_2/00663.4baa9521293a04306b038be1f65d4471")
## [1] -5.683544
Is_It_Spam("spam_2/01040.24856bbcaedd4d7b28eae47d8f89a62f")
## [1] -61.06009
Is_It_Spam("easy_ham/00010.145d22c053c1a0c410242e46c01635b3")
## [1] -12.45862
Is_It_Spam("easy_ham/00916.1ea7a40e892220d43795fee49ab4849e")
## [1] -7.327492
Is_It_Spam("easy_ham/02462.4f93bc374730a117b103d5c3a2d699f6")
## [1] -7.111332
Is_It_Spam("easy_ham/02467.b05b3925df99da2a9426f07833ce5a9d")
## [1] -7.660173
Is_It_Spam("easy_ham/02475.9277ee243e3f51fa53ed6be55798d360")
## [1] -3.173554
Is_It_Spam("easy_ham/01183.36c29b5d6d19a328c9928a157019a49c")
## [1] -14.01753
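Rather than eyeballing a handful of files, the cutoff can also be checked against score distributions; a minimal sketch, assuming the score-returning version of Is_It_Spam above and sampling 50 files from each folder:
set.seed(607)  # arbitrary seed, for reproducibility
spam_sample <- sample(list.files("spam_2", full.names = TRUE), 50)
ham_sample <- sample(list.files("easy_ham", full.names = TRUE), 50)
summary(sapply(spam_sample, Is_It_Spam))  # spam scores should sit well below the cutoff
summary(sapply(ham_sample, Is_It_Spam))   # ham scores should sit above it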
Adjust the scoring scheme, using the threshold settled on above, so that it states whether a document is determined to be spam.
Is_It_Spam <- function(filename){
  # sapply() over a data frame column may hand us a factor; coerce to character
  if (is.factor(filename)){
    filename <- as.character(filename)
  }
  # Split the message into lowercase words, one per row
  text <- paste(readLines(filename), collapse = " ")
  word_df <- as.data.frame(strsplit(text, "\\W+"))
  names(word_df) <- "Text"
  word_df$Text <- tolower(word_df$Text)
  nWords <- nrow(word_df)
  # Look up each word's delta in the scoring table; unknown words score 0
  word_df$Delta <- 0
  for (i in 1:nWords){
    wordRow <- which(Scoring_df$Term == word_df[i, "Text"])
    if (length(wordRow) != 0){
      word_df[i, "Delta"] <- Scoring_df[wordRow, "Delta"]
    }
  }
  # Classify using the threshold chosen above
  if (mean(word_df$Delta) < -25) {
    return("SPAM")
  } else {
    return("HAM")
  }
}
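As an aside, the per-word loop does a full which() scan of Scoring_df for every word; match() performs the same lookup in a single vectorized pass. A sketch (spam_score is a hypothetical name, not part of the original code):
spam_score <- function(filename){
  # One lowercase word per element; same tokenization as Is_It_Spam
  words <- tolower(strsplit(paste(readLines(filename), collapse = " "), "\\W+")[[1]])
  # match() returns NA for words absent from the scoring table
  deltas <- Scoring_df$Delta[match(words, Scoring_df$Term)]
  deltas[is.na(deltas)] <- 0
  mean(deltas)
}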
Test on all of the Spam and Ham documents.
# Spam_Files and Ham_Files hold the paths to every message; rebuild them here
# in case they are not already in the environment
Spam_Files <- list.files("spam_2", full.names = TRUE)
Ham_Files <- list.files("easy_ham", full.names = TRUE)
testing_Spam <- data.frame(Spam_Files)
testing_Spam$Spam_Files <- as.character(testing_Spam$Spam_Files)
# Normalize any Windows-style backslashes in the paths
testing_Spam$Spam_Files <- str_replace(testing_Spam$Spam_Files, "\\\\", "/")
testing_Ham <- data.frame(Ham_Files)
testing_Ham$Ham_Files <- as.character(testing_Ham$Ham_Files)
testing_Ham$Ham_Files <- str_replace(testing_Ham$Ham_Files, "\\\\", "/")
testing_Spam$Test <- sapply(testing_Spam$Spam_Files, Is_It_Spam)
testing_Ham$Test <- sapply(testing_Ham$Ham_Files, Is_It_Spam)
Summarize results for spam and ham
print("Marking of the Spam files")
## [1] "Marking of the Spam files"
count(testing_Spam$Test)
## x freq
## 1 HAM 80
## 2 SPAM 1317
print("Marking of the Ham Files")
## [1] "Marking of the Ham Files"
count(testing_Ham$Test)
## x freq
## 1 HAM 2434
## 2 SPAM 67
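For reference, the error rates discussed below fall directly out of the two test frames (a sketch):
mean(testing_Spam$Test == "HAM")   # share of spam reaching the inbox: 80/1397, about 5.7%
mean(testing_Ham$Test == "SPAM")   # share of ham marked as spam: 67/2501, about 2.7%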
Based on these counts, it is evident that the equation needs adjustment. While missing 80 of 1397 spam emails (5.7%) and showing them in a customer's inbox may be acceptable (though this is still fairly high), wrongly labeling 67 of 2501 good emails (2.7%) as spam is not. To create a valid scheme for identifying spam, I would suggest looking at pairings of words in addition to their individual frequencies. The equation should also be weighted in favor of clean emails, to avoid mistakenly labeling something as spam when it is not. Finally, adding a function that looks at the sender would help greatly with filtering spam from ham.
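One way to weight the scheme in favor of clean email is simply to sweep the cutoff and watch the two error rates trade off; a minimal sketch, reusing the hypothetical spam_score() from above:
spam_scores <- sapply(testing_Spam$Spam_Files, spam_score)
ham_scores <- sapply(testing_Ham$Ham_Files, spam_score)
for (cut in c(-25L, -35L, -50L, -75L)){
  cat(sprintf("cutoff %4d: missed spam %5.1f%%, mislabeled ham %5.1f%%\n",
              cut, 100 * mean(spam_scores >= cut), 100 * mean(ham_scores < cut)))
}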