It can be useful to be able to classify new “test” documents using already classified “training” documents. A common example is using a corpus of labeled spam and ham (non-spam) e-mails to predict whether or not a new document is spam.
For this project, you can start with a spam/ham dataset, then predict the class of new documents (either withheld from the training dataset or from another source such as your own spam folder). One example corpus: https://spamassassin.apache.org/old/publiccorpus/
require(tm)
require(SnowballC)
require(wordcloud)
require(e1071)
# Load every ham (non-spam) e-mail into a data frame with one row per message:
#   text - the whole file, with its lines joined by "\n"
#   type - the class label, always "ham" here
# The directory is stored as a plain path. setwd() returns the *previous*
# working directory, so assigning its result would point list.files() at the
# wrong folder. Not changing the working directory also keeps later relative
# paths (such as the CSV written further down) out of the corpus folders.
ham_dir <- "C:/Users/MARYL/OneDrive/Desktop/easy_ham_2"
hamFileNames <- list.files(ham_dir)
# Read each listed file exactly once. Mapping over the paths replaces the index
# loop, so there is no NA placeholder element and the first file is not re-read
# on every iteration; an empty directory simply yields zero rows.
ham_docs <- vapply(
  file.path(ham_dir, hamFileNames),
  function(path) paste(readLines(path), collapse = "\n"),
  character(1),
  USE.NAMES = FALSE
)
ham_df <- data.frame(
  text = ham_docs,
  type = rep("ham", length(ham_docs)),
  stringsAsFactors = FALSE
)
# Load every spam e-mail into a data frame with one row per message:
#   text - the whole file, with its lines joined by "\n"
#   type - the class label, always "spam" here
# The directory is stored as a plain path. setwd() returns the *previous*
# working directory, so assigning its result would point list.files() at the
# wrong folder. Not changing the working directory also keeps later relative
# paths (such as the CSV written further down) out of the corpus folders.
spam_dir <- "C:/Users/MARYL/OneDrive/Desktop/spam_2"
spamFileNames <- list.files(spam_dir)
# Read each listed file exactly once. Mapping over the paths replaces the index
# loop, so there is no NA placeholder element and the first file is not re-read
# on every iteration; an empty directory simply yields zero rows.
spam_docs <- vapply(
  file.path(spam_dir, spamFileNames),
  function(path) paste(readLines(path), collapse = "\n"),
  character(1),
  USE.NAMES = FALSE
)
spam_df <- data.frame(
  text = spam_docs,
  type = rep("spam", length(spam_docs)),
  stringsAsFactors = FALSE
)
# Stack the ham rows on top of the spam rows to form the full labelled data set
spam_ham_df <- do.call(rbind, list(ham_df, spam_df))
# Save a CSV copy so the combined data can be loaded from any machine
write.csv(x = spam_ham_df, file = "spam_ham.csv")
# Show the structure of the combined data frame
str(object = spam_ham_df)
## 'data.frame': 2804 obs. of 2 variables:
## $ text: chr NA "From ilug-admin@linux.ie Tue Aug 6 11:51:02 2002\nReturn-Path: <ilug-admin@linux.ie>\nDelivered-To: yyyy@loca"| __truncated__ "From ilug-admin@linux.ie Tue Aug 6 11:51:02 2002\nReturn-Path: <ilug-admin@linux.ie>\nDelivered-To: yyyy@loca"| __truncated__ "From ilug-admin@linux.ie Tue Aug 6 11:51:02 2002\nReturn-Path: <ilug-admin@linux.ie>\nDelivered-To: yyyy@loca"| __truncated__ ...
## $ type: chr "ham" "ham" "ham" "ham" ...
# Load the published copy of the combined data set; text is kept as character
# rather than being converted to factors.
spam_ham_url <- "https://raw.githubusercontent.com/Luz917/spam_ham/master/spam_ham.csv"
spam_ham_csv <- read.csv(file = spam_ham_url, stringsAsFactors = FALSE)
str(object = spam_ham_csv)
## 'data.frame': 2801 obs. of 3 variables:
## $ X : int 1 2 3 4 5 6 7 8 9 10 ...
## $ text: chr NA "From ilug-admin@linux.ie Tue Aug 6 11:51:02 2002\nReturn-Path: <ilug-admin@linux.ie>\nDelivered-To: yyyy@loca"| __truncated__ "From ilug-admin@linux.ie Tue Aug 6 11:51:02 2002\nReturn-Path: <ilug-admin@linux.ie>\nDelivered-To: yyyy@loca"| __truncated__ "From ilug-admin@linux.ie Tue Aug 6 11:51:02 2002\nReturn-Path: <ilug-admin@linux.ie>\nDelivered-To: yyyy@loca"| __truncated__ ...
## $ type: chr "ham" "ham" "ham" "ham" ...
# Shuffle the rows so that ham and spam are interleaved before the positional
# train/test split further down. Fixing the seed makes the shuffle, and every
# figure derived from it, reproducible from run to run.
set.seed(1234)
random_spam_ham <- spam_ham_csv[sample(nrow(spam_ham_csv)), ]
str(random_spam_ham)
## 'data.frame': 2801 obs. of 3 variables:
## $ X : int 1864 241 2251 1065 1958 1317 2285 2498 1671 1473 ...
## $ text: chr "Return-Path: <exmh-workers-admin@spamassassin.taint.org>\nDelivered-To: yyyy@localhost.netnoteinc.com\nReceived"| __truncated__ "From ilug-admin@linux.ie Tue Aug 6 11:51:02 2002\nReturn-Path: <ilug-admin@linux.ie>\nDelivered-To: yyyy@loca"| __truncated__ "Return-Path: <exmh-workers-admin@spamassassin.taint.org>\nDelivered-To: yyyy@localhost.netnoteinc.com\nReceived"| __truncated__ "From ilug-admin@linux.ie Tue Aug 6 11:51:02 2002\nReturn-Path: <ilug-admin@linux.ie>\nDelivered-To: yyyy@loca"| __truncated__ ...
## $ type: chr "spam" "ham" "spam" "ham" ...
# Wrap the shuffled message texts in a tm corpus, one document per row
message_source <- VectorSource(random_spam_ham[["text"]])
sms_corpus <- Corpus(message_source)
print(sms_corpus)
## <<SimpleCorpus>>
## Metadata: corpus specific: 1, document level (indexed): 0
## Content: documents: 2801
# Clean the raw e-mails step by step. Every step takes the result of the
# previous one, so no transformation is silently discarded.
# translate all letters to lower case (content_transformer keeps the corpus
# structure intact when a plain character function is applied)
clean_corpus <- tm_map(sms_corpus, content_transformer(tolower))
# remove URLs: "http" followed by any run of non-space characters. This runs
# before punctuation is stripped, while "://", "." and "/" still hold a URL
# together as a single token.
removeURL <- function(x) gsub("http[^[:space:]]*", "", x)
clean_corpus <- tm_map(clean_corpus, content_transformer(removeURL))
# replace every non-word character with a space
clean_corpus <- tm_map(clean_corpus, content_transformer(gsub),
                       pattern = "\\W", replacement = " ")
# remove numbers
clean_corpus <- tm_map(clean_corpus, removeNumbers)
# remove punctuation
clean_corpus <- tm_map(clean_corpus, removePunctuation)
## remove stop words
clean_corpus <- tm_map(clean_corpus, removeWords, stopwords())
## remove whitespace
clean_corpus <- tm_map(clean_corpus, stripWhitespace)
# Count term occurrences per document (rows = documents, columns = terms)
sms_dtm <- DocumentTermMatrix(x = clean_corpus)
inspect(x = sms_dtm)
## <<DocumentTermMatrix (documents: 2801, terms: 556)>>
## Non-/sparse entries: 887448/669908
## Sparsity : 43%
## Maximal term length: 15
## Weighting : term frequency (tf)
## Sample :
## Terms
## Docs com exmh invoked line list org spamassassin taint within workers
## 1 18 33 25 23 17 24 22 22 25 22
## 10 18 33 25 23 17 24 22 22 25 22
## 12 18 33 25 23 17 24 22 22 25 22
## 13 18 33 25 23 17 24 22 22 25 22
## 14 18 33 25 23 17 24 22 22 25 22
## 3 18 33 25 23 17 24 22 22 25 22
## 5 18 33 25 23 17 24 22 22 25 22
## 7 18 33 25 23 17 24 22 22 25 22
## 8 18 33 25 23 17 24 22 22 25 22
## 9 18 33 25 23 17 24 22 22 25 22
# Drop terms whose sparsity exceeds the 0.10 threshold, then show what is left
sms_dtm <- removeSparseTerms(sms_dtm, sparse = 0.10)
inspect(x = sms_dtm)
## <<DocumentTermMatrix (documents: 2801, terms: 78)>>
## Non-/sparse entries: 218322/156
## Sparsity : 0%
## Maximal term length: 10
## Weighting : term frequency (tf)
## Sample :
## Terms
## Docs aug com esmtp list localhost message one org received within
## 1 12 18 6 17 9 10 7 24 10 25
## 10 12 18 6 17 9 10 7 24 10 25
## 12 12 18 6 17 9 10 7 24 10 25
## 13 12 18 6 17 9 10 7 24 10 25
## 14 12 18 6 17 9 10 7 24 10 25
## 3 12 18 6 17 9 10 7 24 10 25
## 5 12 18 6 17 9 10 7 24 10 25
## 7 12 18 6 17 9 10 7 24 10 25
## 8 12 18 6 17 9 10 7 24 10 25
## 9 12 18 6 17 9 10 7 24 10 25
# Row positions of the spam messages in the shuffled data; preview the first 3
is_spam <- random_spam_ham$type == "spam"
just_spam <- which(is_spam)
just_spam[seq_len(3)]
## [1] 1 3 5
# Row positions of the ham messages in the shuffled data; preview the first 3
is_ham <- random_spam_ham$type == "ham"
just_ham <- which(is_ham)
just_ham[seq_len(3)]
## [1] 2 4 6
# Word cloud of the cleaned ham messages: at most 100 words that occur at
# least 50 times, most frequent words drawn first, colours picked at random
# from palette entries 1 to 4.
wordcloud(
  clean_corpus[just_ham],
  max.words = 100,
  min.freq = 50,
  rot.per = 0.60,
  random.order = FALSE,
  random.color = TRUE,
  colors = 1:4
)
# Same plot settings for the cleaned spam messages
wordcloud(
  clean_corpus[just_spam],
  max.words = 100,
  min.freq = 50,
  rot.per = 0.60,
  random.order = FALSE,
  random.color = TRUE,
  colors = 1:4
)
# Split the shuffled data 60% / 40% into training and test sets. The cut point
# is derived from the number of rows instead of being hard-coded, so the split
# stays valid if the data set changes size (2801 rows still gives 1680 / 1121).
n_docs <- nrow(random_spam_ham)
n_train <- floor(0.6 * n_docs)
train_rows <- seq_len(n_train)
test_rows <- n_train + seq_len(n_docs - n_train)
sms_raw_train <- random_spam_ham[train_rows, ] ## 60% for training
sms_raw_test <- random_spam_ham[test_rows, ] ## 40% for test
sms_dtm_train <- sms_dtm[train_rows, ]
sms_dtm_test <- sms_dtm[test_rows, ]
sms_corpus_train <- clean_corpus[train_rows]
sms_corpus_test <- clean_corpus[test_rows]
# Training rows of each class
spam <- subset(sms_raw_train, type == "spam")
ham <- subset(sms_raw_train, type == "ham")
# Term counts for each split. The test matrix is restricted to the training
# vocabulary so both matrices describe exactly the same set of terms.
sms_train <- DocumentTermMatrix(sms_corpus_train)
sms_test <- DocumentTermMatrix(sms_corpus_test,
                               control = list(dictionary = Terms(sms_train)))
# Recode term counts as a presence indicator.
# x: vector of counts.
# Returns a factor with levels "No" (count not above zero) and "Yes"
# (count above zero); NA counts stay NA.
convert_count <- function(x) {
  present <- x > 0
  factor(present, levels = c(FALSE, TRUE), labels = c("No", "Yes"))
}
# Recode term counts as No/Yes indicators, one factor column per term.
# apply() would flatten the factors returned by convert_count() into a
# character matrix and drop their shared No/Yes levels. Converting column by
# column inside a data frame keeps both levels on every column, so training
# and test data are encoded identically even when a term is present (or
# absent) in every document of one split.
to_indicator_frame <- function(dtm) {
  counts <- as.data.frame(as.matrix(dtm), stringsAsFactors = FALSE)
  # Assigning into counts[] keeps the data frame and its term column names
  counts[] <- lapply(counts, convert_count)
  counts
}
sms_train <- to_indicator_frame(sms_train)
sms_test <- to_indicator_frame(sms_test)
The R function for the Naive Bayes classifier is naiveBayes(), which is provided by the e1071 package.
Naive Bayes has the capability to assign the probability that a new sample is either spam or ham.
This is based on Bayes' rule: it analyses how frequently words occur in each class, under the assumption that the words occur independently of one another.
Here we create the Naive Bayes Classifier which is done on the training data
# Fit the Naive Bayes model: term indicators as predictors, message type as
# the class label.
train_labels <- factor(sms_raw_train$type)
sms_classifier <- naiveBayes(x = sms_train, y = train_labels)
class(sms_classifier)
## [1] "naiveBayes"
# Classify the held-out test messages with the fitted model
sms_test_pred <- predict(object = sms_classifier, newdata = sms_test)
# Confusion matrix: predicted class in rows, true class in columns
table(sms_test_pred, sms_raw_test$type)
##
## sms_test_pred ham spam
## ham 547 1
## spam 0 573
Ham Accuracy
# Share of the actual ham test messages that were predicted as ham, computed
# from the predictions so it always agrees with the confusion matrix above.
mean(sms_test_pred[sms_raw_test$type == "ham"] == "ham")
## [1] 0.9981851
Spam Accuracy
# Share of the actual spam test messages that were predicted as spam, computed
# from the predictions so it always agrees with the confusion matrix above.
mean(sms_test_pred[sms_raw_test$type == "spam"] == "spam")
## [1] 1
Naive Bayes is one of the best methods for spam filtering. Based on the results above, it correctly classified 99.82% of ham and 100% of spam. Strangely enough, when I restricted the model to the most frequent terms, it classified the test messages incorrectly. I run that version at the bottom for comparison.
1.Text mining example: spam filtering (This was used as a guideline) https://www3.nd.edu/~steve/computing_with_data/20_text_mining/text_mining_example.html#/
# Terms that occur at least 5 times in the training document-term matrix,
# followed by how many such terms there are.
mostfreq <- findFreqTerms(sms_dtm_train, lowfreq = 5)
length(mostfreq)
## [1] 78
# Preview the first five frequent terms
mostfreq[seq_len(5)]
## [1] "admin" "also" "ascii" "aug" "beenthere"
# Second model: the same classifier restricted to the frequent terms.
# Re-count only the terms listed in mostfreq for each split.
sms_train_freq <- DocumentTermMatrix(sms_corpus_train,
                                     control = list(dictionary = mostfreq))
sms_test_freq <- DocumentTermMatrix(sms_corpus_test,
                                    control = list(dictionary = mostfreq))
# Recode counts as No/Yes factors column by column. apply() would flatten the
# factors returned by convert_count() into a character matrix and drop their
# shared levels; a data frame of factors keeps training and test columns
# encoded identically.
freq_indicator_frame <- function(dtm) {
  counts <- as.data.frame(as.matrix(dtm), stringsAsFactors = FALSE)
  counts[] <- lapply(counts, convert_count)
  counts
}
sms_train_freq <- freq_indicator_frame(sms_train_freq)
sms_test_freq <- freq_indicator_frame(sms_test_freq)
sms_classifier1 <- naiveBayes(sms_train_freq, factor(sms_raw_train$type))
# Predict on the frequent-term test data prepared above, so the model is
# evaluated on the same feature set it was trained on (sms_test_freq was
# previously computed but never used).
sms_test_pred1 <- predict(sms_classifier1, newdata = sms_test_freq)
table(sms_test_pred1, sms_raw_test$type)
##
## sms_test_pred1 ham spam
## ham 0 1
## spam 547 573