To perform the ham/spam classification I implemented the following:
library(tm)
library(plyr)
library(dplyr)
library(tidyr)
Downloaded the ham and spam corpora from http://spamassassin.apache.org/old/publiccorpus/ and saved them on the local machine.
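A minimal sketch of that download step (the archive names below are examples from the corpus listing and the local path is my project folder; adjust both as needed):
base_url <- "http://spamassassin.apache.org/old/publiccorpus"
dest <- "/Users/olga/desktop/project4"
#download one ham and one spam archive (other snapshots are listed at the same URL)
download.file(sprintf("%s/20021010_easy_ham.tar.bz2", base_url), file.path(dest, "easy_ham.tar.bz2"))
download.file(sprintf("%s/20021010_spam.tar.bz2", base_url), file.path(dest, "spam.tar.bz2"))
#untar() handles .tar.bz2 archives; the extracted folders need to end up as ham/ and spam/ under the project path
untar(file.path(dest, "easy_ham.tar.bz2"), exdir = dest)
untar(file.path(dest, "spam.tar.bz2"), exdir = dest)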
Built a function that cleans the corpora.
clean_corpus<-function(corpus){
  #remove punctuation
  corpus_data<-tm_map(corpus,removePunctuation)
  #strip extra white space
  corpus_data<-tm_map(corpus_data,stripWhitespace)
  #convert to lower case
  corpus_data<-tm_map(corpus_data,content_transformer(tolower))
  #remove numbers
  corpus_data<-tm_map(corpus_data,removeNumbers)
  #optionally remove common English stopwords
  #corpus_data<-tm_map(corpus_data, removeWords, stopwords('english'))
  return(corpus_data)
}
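As a quick sanity check, clean_corpus can be applied to a tiny in-memory corpus (the two sample strings below are made up for illustration):
test_corpus <- Corpus(VectorSource(c("Hello, World 123!!", "FREE offer, call 555 now")))
test_clean <- clean_corpus(test_corpus)
#punctuation, digits and upper case are gone after cleaning
lapply(test_clean, as.character)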
build_tmd<-function(type,path){
  dir <- sprintf("%s/%s",path,type)
  cor <- Corpus(DirSource(directory=dir))
  #apply the clean_corpus function
  corp_clean <- clean_corpus(cor)
  tdm <- TermDocumentMatrix(corp_clean)
  #remove sparse terms (keep only terms appearing in at least ~30% of the documents)
  tdm <- removeSparseTerms(tdm,0.7)
  return(tdm)
}
#load the ham corpus and build its term-document matrix
ham <- build_tmd("ham","/Users/olga/desktop/project4/")
#create ham data frame
ham_data <- as.data.frame(as.table(ham))
#load the spam corpus and build its term-document matrix
spam <- build_tmd("spam","/Users/olga/desktop/project4/")
#create spam data frame
spam_data <- as.data.frame(as.table(spam))
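as.data.frame(as.table(...)) flattens each term-document matrix into long format, one row per Terms/Docs pair with its Freq; a quick structural check (output omitted here) could be:
dim(ham) #terms x documents kept after removeSparseTerms
str(ham_data) #long format: Terms, Docs, Freq
head(spam_data)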
#create vector that holds all ham documents
ham_docs <- levels(ham_data$Docs)
#shuffle a vector's elements
ham_docs <- sample(ham_docs)
#create 66% training data set for ham
ham_training <- ham_data %>% filter(Docs %in% ham_docs[1:round(0.66*length(ham_docs))])
#create 34% testing data set for ham
ham_testing <- ham_data %>% filter(Docs %in% ham_docs[((round(0.66*length(ham_docs)))+1):length(ham_docs)])
#create vector that holds all spam documents
spam_docs <- levels(spam_data$Docs)
#shuffle a vector's elements
spam_docs <- sample(spam_docs)
#create 66% training data set for spam
spam_training <- spam_data %>% filter(Docs %in% spam_docs[1:round(0.66*length(spam_docs))])
#create 34% testing data set for spam
spam_testing <- spam_data %>% filter(Docs %in% spam_docs[((round(0.66*length(spam_docs)))+1):length(spam_docs)])
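The shuffle-and-slice logic above is duplicated for ham and spam; a small helper function (hypothetical, not part of the original script) could wrap both splits:
split_docs <- function(data, train_frac = 0.66){
  #shuffle the document names, then slice into training and testing sets
  docs <- sample(levels(data$Docs))
  n_train <- round(train_frac * length(docs))
  list(training = data %>% filter(Docs %in% docs[1:n_train]),
       testing = data %>% filter(Docs %in% docs[(n_train + 1):length(docs)]))
}
ham_split <- split_docs(ham_data)
spam_split <- split_docs(spam_data)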
#merge repeating words and add type column for both training data sets
ham_training <- ham_training %>% select(Terms,Freq) %>% group_by(Terms) %>% summarise(Freq=sum(Freq)) %>% mutate(Type="ham")
spam_training <- spam_training %>% select(Terms,Freq) %>% group_by(Terms) %>% summarise(Freq=sum(Freq)) %>% mutate(Type="spam")
#merge ham and spam training data frames
ham_spam_training <- rbind(ham_training,spam_training)
head(ham_spam_training)
## # A tibble: 6 x 3
## Terms Freq Type
## <fctr> <dbl> <chr>
## 1 admin 3578 ham
## 2 all 595 ham
## 3 and 3823 ham
## 4 archive 568 ham
## 5 are 1045 ham
## 6 ascii 535 ham
#convert long format to wide format
ham_spam_training <- spread(ham_spam_training,Type,Freq)
#replace NAs with 0s
ham_spam_training[is.na(ham_spam_training)] <- 0
head(ham_spam_training)
## # A tibble: 6 x 3
## Terms ham spam
## <fctr> <dbl> <dbl>
## 1 admin 3578 0
## 2 all 595 1294
## 3 and 3823 6941
## 4 archive 568 0
## 5 are 1045 1956
## 6 ascii 535 0
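In newer versions of tidyr (1.0 and later), spread() is superseded by pivot_wider(), which can replace the two steps above (the reshape and the NA fill) in one call; an equivalent one-liner, assuming tidyr >= 1.1 for the scalar values_fill, would be:
ham_spam_training <- pivot_wider(ham_spam_training, names_from = Type, values_from = Freq, values_fill = 0)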
#add goodness and badness columns
ham_spam_training <- ham_spam_training %>% mutate(goodness=ham/(ham+spam),badness=spam/(ham+spam))
head(ham_spam_training)
## # A tibble: 6 x 5
## Terms ham spam goodness badness
## <fctr> <dbl> <dbl> <dbl> <dbl>
## 1 admin 3578 0 1.0000000 0.0000000
## 2 all 595 1294 0.3149815 0.6850185
## 3 and 3823 6941 0.3551654 0.6448346
## 4 archive 568 0 1.0000000 0.0000000
## 5 are 1045 1956 0.3482173 0.6517827
## 6 ascii 535 0 1.0000000 0.0000000
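For example, "all" appears 595 times in the ham training set and 1294 times in spam, so:
595 / (595 + 1294) #goodness of "all" = 0.3149815
1294 / (595 + 1294) #badness of "all" = 0.6850185
Terms that never occur in spam during training (admin, archive, ascii above) therefore get a goodness of exactly 1 and a badness of 0.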
#add actual type column to both testing sets
ham_testing <- ham_testing %>% mutate(Actual_Type="ham")
spam_testing <- spam_testing %>% mutate(Actual_Type="spam")
#merge ham and spam testing data frames
ham_spam_testing <- rbind(ham_testing,spam_testing)
#replace NAs with 0s
ham_spam_testing[is.na(ham_spam_testing)] <- 0
head(ham_spam_testing)
## Terms Docs Freq Actual_Type
## 1 admin 00001.1a31cc283af0060967a233d26548a6ce 3 ham
## 2 all 00001.1a31cc283af0060967a233d26548a6ce 6 ham
## 3 and 00001.1a31cc283af0060967a233d26548a6ce 11 ham
## 4 archive 00001.1a31cc283af0060967a233d26548a6ce 1 ham
## 5 are 00001.1a31cc283af0060967a233d26548a6ce 2 ham
## 6 ascii 00001.1a31cc283af0060967a233d26548a6ce 1 ham
#attach each testing term's goodness and badness scores learned from the training set
ham_spam_merged <- merge(x = ham_spam_testing, y = ham_spam_training, by = "Terms", all.x = TRUE)
#order by Docs, drop the raw ham/spam counts, and weight each term's scores by its frequency in the document
ham_spam_merged <- ham_spam_merged %>% arrange(Docs) %>% select(-ham,-spam) %>% mutate(goodness_sum=Freq*goodness,badness_sum=Freq*badness)
head(ham_spam_merged)
## Terms Docs Freq Actual_Type
## 1 about 00001.1a31cc283af0060967a233d26548a6ce 0 ham
## 2 admin 00001.1a31cc283af0060967a233d26548a6ce 3 ham
## 3 agent 00001.1a31cc283af0060967a233d26548a6ce 0 ham
## 4 all 00001.1a31cc283af0060967a233d26548a6ce 6 ham
## 5 and 00001.1a31cc283af0060967a233d26548a6ce 11 ham
## 6 archive 00001.1a31cc283af0060967a233d26548a6ce 1 ham
## goodness badness goodness_sum badness_sum
## 1 1.0000000 0.0000000 0.000000 0.000000
## 2 1.0000000 0.0000000 3.000000 0.000000
## 3 1.0000000 0.0000000 0.000000 0.000000
## 4 0.3149815 0.6850185 1.889889 4.110111
## 5 0.3551654 0.6448346 3.906819 7.093181
## 6 1.0000000 0.0000000 1.000000 0.000000
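For instance, for the term "all" in the first document, goodness_sum = 6 * 0.3149815 = 1.889889 and badness_sum = 6 * 0.6850185 = 4.110111, matching the fourth row above.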
#calculate overall goodness and badness scores for each document
ham_spam_merged <- ham_spam_merged %>% group_by(Docs, Actual_Type) %>% summarise(Sum_Goodness=sum(goodness_sum),Sum_Badness=sum(badness_sum))
head(ham_spam_merged)
## # A tibble: 6 x 4
## # Groups: Docs [6]
## Docs Actual_Type Sum_Goodness
## <fctr> <chr> <dbl>
## 1 00001.1a31cc283af0060967a233d26548a6ce ham 278.3786
## 2 00002.5a587ae61666c5aa097c8e866aedcc59 ham 222.9847
## 3 00003.19be8acd739ad589cd00d8425bac7115 ham 225.7593
## 4 00004.b2ed6c3c62bbdfab7683d60e214d1445 ham 312.9193
## 5 00005.07b9d4aa9e6c596440295a5170111392 ham 182.5798
## 6 00006.654c4ec7c059531accf388a807064363 ham 311.5902
## # ... with 1 more variables: Sum_Badness <dbl>
#classify each document as ham if its goodness score exceeds its badness score, otherwise spam
ham_spam_merged <- ham_spam_merged %>% mutate(Test_Type = ifelse(Sum_Goodness > Sum_Badness, "ham","spam")) %>% select(Docs,Actual_Type,Test_Type)
head(ham_spam_merged)
## # A tibble: 6 x 3
## # Groups: Docs [6]
## Docs Actual_Type Test_Type
## <fctr> <chr> <chr>
## 1 00001.1a31cc283af0060967a233d26548a6ce ham ham
## 2 00002.5a587ae61666c5aa097c8e866aedcc59 ham ham
## 3 00003.19be8acd739ad589cd00d8425bac7115 ham ham
## 4 00004.b2ed6c3c62bbdfab7683d60e214d1445 ham ham
## 5 00005.07b9d4aa9e6c596440295a5170111392 ham ham
## 6 00006.654c4ec7c059531accf388a807064363 ham ham
#find all documents whose test type matches the actual type
test_equals_act <- subset(ham_spam_merged, Actual_Type == Test_Type)
#calculate accuracy
accuracy <- nrow(test_equals_act)/nrow(ham_spam_merged)
accuracy
## [1] 1
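Beyond the single accuracy number, a confusion table shows where the classifier errs; a quick sketch with base R's table():
#cross-tabulate predicted (test) labels against actual labels
table(Predicted = ham_spam_merged$Test_Type, Actual = ham_spam_merged$Actual_Type)
Accuracy can equivalently be computed as mean(ham_spam_merged$Test_Type == ham_spam_merged$Actual_Type).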