To perform the ham/spam classification I implemented the following:
library(tm)
library(plyr)
library(dplyr)
library(tidyr)
Downloaded the ham and spam corpora from http://spamassassin.apache.org/old/publiccorpus/ and saved them on the local machine.
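A minimal sketch of that download step (the archive names below are examples from the corpus listing and the local path is my project folder; adjust both as needed):
base_url <- "http://spamassassin.apache.org/old/publiccorpus"
dest <- "/Users/olga/desktop/project4"
#download one ham and one spam archive (other snapshots are listed at the same URL)
download.file(sprintf("%s/20021010_easy_ham.tar.bz2", base_url), file.path(dest, "easy_ham.tar.bz2"))
download.file(sprintf("%s/20021010_spam.tar.bz2", base_url), file.path(dest, "spam.tar.bz2"))
#untar() handles .tar.bz2 archives; the extracted folders need to end up as ham/ and spam/ under the project path
untar(file.path(dest, "easy_ham.tar.bz2"), exdir = dest)
untar(file.path(dest, "spam.tar.bz2"), exdir = dest)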
Built a function that cleans the corpora.
clean_corpus<-function(corpus){
  #remove punctuation
  corpus_data<-tm_map(corpus,removePunctuation)
  #strip extra white space
  corpus_data<-tm_map(corpus_data,stripWhitespace)
  #convert to lower case
  corpus_data<-tm_map(corpus_data,content_transformer(tolower))
  #remove numbers
  corpus_data<-tm_map(corpus_data,removeNumbers)
  #optionally remove common English stopwords
  #corpus_data<-tm_map(corpus_data, removeWords, stopwords('english'))
  return(corpus_data)
}
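As a quick sanity check, clean_corpus can be applied to a tiny in-memory corpus (the two sample strings below are made up for illustration):
test_corpus <- Corpus(VectorSource(c("Hello, World 123!!", "FREE offer, call 555 now")))
test_clean <- clean_corpus(test_corpus)
#punctuation, digits and upper case are gone after cleaning
lapply(test_clean, as.character)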
build_tmd<-function(type,path){
  dir <- sprintf("%s/%s",path,type)
  cor <- Corpus(DirSource(directory=dir))
  #apply the clean_corpus function
  corp_clean <- clean_corpus(cor)
  tdm <- TermDocumentMatrix(corp_clean)
  #remove sparse terms (keep only terms appearing in at least ~30% of the documents)
  tdm <- removeSparseTerms(tdm,0.7)
  return(tdm)
}
#load the ham corpus and build its term-document matrix
ham <- build_tmd("ham","/Users/olga/desktop/project4/")
#create ham data frame
ham_data <- as.data.frame(as.table(ham))
#load the spam corpus and build its term-document matrix
spam <- build_tmd("spam","/Users/olga/desktop/project4/")
#create spam data frame
spam_data <- as.data.frame(as.table(spam))
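as.data.frame(as.table(...)) flattens each term-document matrix into long format, one row per Terms/Docs pair with its Freq; a quick structural check (output omitted here) could be:
dim(ham) #terms x documents kept after removeSparseTerms
str(ham_data) #long format: Terms, Docs, Freq
head(spam_data)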
#create vector that holds all ham documents
ham_docs <- levels(ham_data$Docs)
#shuffle a vector's elements
ham_docs <- sample(ham_docs)
#create 66% training data set for ham
ham_training <- ham_data %>% filter(Docs %in% ham_docs[1:round(0.66*length(ham_docs))])
#create 34% testing data set for ham
ham_testing <- ham_data %>% filter(Docs %in% ham_docs[((round(0.66*length(ham_docs)))+1):length(ham_docs)])
#create vector that holds all spam documents
spam_docs <- levels(spam_data$Docs)
#shuffle a vector's elements
spam_docs <- sample(spam_docs)
#create 66% training data set for spam
spam_training <- spam_data %>% filter(Docs %in% spam_docs[1:round(0.66*length(spam_docs))])
#create 34% testing data set for spam
spam_testing <- spam_data %>% filter(Docs %in% spam_docs[((round(0.66*length(spam_docs)))+1):length(spam_docs)])
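The shuffle-and-slice logic above is duplicated for ham and spam; a small helper function (hypothetical, not part of the original script) could wrap both splits:
split_docs <- function(data, train_frac = 0.66){
  #shuffle the document names, then slice into training and testing sets
  docs <- sample(levels(data$Docs))
  n_train <- round(train_frac * length(docs))
  list(training = data %>% filter(Docs %in% docs[1:n_train]),
       testing = data %>% filter(Docs %in% docs[(n_train + 1):length(docs)]))
}
ham_split <- split_docs(ham_data)
spam_split <- split_docs(spam_data)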
#merge repeating words and add type column for both training data sets
ham_training <- ham_training %>% select(Terms,Freq) %>% group_by(Terms) %>% summarise(Freq=sum(Freq)) %>% mutate(Type="ham")
spam_training <- spam_training %>% select(Terms,Freq) %>% group_by(Terms) %>% summarise(Freq=sum(Freq)) %>% mutate(Type="spam")
#merge ham and spam training data frames
ham_spam_training <- rbind(ham_training,spam_training)
head(ham_spam_training)
## # A tibble: 6 x 3
## Terms Freq Type
## <fctr> <dbl> <chr>
## 1 admin 3578 ham
## 2 all 595 ham
## 3 and 3823 ham
## 4 archive 568 ham
## 5 are 1045 ham
## 6 ascii 535 ham
#convert long format to wide format
ham_spam_training <- spread(ham_spam_training,Type,Freq)
#replace NAs with 0s
ham_spam_training[is.na(ham_spam_training)] <- 0
head(ham_spam_training)
## # A tibble: 6 x 3
## Terms ham spam
## <fctr> <dbl> <dbl>
## 1 admin 3578 0
## 2 all 595 1294
## 3 and 3823 6941
## 4 archive 568 0
## 5 are 1045 1956
## 6 ascii 535 0
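In newer versions of tidyr (1.0 and later), spread() is superseded by pivot_wider(), which can replace the two steps above (the reshape and the NA fill) in one call; an equivalent one-liner, assuming tidyr >= 1.1 for the scalar values_fill, would be:
ham_spam_training <- pivot_wider(ham_spam_training, names_from = Type, values_from = Freq, values_fill = 0)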
#add goodness and badness columns
ham_spam_training <- ham_spam_training %>% mutate(goodness=ham/(ham+spam),badness=spam/(ham+spam))
head(ham_spam_training)
## # A tibble: 6 x 5
## Terms ham spam goodness badness
## <fctr> <dbl> <dbl> <dbl> <dbl>
## 1 admin 3578 0 1.0000000 0.0000000
## 2 all 595 1294 0.3149815 0.6850185
## 3 and 3823 6941 0.3551654 0.6448346
## 4 archive 568 0 1.0000000 0.0000000
## 5 are 1045 1956 0.3482173 0.6517827
## 6 ascii 535 0 1.0000000 0.0000000
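For example, "all" appears 595 times in the ham training set and 1294 times in spam, so:
595 / (595 + 1294) #goodness of "all" = 0.3149815
1294 / (595 + 1294) #badness of "all" = 0.6850185
Terms that never occur in spam during training (admin, archive, ascii above) therefore get a goodness of exactly 1 and a badness of 0.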
#add actual type column to both testing sets
ham_testing <- ham_testing %>% mutate(Actual_Type="ham")
spam_testing <- spam_testing %>% mutate(Actual_Type="spam")
#merge ham and spam testing data frames
ham_spam_testing <- rbind(ham_testing,spam_testing)
#replace NAs with 0s
ham_spam_testing[is.na(ham_spam_testing)] <- 0
head(ham_spam_testing)
## Terms Docs Freq Actual_Type
## 1 admin 00001.1a31cc283af0060967a233d26548a6ce 3 ham
## 2 all 00001.1a31cc283af0060967a233d26548a6ce 6 ham
## 3 and 00001.1a31cc283af0060967a233d26548a6ce 11 ham
## 4 archive 00001.1a31cc283af0060967a233d26548a6ce 1 ham
## 5 are 00001.1a31cc283af0060967a233d26548a6ce 2 ham
## 6 ascii 00001.1a31cc283af0060967a233d26548a6ce 1 ham
#attach each testing term's goodness and badness scores learned from the training set
ham_spam_merged <- merge(x = ham_spam_testing, y = ham_spam_training, by = "Terms", all.x = TRUE)
#order by Docs, drop the raw ham/spam counts, and weight each term's scores by its frequency in the document
ham_spam_merged <- ham_spam_merged %>% arrange(Docs) %>% select(-ham,-spam) %>% mutate(goodness_sum=Freq*goodness,badness_sum=Freq*badness)
head(ham_spam_merged)
## Terms Docs Freq Actual_Type
## 1 about 00001.1a31cc283af0060967a233d26548a6ce 0 ham
## 2 admin 00001.1a31cc283af0060967a233d26548a6ce 3 ham
## 3 agent 00001.1a31cc283af0060967a233d26548a6ce 0 ham
## 4 all 00001.1a31cc283af0060967a233d26548a6ce 6 ham
## 5 and 00001.1a31cc283af0060967a233d26548a6ce 11 ham
## 6 archive 00001.1a31cc283af0060967a233d26548a6ce 1 ham
## goodness badness goodness_sum badness_sum
## 1 1.0000000 0.0000000 0.000000 0.000000
## 2 1.0000000 0.0000000 3.000000 0.000000
## 3 1.0000000 0.0000000 0.000000 0.000000
## 4 0.3149815 0.6850185 1.889889 4.110111
## 5 0.3551654 0.6448346 3.906819 7.093181
## 6 1.0000000 0.0000000 1.000000 0.000000
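For instance, for the term "all" in the first document, goodness_sum = 6 * 0.3149815 = 1.889889 and badness_sum = 6 * 0.6850185 = 4.110111, matching the fourth row above.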
#calculate overall goodness and badness scores for each document
ham_spam_merged <- ham_spam_merged %>% group_by(Docs, Actual_Type) %>% summarise(Sum_Goodness=sum(goodness_sum),Sum_Badness=sum(badness_sum))
head(ham_spam_merged)
## # A tibble: 6 x 4
## # Groups: Docs [6]
## Docs Actual_Type Sum_Goodness
## <fctr> <chr> <dbl>
## 1 00001.1a31cc283af0060967a233d26548a6ce ham 278.3786
## 2 00002.5a587ae61666c5aa097c8e866aedcc59 ham 222.9847
## 3 00003.19be8acd739ad589cd00d8425bac7115 ham 225.7593
## 4 00004.b2ed6c3c62bbdfab7683d60e214d1445 ham 312.9193
## 5 00005.07b9d4aa9e6c596440295a5170111392 ham 182.5798
## 6 00006.654c4ec7c059531accf388a807064363 ham 311.5902
## # ... with 1 more variables: Sum_Badness <dbl>
#classify each document as ham if its goodness score exceeds its badness score, otherwise spam
ham_spam_merged <- ham_spam_merged %>% mutate(Test_Type = ifelse(Sum_Goodness > Sum_Badness, "ham","spam")) %>% select(Docs,Actual_Type,Test_Type)
head(ham_spam_merged)
## # A tibble: 6 x 3
## # Groups: Docs [6]
## Docs Actual_Type Test_Type
## <fctr> <chr> <chr>
## 1 00001.1a31cc283af0060967a233d26548a6ce ham ham
## 2 00002.5a587ae61666c5aa097c8e866aedcc59 ham ham
## 3 00003.19be8acd739ad589cd00d8425bac7115 ham ham
## 4 00004.b2ed6c3c62bbdfab7683d60e214d1445 ham ham
## 5 00005.07b9d4aa9e6c596440295a5170111392 ham ham
## 6 00006.654c4ec7c059531accf388a807064363 ham ham
#find all documents whose test type matches the actual type
test_equals_act <- subset(ham_spam_merged, Actual_Type == Test_Type)
#calculate accuracy
accuracy <- nrow(test_equals_act)/nrow(ham_spam_merged)
accuracy
## [1] 1
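Beyond the single accuracy number, a confusion table shows where the classifier errs; a quick sketch with base R's table():
#cross-tabulate predicted (test) labels against actual labels
table(Predicted = ham_spam_merged$Test_Type, Actual = ham_spam_merged$Actual_Type)
Accuracy can equivalently be computed as mean(ham_spam_merged$Test_Type == ham_spam_merged$Actual_Type).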