DATA 607 Project 4

Step 1 - Creating a dataframe with spam emails

# Assigning directories
dir <- "/Users/devanshu/Documents/R Datascience/SpamHam/spam"
file <- list.files(dir)

# Read every file in the folder and collapse its lines into one string per
# email. The list starts empty (not seeded with NA), so no spurious NA row ends
# up in the data frame, and an empty folder yields zero emails instead of
# trying to read a file named "NA".
spamlist <- lapply(file, function(name) {
  paste(readLines(file.path(dir, name)), collapse = "\n")
})

# Create the final spam dataframe with a column for the email text and one for
# the classification (1 = spam)
spam <- data.frame(
  message = as.character(unlist(spamlist)),
  classification = rep(1, length(spamlist)),
  stringsAsFactors = FALSE
)

Step 2 - Creating a dataframe with ham emails

# Assigning directories
dir <- "/Users/devanshu/Documents/R Datascience/SpamHam/ham"
file <- list.files(dir)

# Read every file in the folder and collapse its lines into one string per
# email. The list starts empty (not seeded with NA), so no spurious NA row ends
# up in the data frame, and an empty folder yields zero emails instead of
# trying to read a file named "NA".
hamlist <- lapply(file, function(name) {
  paste(readLines(file.path(dir, name)), collapse = "\n")
})

# Create the final ham dataframe with a column for the email text and one for
# the classification (0 = ham)
ham <- data.frame(
  message = as.character(unlist(hamlist)),
  classification = rep(0, length(hamlist)),
  stringsAsFactors = FALSE
)

Step 3 - Combine the ham and spam data frames to create the complete dataframe we will be working with

# Stack the ham rows on top of the spam rows to form the working dataset
complete <- do.call(rbind, list(ham, spam))

Step 4 - Create a corpus

# Build a corpus with one document per email message, then show the metadata
# attached to the first document
completecorpus <- Corpus(VectorSource(complete[["message"]]))
meta(completecorpus[[1L]])

##   author       : character(0)
##   datetimestamp: 2019-11-18 02:17:23
##   description  : character(0)
##   heading      : character(0)
##   id           : 1
##   language     : en
##   origin       : character(0)

Step 5 - Clean by removing numbers, whitespace, and punctuation. Apply a regex to extract only text

# Cleaning pipeline: each step maps a transformation over every document and
# feeds its result to the next step, so the order of the steps matters.
# The `##` lines after each step are the warnings recorded when it was run.

# 1. Apply tm's removeNumbers transformation to the raw corpus
tmp= tm_map(completecorpus, content_transformer(removeNumbers))

## Warning in tm_map.SimpleCorpus(completecorpus,
## content_transformer(removeNumbers)): transformation drops documents

# 2. Replace every match of the pattern with a single space. The pattern has
#    three alternatives: a [[:punct:]] character, a lazily matched `\<.+?\>`
#    span, or a tab.
#    NOTE(review): the middle alternative is presumably meant to strip <...>
#    markup tags; confirm how the regex engine in use interprets `\<` and `\>`.
tmp= tm_map(tmp, content_transformer(function(x) str_replace_all(x,pattern = "[[:punct:]]|\\<.+?\\>|\\t", replacement = " ")))

## Warning in tm_map.SimpleCorpus(tmp, content_transformer(function(x)
## str_replace_all(x, : transformation drops documents

# 3. Lower-case the text
tmp = tm_map(tmp, content_transformer(tolower))

## Warning in tm_map.SimpleCorpus(tmp, content_transformer(tolower)):
## transformation drops documents

# 4. Apply tm's stripWhitespace transformation
tmp= tm_map(tmp, content_transformer(stripWhitespace))

## Warning in tm_map.SimpleCorpus(tmp, content_transformer(stripWhitespace)):
## transformation drops documents

# 5. Apply tm's removePunctuation transformation; the result `clean` is the
#    corpus used to build the document-term matrix
clean= tm_map(tmp, content_transformer(removePunctuation))

## Warning in tm_map.SimpleCorpus(tmp,
## content_transformer(removePunctuation)): transformation drops documents

Step 6 - Create a document matrix

# Build the document-term matrix from the cleaned corpus, convert it to a
# dense data frame (one row per email, one column per term) and attach the
# ham/spam label of each email
dtm <- DocumentTermMatrix(clean)
dtm_df <- as.data.frame(as.matrix(dtm))
dtm_df[["classification"]] <- complete[["classification"]]

Review top 50 terms in the combined dataset

# Top 50 terms in the combined (ham + spam) dataset.
# The `classification` label column is left out so it is not ranked as if it
# were a term, and head() is used so that fewer than 50 terms does not pad the
# result with NA entries.
head(
  sort(
    colSums(dtm_df[, names(dtm_df) != "classification", drop = FALSE]),
    decreasing = TRUE
  ),
  50
)

##         the        from         com         for        with    received 
##       30102       27536       26650       20095       19967       16834 
##        list   localhost         and         net         sep       esmtp 
##       16068       15464       15155       12779       12170        9825 
##         org     example        that         you     version         aug 
##        9270        9214        7808        7573        6399        6385 
##        http        spam        this         oct     postfix        xent 
##        5968        5906        5876        5530        5473        5287 
##         ist     content       admin        date         thu     message 
##        5136        5086        4975        4904        4720        4595 
##        fork         mon         wed    jalapeno sourceforge        text 
##        4579        4559        4225        4157        4076        4041 
##   delivered         not         tue       dogma   slashnull        have 
##        4022        3701        3693        3662        3662        3580 
##         are     subject         www      return        type        your 
##        3573        3535        3370        3364        3341        3258 
##        path        exmh 
##        3199        3130

Based on the distribution above, remove words that should not impact classification

# Drop frequent terms that were judged not to help tell ham from spam
clean1 <- tm_map(
  clean,
  removeWords,
  c(
    "received", "the", "from", "com", "for", "with",
    "localhost", "and", "esmtp", "that", "http"
  )
)

## Warning in tm_map.SimpleCorpus(clean, removeWords, c("received", "the", :
## transformation drops documents

Review the newly cleaned dataset

# Rebuild the document-term matrix from the corpus with the extra words removed
dtm <- DocumentTermMatrix(clean1)
# Drop terms whose sparsity exceeds 1 - 10 / (number of documents), i.e.
# roughly the terms that occur in fewer than 10 emails
dtm <- removeSparseTerms(dtm, 1 - (10 / length(clean1)))
dtm_df <- as.data.frame(as.matrix(dtm))
dtm_df$classification <- complete$classification

# Top 50 terms in all emails.
# The `classification` label column is left out so it is not ranked as if it
# were a term, and head() is used so that fewer than 50 terms does not pad the
# result with NA entries.
head(
  sort(
    colSums(dtm_df[, names(dtm_df) != "classification", drop = FALSE]),
    decreasing = TRUE
  ),
  50
)

##        list         net         sep         org     example         you 
##       16068       12779       12170        9270        9214        7573 
##     version         aug        spam        this         oct     postfix 
##        6399        6385        5906        5876        5530        5473 
##        xent         ist     content       admin        date         thu 
##        5287        5136        5086        4975        4904        4720 
##     message        fork         mon         wed    jalapeno sourceforge 
##        4595        4579        4559        4225        4157        4076 
##        text   delivered         not         tue       dogma   slashnull 
##        4041        4022        3701        3693        3662        3662 
##        have         are     subject         www      return        type 
##        3580        3573        3535        3370        3364        3341 
##        your        path        exmh         fri      single        mail 
##        3258        3199        3130        3035        3019        3007 
##        drop         rpm   fetchmail        yyyy        imap     mailing 
##        2922        2914        2886        2827        2783        2692 
##         but       users 
##        2666        2656

# Top 50 terms in ham emails (classification == 0).
# The `classification` label column is left out so it is not ranked as if it
# were a term, and head() is used so that fewer than 50 terms does not pad the
# result with NA entries.
head(
  sort(
    colSums(
      dtm_df[
        dtm_df$classification == 0,
        names(dtm_df) != "classification",
        drop = FALSE
      ]
    ),
    decreasing = TRUE
  ),
  50
)

##        list         net         sep     example         org        spam 
##       15287       11296       10026        8670        7601        5856 
##     version         oct        xent         aug     postfix       admin 
##        5819        5461        5287        5182        4867        4800 
##        fork         ist        date         thu sourceforge        this 
##        4579        4389        4314        4224        3967        3939 
##         you         mon     content     message         wed    jalapeno 
##        3938        3870        3833        3793        3774        3652 
##   delivered         tue        text        exmh       dogma   slashnull 
##        3527        3223        3149        3130        3118        3118 
##     subject         rpm        yyyy         not      return        have 
##        2938        2914        2827        2797        2784        2731 
##        path         are        type      single     mailing       users 
##        2673        2613        2573        2549        2540        2532 
##         fri   freshrpms        drop         www        imap         but 
##        2515        2464        2454        2446        2440        2434 
##   fetchmail         cvs 
##        2426        2370

# Top 50 terms in spam emails (classification == 1).
# The `classification` label column is left out: summed over spam rows it
# equals the number of spam emails and could otherwise be ranked as a term.
# head() is used so that fewer than 50 terms does not pad the result with NA.
head(
  sort(
    colSums(
      dtm_df[
        dtm_df$classification == 1,
        names(dtm_df) != "classification",
        drop = FALSE
      ]
    ),
    decreasing = TRUE
  ),
  50
)

##       you       sep      your      this       org      font      nbsp 
##      3635      2144      2010      1937      1669      1585      1531 
##       net   content       aug      mail      zzzz       are       www 
##      1483      1253      1203      1140      1118       960       924 
##       not      text      will      have     sized   message    widthd 
##       904       892       863       849       805       802       798 
##      list      type       our      free       ist       mon      smtp 
##       781       768       768       748       747       689       659 
##       all     email   postfix       img   webnote   subject      date 
##       634       621       606       606       603       597       590 
##    return   version       can     money   heightd     dogma   example 
##       580       580       553       548       546       544       544 
## slashnull      path      mime       fri    colord      here       out 
##       544       526       523       520       518       512       509 
##      more 
##       507

Step 7 - Shuffle the dataset and conduct a train/test split

# Fix the random seed so the split, the fitted tree and the reported metrics
# can be reproduced from run to run
set.seed(607)

# Training sample with 2400 observations; every remaining row goes to the
# test set. Fail early with a clear message if there are not enough rows to
# leave a non-empty test set.
stopifnot(nrow(dtm_df) > 2400)
train <- sample(seq_len(nrow(dtm_df)), 2400)
train_set <- dtm_df[train, ]
test_set <- dtm_df[-train, ]

Step 8 - Building a Decision Tree Classifier

library(rpart)
library(rpart.plot)

# Grow a classification tree on the training rows, using every term column as
# a predictor of the ham/spam label, then predict a class for each test row
dt_fit <- rpart(
  formula = classification ~ .,
  data = train_set,
  method = "class"
)
ypreds <- predict(dt_fit, newdata = test_set, type = "class")

# Draw the fitted tree
rpart.plot(dt_fit, extra = 106)

Step 9 - Evaluate the model

# Confusion matrix: rows are the true labels, columns are the predicted labels
table_mat <- table(test_set$classification, ypreds)
table_mat

##    ypreds
##       0   1
##   0 545   3
##   1   6 100

# Accuracy score: correctly classified emails (the diagonal of the confusion
# matrix) divided by all test emails. Linear indexing of a table runs down the
# columns, so table_mat[1] + table_mat[2] would add the true negatives to the
# false negatives rather than to the true positives.
Accuracy <- sum(diag(table_mat)) / sum(table_mat)
Accuracy * 100

## [1] 98.62385

DATA 607 Project 4

Devanshu Mehrotra

11/17/2019

Step 1 - Creating a dataframe with spam emails

Step 2 - Creating a dataframe with ham emails

Step 3 - Combine the ham and spam data frames to create the complete dataframe we will be working with

Step 4 - Create a corpus

Step 5 - Clean by removing numbers, whitespace, and punctuation. Apply a regex to extract only text

Step 6 - Create a document matrix

Review top 50 terms in the combined dataset

Based on the distribution above, remove words that should not impact classification

Review the newly cleaned dataset

Step 7 - Shuffle the dataset and conduct a train/test split

Step 8 - Building a Decision Tree Classifier

Step 9 - Evaluate the model