Assignment


Load the required libraries

library(tidyverse)     # data manipulation and plotting
library(stringr)       # string handling
library(knitr)         # report rendering
library(R.utils)       # bunzip2() for decompressing the archives
library(tm)            # text mining: corpora and document-term matrices
library(wordcloud)     # word clouds
library(topicmodels)   # topic modelling (loaded but not used below)
library(SnowballC)     # stemming
library(e1071)         # naiveBayes() classifier
library(stats)         # base statistics

Sources for ham and spam

url.spam <- "http://spamassassin.apache.org/old/publiccorpus/"
file.spam <- "20050311_spam_2.tar.bz2"

url.ham <- "http://spamassassin.apache.org/old/publiccorpus/"
file.ham <- "20030228_easy_ham.tar.bz2"

Function to download and extract the archives

downloadTAR <- function(filetype = NULL, myurl = NULL, myrootfile = NULL){

  tarfile <- paste(filetype, ".tar", sep = "")

  if(!file.exists(tarfile)){
      myfile <- paste(myurl, myrootfile, sep = "")
      destfile <- paste(filetype, ".tar.bz2", sep = "")

      # mode = "wb" so the binary archive is not corrupted on Windows
      download.file(myfile, destfile = destfile, mode = "wb")

      # bunzip2() writes <filetype>.tar and removes the .bz2 by default
      bunzip2(destfile)
  }

  # Extract the archive into the working directory and return its file names
  untar(tarfile)
  mycompressedfilenames <- untar(tarfile, list = TRUE)
  return(mycompressedfilenames)
}

spamFileNames <- downloadTAR("Spam", url.spam, file.spam)
hamFileNames <- downloadTAR("Ham", url.ham, file.ham)

Clean up the file names and keep only those that are exactly 38 characters long

spamfiles <- str_trim(str_replace_all(spamFileNames, "spam_2/", ""))
hamfiles <- str_trim(str_replace_all(hamFileNames, "easy_ham/", ""))

spamfiles <- subset(spamfiles, nchar(spamfiles) == 38)
hamfiles <- subset(hamfiles, nchar(hamfiles) == 38)
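
The 38-character filter keeps the regular corpus entries, whose names combine a five-digit index, a dot, and a 32-character checksum (the name below is a made-up example of the pattern):

# five-digit index + "." + 32-character checksum = 38 characters
nchar("00001.7848dde101aa985090474a91ec93fcf0")
## [1] 38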

summary(spamfiles); summary(hamfiles) 
##    Length     Class      Mode 
##      1396 character character
##    Length     Class      Mode 
##      2500 character character

Read the 2500 ham files and 1396 spam files

readFileContents <- function(importtype=NULL, filenames=NULL){

  # downloadTAR() extracted the archives into the working directory,
  # so relative paths are sufficient here
  if (importtype == "Spam") {
    filecon <- paste("spam_2/", filenames, sep = "")
  }
  if (importtype == "Ham") {
    filecon <- paste("easy_ham/", filenames, sep = "")
  }

  mydata <- list()

  for(i in 1:length(filenames)){
    conn <- file(filecon[i], "r", blocking = FALSE)
    temp <- readLines(conn)
    close(conn)
    # Collapse each email's lines into a single string
    temp <- str_c(temp, collapse = "")
    temp <- as.data.frame(temp, stringsAsFactors = FALSE)
    names(temp) <- "Content"
    mydata[[i]] <- temp
  }

  return(mydata)
}

spams <- readFileContents("Spam", spamfiles)
hams <- readFileContents("Ham", hamfiles)

Create two data frames: spams_df and hams_df

# Create a character vector from each list of one-column data frames
temp <- as.character()
for (i in 1:length(spams)){
    temp[i] <- as.character(spams[[i]])
}
finalspams <- temp
rm(temp)

temp1 <- as.character()
for (i in 1:length(hams)){
    temp1[i] <- as.character(hams[[i]])
}
finalhams <- temp1
rm(temp1)


spams_df <- data.frame(finalspams, stringsAsFactors = FALSE)
hams_df <- data.frame(finalhams, stringsAsFactors = FALSE)

spams_df$type <- "Spams"
hams_df$type <- "Hams"

spams_df$file <- spamfiles
hams_df$file <- hamfiles

#reorder by column index
spams_df <- spams_df[c(2,3,1)]
hams_df <- hams_df[c(2,3,1)]

names(spams_df) <- c("type","file","Content")
names(hams_df) <- c("type","file","Content")

Combine the two data frames into one

emails_df <- bind_rows(spams_df, hams_df)
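
A quick structural check on the combined data frame: 1396 spams plus 2500 hams gives 3896 rows, with the three columns set above.

dim(emails_df)
## [1] 3896    3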

# Count the emails of each type

finalspamsTotalEmails <- dim(spams_df)[1]

finalhamsTotalEmails <- dim(hams_df)[1]

Some results

The total number of known spams is 1396.

The total number of known hams is 2500.

Grand total of emails: 3896.

Analysis

Length of Emails

spamsLength <- nchar(spams_df$Content)
hamsLength <- nchar(hams_df$Content)

Spams Statistics

summary(spamsLength)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     725    2458    4004    6183    7020   89213

Distribution

hist(spamsLength, main="Spams Length Frequency", xlab="Length of Emails", breaks = 100)

Hams Statistics

summary(hamsLength)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     355    1644    3081    3364    4039   88593

Distribution

hist(hamsLength, main="Hams Length Frequency", xlab="Length of Emails", breaks = 100)

Median Length

spamsMedian <- median(spamsLength)
hamsMedian <- median(hamsLength)

medianDiff <- spamsMedian - hamsMedian
medianPercentile <- round(((spamsMedian / hamsMedian) - 1) * 100,2)

This analysis shows that, in our pool of known emails, spam messages tend to have a longer median length than ham messages:

Median length of spams: 4004.

Median length of hams: 3081.

Difference of medians: 923.

Percentage difference: 29.96%.
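
As an arithmetic check, the 29.96% figure is just the median gap relative to the ham median:

# (4004 / 3081 - 1) * 100 = 923 / 3081 * 100, rounded to two decimals
round((4004 / 3081 - 1) * 100, 2)
## [1] 29.96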

Using the tm package: remove numbers, punctuation, stop words, extra white space, etc.

emails_corpus <- Corpus(VectorSource(emails_df$Content))

# translate all letters to lower case
clean_corpus <- tm_map(emails_corpus, content_transformer(tolower))

# remove numbers
clean_corpus <- tm_map(clean_corpus, removeNumbers)

#inspect(clean_corpus[1:3])

# remove punctuation
clean_corpus <- tm_map(clean_corpus, removePunctuation)

# remove stop words
clean_corpus <- tm_map(clean_corpus, removeWords, stopwords())

# remove extra white spaces
clean_corpus <- tm_map(clean_corpus, stripWhitespace)


# Stem
release_corpus <- tm_map(clean_corpus, content_transformer(stemDocument))
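
As a small illustration of what the stemmer does, stemDocument() also accepts a plain character vector:

# Porter stemming collapses inflected forms to a common stem
stemDocument(c("running", "emails", "offers"))
## [1] "run"   "email" "offer"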

# Indices
spam_indices <- which(emails_df$type == "Spams")
ham_indices <- which(emails_df$type == "Hams")

Wordclouds

Spam

# Word Cloud
suppressMessages(suppressWarnings(wordcloud(clean_corpus[spam_indices], min.freq=250)))

Ham

# Word Cloud
suppressMessages(suppressWarnings(wordcloud(clean_corpus[ham_indices], min.freq=250)))

Training data

Divide the corpus into training and test data

Use 75% for training and 25% for testing.
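
With 3896 messages, the split works out to:

# 75% of 3896 = 2922 training messages; the remaining 974 are held out for testing
c(train = 3896 * 0.75, test = 3896 * 0.25)
## train  test 
##  2922   974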

# Randomize the email order; keep the permutation so the data frame
# and the corpus stay aligned (and set a seed for reproducibility)
set.seed(123)
perm <- sample(nrow(emails_df))
random_emails <- emails_df[perm, ]

NEmails <- dim(random_emails)[1]
NEmailsQ <- round(NEmails / 4 * 3)

random_emails_train <- random_emails[1:NEmailsQ, ]
random_emails_test <- random_emails[(NEmailsQ + 1):NEmails, ]

# Subset the cleaned corpus with the same permutation and split points
random_corpus <- clean_corpus[perm]
emails_corpus_train <- random_corpus[1:NEmailsQ]
emails_corpus_test <- random_corpus[(NEmailsQ + 1):NEmails]


# Text to Matrix in order to Tokenize the corpus
emails_dtm_train <- DocumentTermMatrix(emails_corpus_train)
emails_dtm_train <- removeSparseTerms(emails_dtm_train, 1-(10/length(release_corpus)))

emails_dtm_test <- DocumentTermMatrix(emails_corpus_test)
emails_dtm_test <- removeSparseTerms(emails_dtm_test, 1-(10/length(release_corpus)))


emails_tdm_train <- TermDocumentMatrix(emails_corpus_train)
emails_tdm_train <- removeSparseTerms(emails_tdm_train, 1-(10/length(release_corpus)))

emails_tdm_test <- TermDocumentMatrix(emails_corpus_test)
emails_tdm_test <- removeSparseTerms(emails_tdm_test, 1-(10/length(release_corpus)))
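
The sparsity argument 1 - (10/length(release_corpus)) is computed from the full corpus size (3896 documents), so removeSparseTerms() drops any term that is absent from more than about 99.74% of the documents in a matrix:

# A term must appear in at least ~0.26% of the documents to survive
1 - (10 / 3896)
## [1] 0.9974333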



five_times_words <- findFreqTerms(emails_dtm_train, 5)
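
findFreqTerms() keeps the terms appearing at least five times in the training matrix; the exact terms depend on the random split, but they can be inspected directly:

length(five_times_words)   # how many terms passed the cutoff
head(five_times_words)     # a few of the retained terms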

Create document-term matrices using frequent words

emails_train <- DocumentTermMatrix(emails_corpus_train, control=list(dictionary = five_times_words))
emails_test <- DocumentTermMatrix(emails_corpus_test, control=list(dictionary = five_times_words))

Convert count information to “Yes”, “No”

Naive Bayes classification works with the presence or absence of each word in a message, but the document-term matrices store occurrence counts. Convert the counts to "Yes"/"No" factors.

convert_count <- function(x) {
  # 0 stays "No"; any positive count becomes "Yes"
  y <- ifelse(x > 0, 1, 0)
  y <- factor(y, levels = c(0, 1), labels = c("No", "Yes"))
  y
}
emails_train <- apply(emails_train, 2, convert_count)
emails_test <- apply(emails_test, 2, convert_count)
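
For example, applied to a small hypothetical count vector:

convert_count(c(0, 1, 3))
## [1] No  Yes Yes
## Levels: No Yes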

The Naive Bayes function

We’ll use a Naive Bayes classifier provided in the package e1071.

emails_classifier <- naiveBayes(emails_train, factor(random_emails_train$type))
emails_test_pred <- predict(emails_classifier, newdata=emails_test)
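
Beyond the raw prediction counts below, crossing the predictions with the true labels of the test set gives a confusion matrix; a minimal sketch with base table():

# Rows: predicted class; columns: actual class
table(Predicted = emails_test_pred, Actual = random_emails_test$type)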

summary(emails_test_pred)
##  Hams Spams 
##   298  3598