Document Classification

Loading Packages

library(stringr)
library(tm)
## Loading required package: NLP
library(SnowballC)
library(stopwords)
## 
## Attaching package: 'stopwords'
## The following object is masked from 'package:tm':
## 
##     stopwords
library(corpus)
library(plyr)

Access the Data (Ignoring the cmds files in each folder)

ham_folder <- "easy_ham\\"
spam_folder <- "spam_2\\"
Spam_Files <- list.files(path = spam_folder, full.names = TRUE)
Ham_Files <- list.files(path = ham_folder, full.names = TRUE)

# list.files() returns full paths, so filter the cmds file on its base name
Spam_Files <- Spam_Files[basename(Spam_Files) != "cmds"]
Ham_Files <- Ham_Files[basename(Ham_Files) != "cmds"]

n_spam <- length(Spam_Files)
n_ham <- length(Ham_Files)
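A quick sanity check (a sketch, not part of the original run) confirms the cmds files are gone and shows how many messages remain in each class:

any(basename(Spam_Files) == "cmds")  # expect FALSE
any(basename(Ham_Files) == "cmds")   # expect FALSE
c(spam = n_spam, ham = n_ham)        # message counts per class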

Fill a corpus with the data

Spam_Dir <- DirSource("spam_2")
Spam_c <- VCorpus(Spam_Dir)       # one plain-text document per file
meta(Spam_c, "type") <- "Spam"    # tag every document's class in the metadata

# Note: DirSource() still reads the cmds file here, but terms unique to it
# are dropped by removeSparseTerms() below
Ham_Dir <- DirSource("easy_ham")
Ham_c <- VCorpus(Ham_Dir)
meta(Ham_c, "type") <- "Ham"
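A brief look at what was loaded (again a sketch, not part of the original run):

length(Spam_c)      # number of documents read from spam_2
length(Ham_c)       # number of documents read from easy_ham
head(meta(Spam_c))  # indexed metadata; the type column should read "Spam"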

Create Term Document Matrices

# Control options for both matrices. Note that tm expects the word-length
# bounds as wordLengths = c(min, max) rather than separate min/max entries.
tdm_control <- list(removeNumbers = TRUE, removePunctuation = TRUE,
                    tolower = TRUE, stopwords = TRUE, stripWhitespace = TRUE,
                    wordLengths = c(2, 25))

# Build a term-document matrix per class, then drop terms appearing in
# fewer than roughly 10 documents of that class
TDM_spam <- TermDocumentMatrix(Spam_c, control = tdm_control)
TDM_spam <- removeSparseTerms(TDM_spam, 1 - (10 / length(Spam_c)))
TDM_Ham <- TermDocumentMatrix(Ham_c, control = tdm_control)
TDM_Ham <- removeSparseTerms(TDM_Ham, 1 - (10 / length(Ham_c)))
# Flatten each matrix to (term, document, frequency) rows, then collapse to
# one total frequency per term and class
term_Spam_df <- as.data.frame(as.table(TDM_spam))
names(term_Spam_df) <- c("Term", "Spam_Doc", "Spam_Freq")
term_Spam_df$Spam_Doc <- "Spam"
term_Spam_df$Spam_Freq[is.na(term_Spam_df$Spam_Freq)] <- 0
term_Spam_df <- ddply(term_Spam_df, .(Term, Spam_Doc), summarize,
                      Spam_Freq = sum(as.numeric(Spam_Freq)))

term_Ham_df <- as.data.frame(as.table(TDM_Ham))
names(term_Ham_df) <- c("Term", "Ham_Doc", "Ham_Freq")
term_Ham_df$Ham_Doc <- "Ham"
term_Ham_df$Ham_Freq[is.na(term_Ham_df$Ham_Freq)] <- 0
term_Ham_df <- ddply(term_Ham_df, .(Term, Ham_Doc), summarize,
                     Ham_Freq = sum(as.numeric(Ham_Freq)))
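Printing a trimmed matrix reports its dimensions and sparsity, and findFreqTerms() lists high-frequency terms, both useful checks (a sketch) that the sparse-term trimming behaved as intended:

TDM_spam                                      # dimensions and sparsity after trimming
head(findFreqTerms(TDM_spam, lowfreq = 500))  # very common spam-side terms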

Combine the two data frames and calculate, for each term, the difference between its frequency in the ham files and its frequency in the spam files. Limit the final scoring data frame to the terms found substantially more often in spam documents (specifically, those with more than 100 more occurrences in spam than in ham).

Combo_df <- merge(term_Ham_df, term_Spam_df, by = "Term")
# Negative Delta = the term shows up more often in spam than in ham
Combo_df$Delta <- Combo_df$Ham_Freq - Combo_df$Spam_Freq
Scoring_df <- Combo_df[order(Combo_df$Delta), ]
Scoring_df <- Scoring_df[as.numeric(Scoring_df$Delta) < -100, ]
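Since Scoring_df is sorted by Delta in ascending order, its head shows the most spam-indicative terms (a quick sketch for inspection):

head(Scoring_df, 10)  # largest spam-vs-ham frequency gaps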

Develop a scoring scheme to determine the likelihood of a document being spam.

Is_It_Spam <- function(filename){
  # Read the message as one string and split it into lowercase words
  text <- paste(readLines(filename), collapse = ' ')
  word_df <- data.frame(Text = tolower(unlist(strsplit(text, "\\W+"))),
                        stringsAsFactors = FALSE)
  word_df$Delta <- 0

  # Give each word the Delta of its matching term in the scoring table;
  # words that are not in the table keep a Delta of 0
  for (i in seq_len(nrow(word_df))){
    wordRow <- which(Scoring_df$Term == word_df[i, "Text"])
    if (length(wordRow) > 0){
      word_df[i, "Delta"] <- Scoring_df[wordRow, "Delta"]
    }
  }

  # The document's score is the mean Delta over all of its words
  print(mean(word_df$Delta))
}

Run several tests to determine a threshold between spam and ham for this score.

Is_It_Spam("spam_2/00005.ed0aba4d386c5e62bc737cf3f0ed9589")
## [1] -99.78102
Is_It_Spam("spam_2/00011.bd8c904d9f7b161a813d222230214d50")
## [1] -108.2053
Is_It_Spam("spam_2/01237.245ac1766016b756a3ddb3b463cc9645")
## [1] -41.63551
Is_It_Spam("spam_2/00014.13574737e55e51fe6737a475b88b5052")
## [1] -24.72703
Is_It_Spam("spam_2/00663.4baa9521293a04306b038be1f65d4471")
## [1] -5.683544
Is_It_Spam("spam_2/01040.24856bbcaedd4d7b28eae47d8f89a62f")
## [1] -61.06009
Is_It_Spam("easy_ham/00010.145d22c053c1a0c410242e46c01635b3")
## [1] -12.45862
Is_It_Spam("easy_ham/00916.1ea7a40e892220d43795fee49ab4849e")
## [1] -7.327492
Is_It_Spam("easy_ham/02462.4f93bc374730a117b103d5c3a2d699f6")
## [1] -7.111332
Is_It_Spam("easy_ham/02467.b05b3925df99da2a9426f07833ce5a9d")
## [1] -7.660173
Is_It_Spam("easy_ham/02475.9277ee243e3f51fa53ed6be55798d360")
## [1] -3.173554
Is_It_Spam("easy_ham/01183.36c29b5d6d19a328c9928a157019a49c")
## [1] -14.01753
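The spot checks above suggest a cutoff around -25, although two of the spam samples score above it. A less anecdotal way to choose the cutoff (a sketch; Spam_Score is a hypothetical variant of Is_It_Spam that returns the mean score instead of printing it) is to compare the score distributions over a sample of each class:

Spam_Score <- function(filename){
  text <- paste(readLines(filename), collapse = ' ')
  words <- tolower(unlist(strsplit(text, "\\W+")))
  # Look up each word's Delta; words missing from the scoring table count as 0
  deltas <- Scoring_df$Delta[match(words, Scoring_df$Term)]
  deltas[is.na(deltas)] <- 0
  mean(deltas)
}
summary(sapply(head(list.files("spam_2", full.names = TRUE), 50), Spam_Score))
summary(sapply(head(list.files("easy_ham", full.names = TRUE), 50), Spam_Score))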

Using the threshold settled on above, adjust the scoring function to state whether a document is determined to be spam.

Is_It_Spam <- function(filename){
  # sapply() may hand in a factor, so coerce to character before reading
  if (is.factor(filename)){
    filename <- as.character(filename)
  }
  # Read the message as one string and split it into lowercase words
  text <- paste(readLines(filename), collapse = ' ')
  word_df <- data.frame(Text = tolower(unlist(strsplit(text, "\\W+"))),
                        stringsAsFactors = FALSE)
  word_df$Delta <- 0

  # Give each word the Delta of its matching term in the scoring table
  for (i in seq_len(nrow(word_df))){
    wordRow <- which(Scoring_df$Term == word_df[i, "Text"])
    if (length(wordRow) > 0){
      word_df[i, "Delta"] <- Scoring_df[wordRow, "Delta"]
    }
  }

  # Classify the document against the threshold chosen above
  if (mean(word_df$Delta) < -25) {
    return("SPAM")
  } else {
    return("HAM")
  }
}
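As a quick check, the first spam message tested above scored roughly -99.8, well below the -25 cutoff, so the revised function should label it as spam:

Is_It_Spam("spam_2/00005.ed0aba4d386c5e62bc737cf3f0ed9589")
## [1] "SPAM"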

Test on all of the Spam and Ham documents.

testing_Spam <- data.frame(Spam_Files, stringsAsFactors = FALSE)
# Replace the Windows-style backslash in the stored paths with a forward
# slash so readLines() can open them cleanly
testing_Spam$Spam_Files <- str_replace(testing_Spam$Spam_Files, "\\\\", "/")
testing_Ham <- data.frame(Ham_Files, stringsAsFactors = FALSE)
testing_Ham$Ham_Files <- str_replace(testing_Ham$Ham_Files, "\\\\", "/")
testing_Spam$Test <- sapply(testing_Spam$Spam_Files, Is_It_Spam)
testing_Ham$Test <- sapply(testing_Ham$Ham_Files, Is_It_Spam)

Summarize the results for the spam and ham files.

print("Marking of the Spam files")
## [1] "Marking of the Spam files"
count(testing_Spam$Test)
##      x freq
## 1  HAM   80
## 2 SPAM 1317
print("Marking of the Ham Files")
## [1] "Marking of the Ham Files"
count(testing_Ham$Test)
##      x freq
## 1  HAM 2434
## 2 SPAM   67
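The same results can also be viewed as one confusion matrix (a sketch; rows are the true class, columns the predicted label):

table(True = c(rep("Spam", nrow(testing_Spam)), rep("Ham", nrow(testing_Ham))),
      Predicted = c(testing_Spam$Test, testing_Ham$Test))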

Based on these counts, it is evident that the scoring scheme still needs adjustment. While missing 80 out of 1397 spam emails (5.7%) and letting them through to a customer's inbox may be acceptable (though this is still fairly high), wrongly labeling 67 out of 2501 good emails (2.7%) as spam is not. To build a more reliable scheme, I would suggest looking at pairings of words in addition to individual word frequencies. The scoring should also be weighted in favor of clean email in order to avoid mistakenly labeling a legitimate message as spam. Finally, adding a function that examines the sender would help greatly with the filtering of spam and ham.
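As an illustration of the word-pairings idea (a sketch only, not something run in the analysis above), the term-document matrices could be rebuilt over bigrams with a custom tokenizer built on the NLP package, which tm already loads:

# Hypothetical follow-up: score two-word phrases instead of single terms
BigramTokenizer <- function(x) {
  unlist(lapply(ngrams(words(x), 2), paste, collapse = " "), use.names = FALSE)
}
TDM_spam_2g <- TermDocumentMatrix(Spam_c, control = list(tokenize = BigramTokenizer))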