Load libraries

## Warning: package 'tm' was built under R version 3.4.4
## Loading required package: NLP
## Warning: package 'wordcloud' was built under R version 3.4.4
## Loading required package: RColorBrewer
## Warning: package 'tidytext' was built under R version 3.4.4
## Warning: package 'caret' was built under R version 3.4.4
## Loading required package: lattice
## Loading required package: ggplot2
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
## 
##     annotate

Create corpus and clean corpus for wordclouds

##
folders <- c("easy_ham","spam")
pathname <- "C:/Users/justin/Documents/GitHub/Justin-Data-607/week11/"

##Function for cleaning
cleancorpus <- function(Corpus){
    corpus.tmp <- tm_map(Corpus,removePunctuation)
    corpus.tmp <- tm_map(corpus.tmp, removeNumbers)
    corpus.tmp <-tm_map(corpus.tmp,tolower)
    corpus.tmp <- tm_map(corpus.tmp, removeWords,c(stopwords("english")))
    corpus.tmp <- tm_map(corpus.tmp,stripWhitespace)
    return(corpus.tmp)
}
## Function to build Corpus for eventual word clouds
buildcorp <- function(category,path){
    s.dir <- sprintf("%s%s",path,category)
    s.cor <- Corpus(DirSource(directory=s.dir))
    clean_corpus<- cleancorpus(s.cor)
    s.tdm <- TermDocumentMatrix(clean_corpus)
    s.tdm <- removeSparseTerms(s.tdm,.7)
    
}

## If loading from github change pathname to local directory where files are extracted
my_corpus <- lapply(folders,buildcorp, path=pathname)

Display wordcloud for spam and easy_ham

##Spam
m = as.matrix(my_corpus[[1]]) 
word_freqs = sort(rowSums(m), decreasing=TRUE) 
head(word_freqs,10)
##    received         sep       esmtp   localhost     postfix         ist 
##       14315       10008        8709        7602        4825        4389 
##         thu jmlocalhost    jalapeno deliveredto 
##        4224        4136        3652        3497
dm = data.frame(word=names(word_freqs), freq=word_freqs) 
layout(matrix(c(1, 2), nrow=2), heights=c(1, 4))
par(mar=rep(0, 4))
plot.new()
text(x=0.5, y=0.5, "Spam")
wordcloud(dm$word, dm$freq,max.words = 20, random.order=FALSE, colors=brewer.pal(6, "Dark2"),main="Title") 

##Easy_ham
m = as.matrix(my_corpus[[2]]) 
word_freqs = sort(rowSums(m), decreasing=TRUE) 
head(word_freqs,10)
##      received           sep           aug     localhost         table 
##          2495          2144          1200          1175          1160 
##         esmtp zzzzlocalhost          font          will         email 
##          1114           920           860           843           834
dm = data.frame(word=names(word_freqs), freq=word_freqs) 
layout(matrix(c(1, 2), nrow=2), heights=c(1, 4))
par(mar=rep(0, 4))
plot.new()
text(x=0.5, y=0.5, "easy_ham")
wordcloud(dm$word, dm$freq,max.words = 20, random.order=FALSE, colors=brewer.pal(5, "Dark2"),main="Title") 

Build a Term Document Matrix for analysis

## Function to build TDM
buildTDM <- function(category,path){
    s.dir <- sprintf("%s%s",path,category)
    s.cor <- Corpus(DirSource(directory=s.dir))
    s.cor.cl <- cleancorpus(s.cor)
    s.tdm <- DocumentTermMatrix(s.cor.cl)
    s.tdm <- removeSparseTerms(s.tdm,.7)
    result <- list(name=category, tdm=s.tdm)
}
##Function to bind two list elements of TDM together mapped with (spam/easyham)
bindcats<- function(tdm){
    s.mat <- data.matrix(tdm[["tdm"]])
    s.df <- as.data.frame(s.mat)
    s.df <- cbind(s.df, rep(tdm[["name"]],nrow(s.df)))
    colnames(s.df)[ncol(s.df)] <- "target_category"
    return(s.df)
}

##Build TDM
tdm <- lapply(folders,buildTDM, path=pathname)
my_dtm <- lapply(tdm,bindcats)


##Join list and fill na with 0
tdm_joined <- do.call(rbind.fill,my_dtm)
tdm_joined[is.na(tdm_joined)] <- 0
## Check row count should equal 2551(easy_ham)+501(spam)
print (dim(tdm_joined))
## [1] 3052   91

Create test/train split and prep for KNN

##Create test/train

train.idx <- sample(nrow(tdm_joined),.7*nrow(tdm_joined),replace = FALSE)
test.idx <- (1:nrow(tdm_joined))[-train.idx]

## Test my overall sample and training sample to see spam/notspam ratios in each
round(prop.table(table(tdm_joined$target_category))*100)
## 
## easy_ham     spam 
##       84       16
round(prop.table(table(tdm_joined[train.idx,"target_category"]))*100)
## 
## easy_ham     spam 
##       84       16
## Extract target vector and rest of DF for KNN
tdm_cats <-tdm_joined[,"target_category"] 
tdm_notcats <- tdm_joined[,!colnames(tdm_joined)%in%"target_category"]

Run KNN on k vals 1,3,15

knn_pred <- knn(tdm_notcats[train.idx,],tdm_notcats[test.idx,],tdm_cats[train.idx],k=1)
knn_pred_2<- knn(tdm_notcats[train.idx,],tdm_notcats[test.idx,],tdm_cats[train.idx],k=3)
knn_pred_3<- knn(tdm_notcats[train.idx,],tdm_notcats[test.idx,],tdm_cats[train.idx],k=15)

Confusion matrices for different K vals

confusion_matrix <-table('Predictions'=knn_pred,"Actual"=tdm_cats[test.idx]) 
kable(confusion_matrix)
easy_ham spam
easy_ham 767 2
spam 0 147
confusion_matrix_2 <-table('Predictions'=knn_pred_2,"Actual"=tdm_cats[test.idx]) 
kable(confusion_matrix_2)
easy_ham spam
easy_ham 767 2
spam 0 147
confusion_matrix_3 <-table('Predictions'=knn_pred_3,"Actual"=tdm_cats[test.idx]) 
kable(confusion_matrix_3)
easy_ham spam
easy_ham 767 8
spam 0 141
#k_list <- c(confusion_matrix,confusion_matrix_2,confusion_matrix_3)
#lapply(k_list,function(x){sum(diag(x))/length(test.idx)*100})
accuracy_knn <- sum(diag(confusion_matrix))/length(test.idx)*100
accuracy_knn
## [1] 99.78166
accuracy_knn_2 <- sum(diag(confusion_matrix_2))/length(test.idx)*100
accuracy_knn_2
## [1] 99.78166
accuracy_knn_3 <- sum(diag(confusion_matrix_3))/length(test.idx)*100
accuracy_knn_3
## [1] 99.12664

Attempt at automated cross validation with multiple K vals

trControl <- trainControl(method="cv",
             number=5)


fit <- train(target_category~.,
             method     = "knn",
             tuneGrid   = expand.grid(k = c(1,3,15,20,30)),
             trControl  = trControl,
             metric     = "Accuracy",
             data       = tdm_joined)

fit
## k-Nearest Neighbors 
## 
## 3052 samples
##   90 predictor
##    2 classes: 'easy_ham', 'spam' 
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 2442, 2442, 2440, 2442, 2442 
## Resampling results across tuning parameters:
## 
##   k   Accuracy   Kappa    
##    1  0.9993464  0.9976094
##    3  0.9970513  0.9891929
##   15  0.9918076  0.9695070
##   20  0.9908261  0.9657874
##   30  0.9895157  0.9607334
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 1.