607_Document_Classification_Jeyaraman

Cleanup Input Files

# Download the SpamAssassin spam archive, unpack it, and delete any extracted
# files whose names match "cmds".
spam_compress <- 'http://spamassassin.apache.org/old/publiccorpus/20030228_spam_2.tar.bz2'
spam_archive <- "spam_compress_20030228.tar.bz2"

download.file(url = spam_compress, destfile = spam_archive)

# Decompress next to the download: the .bz2 is kept and an existing .tar is
# overwritten.
bunzip2(spam_archive, overwrite = TRUE, remove = FALSE)

# NOTE(review): the archive is presumed to unpack into "spam_2/" -- confirm.
untar(tarfile = "spam_compress_20030228.tar")

# Only the top level of spam_2/ is searched (recursive = FALSE).
spam_cmds <- list.files(
  path = "spam_2/",
  pattern = "cmds",
  recursive = FALSE,
  full.names = TRUE
)
file.remove(spam_cmds)

## [1] TRUE

# Download the SpamAssassin "easy ham" archive, unpack it, and delete any
# extracted files whose names match "cmds".
ham_compress <- 'http://spamassassin.apache.org/old/publiccorpus/20030228_easy_ham_2.tar.bz2'
ham_archive <- "ham_compress_20030228.tar.bz2"

download.file(url = ham_compress, destfile = ham_archive)

# Decompress next to the download: the .bz2 is kept and an existing .tar is
# overwritten.
bunzip2(ham_archive, overwrite = TRUE, remove = FALSE)

# NOTE(review): the archive is presumed to unpack into "easy_ham_2/" -- confirm.
untar(tarfile = "ham_compress_20030228.tar")

# Only the top level of easy_ham_2/ is searched (recursive = FALSE).
ham_cmds <- list.files(
  path = "easy_ham_2/",
  pattern = "cmds",
  recursive = FALSE,
  full.names = TRUE
)
file.remove(ham_cmds)

## [1] TRUE

Merge Spam and Ham Files

# Build a balanced, shuffled, pre-processed corpus of 500 spam and 500 ham
# messages.

# Fix the RNG seed so the sampled messages and the shuffle below (and hence
# the later positional train/test split) are the same on every run.
set.seed(607)

spamfiles_Directory <- VCorpus(DirSource(directory = "spam_2/", encoding = "UTF-8"))
hamfiles_Directory <- VCorpus(DirSource(directory = "easy_ham_2/", encoding = "UTF-8"))

spamfiles_sample <- sample(spamfiles_Directory, 500)
hamfiles_sample <- sample(hamfiles_Directory, 500)

# Tag every document with its class; these tags are read back later as the
# labels for the model container.
meta(spamfiles_sample, tag = "type") <- "spam"
meta(hamfiles_sample, tag = "type") <- "ham"

final_files <- c(spamfiles_sample, hamfiles_sample, recursive = TRUE)

# Shuffle so spam and ham are interleaved before the positional split.
final_sample <- sample(final_files)

# Order matters here:
#   * lower-case first so stop-word matching is case-insensitive;
#   * remove stop words BEFORE punctuation, otherwise contractions such as
#     "don't" become "dont" and no longer match the stop-word list;
#   * collapse whitespace last, after every step that can leave gaps.
final_sample <- tm_map(final_sample, content_transformer(tolower))
final_sample <- tm_map(final_sample, removeWords, words = stopwords("en"))
final_sample <- tm_map(final_sample, removePunctuation)
final_sample <- tm_map(final_sample, removeNumbers)
final_sample <- tm_map(final_sample, stemDocument)
final_sample <- tm_map(final_sample, stripWhitespace)

Create Document Matrix

# Build the document-term matrix and wrap it, with its labels, in a container
# split 80/20 by position into training and test rows.
d_matrix <- DocumentTermMatrix(final_sample)

# Keep only terms whose sparsity is at most 0.95, i.e. drop terms that are
# missing from more than 95% of the documents.
d_matrix <- removeSparseTerms(d_matrix, 0.95)

# Total count of each retained term across all documents.
dtm_freq_words <- colSums(as.matrix(d_matrix))

# Read the class labels by tag name rather than by column position, so extra
# metadata columns cannot silently change which column is used.
meta_spam <- unlist(meta(final_sample)[["type"]])

# Use whole-number split bounds: a fractional bound such as 0.8 * 999 would
# otherwise produce non-integer test indices and drop the last document.
n_docs <- length(meta_spam)
n_train <- floor(0.8 * n_docs)

model_container <- create_container(
  d_matrix,
  labels = meta_spam,
  trainSize = seq_len(n_train),
  testSize = seq(n_train + 1, n_docs),
  virgin = FALSE
)

Prepare Training and Test Models

# Train one model per algorithm on the container's training rows, in this
# fixed order, then classify the container's test rows with each model.
algorithms <- c("SVM", "RF", "MAXENT", "TREE", "BOOSTING", "BAGGING")
fitted_models <- lapply(
  algorithms,
  function(algorithm) train_model(model_container, algorithm)
)
names(fitted_models) <- algorithms

training_model_svm <- fitted_models[["SVM"]]
training_model_rf <- fitted_models[["RF"]]
training_model_maxent <- fitted_models[["MAXENT"]]
training_model_tree <- fitted_models[["TREE"]]
training_model_boosting <- fitted_models[["BOOSTING"]]
training_model_bagging <- fitted_models[["BAGGING"]]

# Classification runs in the same order as training.
test_model_svm <- classify_model(container = model_container, model = training_model_svm)
test_model_rf <- classify_model(container = model_container, model = training_model_rf)
test_model_maxent <- classify_model(container = model_container, model = training_model_maxent)
test_model_tree <- classify_model(container = model_container, model = training_model_tree)
test_model_boost <- classify_model(container = model_container, model = training_model_boosting)
test_model_bagging <- classify_model(container = model_container, model = training_model_bagging)

Result of Test Models

# Summarise the SVM results: predicted label counts and the distribution of
# the reported probabilities.
describe(test_model_svm)

## test_model_svm 
## 
##  2  Variables      200  Observations
## ---------------------------------------------------------------------------
## SVM_LABEL 
##        n  missing distinct 
##      200        0        2 
##                       
## Value        ham  spam
## Frequency     85   115
## Proportion 0.425 0.575
## ---------------------------------------------------------------------------
## SVM_PROB 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      200        0      200        1   0.9672   0.0537   0.8474   0.9136 
##      .25      .50      .75      .90      .95 
##   0.9798   0.9955   0.9992   0.9997   0.9999 
## 
## lowest : 0.5028457 0.5114927 0.5621321 0.5645234 0.6341577
## highest: 0.9999183 0.9999265 0.9999638 0.9999814 0.9999901
## ---------------------------------------------------------------------------

# Summarise the random forest results: predicted label counts and the
# distribution of the reported probabilities.
describe(test_model_rf)

## test_model_rf 
## 
##  2  Variables      200  Observations
## ---------------------------------------------------------------------------
## FORESTS_LABEL 
##        n  missing distinct 
##      200        0        2 
##                     
## Value       ham spam
## Frequency    88  112
## Proportion 0.44 0.56
## ---------------------------------------------------------------------------
## FORESTS_PROB 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      200        0       50    0.991   0.9357  0.08928   0.6950   0.7595 
##      .25      .50      .75      .90      .95 
##   0.9350   0.9800   0.9950   1.0000   1.0000 
## 
## lowest : 0.500 0.510 0.550 0.580 0.615, highest: 0.980 0.985 0.990 0.995 1.000
## ---------------------------------------------------------------------------

# Summarise the maximum entropy results: predicted label counts and the
# distribution of the reported probabilities.
describe(test_model_maxent)

## test_model_maxent 
## 
##  2  Variables      200  Observations
## ---------------------------------------------------------------------------
## MAXENTROPY_LABEL 
##        n  missing distinct 
##      200        0        2 
##                       
## Value        ham  spam
## Frequency     87   113
## Proportion 0.435 0.565
## ---------------------------------------------------------------------------
## MAXENTROPY_PROB 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      200        0      101    0.899   0.9948  0.01021   0.9995   1.0000 
##      .25      .50      .75      .90      .95 
##   1.0000   1.0000   1.0000   1.0000   1.0000 
## 
## lowest : 0.6521422 0.7800613 0.8558776 0.8605409 0.8934212
## highest: 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000
##                                                                 
## Value      0.650 0.780 0.855 0.860 0.895 0.955 0.975 0.995 1.000
## Frequency      1     1     1     1     1     1     1     2   191
## Proportion 0.005 0.005 0.005 0.005 0.005 0.005 0.005 0.010 0.955
## 
## For the frequency table, variable is rounded to the nearest 0.005
## ---------------------------------------------------------------------------

# Summarise the tree results: predicted label counts and the distribution of
# the reported probabilities.
describe(test_model_tree)

## test_model_tree 
## 
##  2  Variables      200  Observations
## ---------------------------------------------------------------------------
## TREE_LABEL 
##        n  missing distinct 
##      200        0        2 
##                       
## Value        ham  spam
## Frequency     87   113
## Proportion 0.435 0.565
## ---------------------------------------------------------------------------
## TREE_PROB 
##        n  missing distinct     Info     Mean      Gmd 
##      200        0        8    0.617   0.9705  0.05184 
## 
## lowest : 0.5555556 0.7600000 0.8125000 0.8333333 0.8709677
## highest: 0.8333333 0.8709677 0.9642857 0.9793103 1.0000000
##                                                                       
## Value      0.5555556 0.7600000 0.8125000 0.8333333 0.8709677 0.9642857
## Frequency          4         4         6         4         4        11
## Proportion     0.020     0.020     0.030     0.020     0.020     0.055
##                               
## Value      0.9793103 1.0000000
## Frequency         22       145
## Proportion     0.110     0.725
## ---------------------------------------------------------------------------

# Summarise the boosting results: predicted label counts and the distribution
# of the reported probabilities.
describe(test_model_boost)

## test_model_boost 
## 
##  2  Variables      200  Observations
## ---------------------------------------------------------------------------
## LOGITBOOST_LABEL 
##        n  missing distinct 
##      200        0        2 
##                     
## Value       ham spam
## Frequency    88  112
## Proportion 0.44 0.56
## ---------------------------------------------------------------------------
## LOGITBOOST_PROB 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      200        0       29    0.995   0.9927  0.01439   0.9968   0.9997 
##      .25      .50      .75      .90      .95 
##   1.0000   1.0000   1.0000   1.0000   1.0000 
## 
## lowest : 0.5000000 0.8807971 0.8807971 0.9820138 0.9975274
## highest: 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000
##                                         
## Value      0.500 0.881 0.982 0.998 1.000
## Frequency      2     3     5     5   185
## Proportion 0.010 0.015 0.025 0.025 0.925
## 
## For the frequency table, variable is rounded to the nearest 0.001
## ---------------------------------------------------------------------------

# Summarise the bagging results: predicted label counts and the distribution
# of the reported probabilities.
describe(test_model_bagging)

## test_model_bagging 
## 
##  2  Variables      200  Observations
## ---------------------------------------------------------------------------
## BAGGING_LABEL 
##        n  missing distinct 
##      200        0        2 
##                       
## Value        ham  spam
## Frequency     89   111
## Proportion 0.445 0.555
## ---------------------------------------------------------------------------
## BAGGING_PROB 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      200        0       11    0.634   0.9584  0.06891    0.798    0.840 
##      .25      .50      .75      .90      .95 
##    0.960    1.000    1.000    1.000    1.000 
## 
## lowest : 0.52 0.56 0.64 0.68 0.76, highest: 0.84 0.88 0.92 0.96 1.00
##                                                                       
## Value       0.52  0.56  0.64  0.68  0.76  0.80  0.84  0.88  0.92  0.96
## Frequency      1     1     3     3     2     7     4    10    15    11
## Proportion 0.005 0.005 0.015 0.015 0.010 0.035 0.020 0.050 0.075 0.055
##                 
## Value       1.00
## Frequency    143
## Proportion 0.715
## ---------------------------------------------------------------------------

Observation

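All six classifiers produced very similar predictions on the 200 test documents: each labelled between 85 and 89 messages as ham and between 111 and 115 as spam. These summaries describe only the predicted labels and their reported probabilities; they do not compare the predictions with the true labels, so they show that the models agree with one another rather than how accurate each model is.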

607_Document_Classification_Jeyaraman_Ramalingam

Jeyaraman Ramalingam

11/17/2019

Cleanup Input Files

Merge Spam and Ham Files

Create Document Matrix

Prepare Training and Test Models

Result of Test Models

Observation