# Download the 2003-02-28 SpamAssassin "spam_2" archive, unpack it relative to
# the working directory, and delete every file in spam_2/ whose name matches
# "cmds".
spam_compress <- "http://spamassassin.apache.org/old/publiccorpus/20030228_spam_2.tar.bz2"
download.file(url = spam_compress, destfile = "spam_compress_20030228.tar.bz2")
# Decompress to spam_compress_20030228.tar; keep the .bz2 (remove = FALSE) and
# replace any .tar left over from an earlier run (overwrite = TRUE).
bunzip2("spam_compress_20030228.tar.bz2", overwrite = TRUE, remove = FALSE)
untar(tarfile = "spam_compress_20030228.tar")
# Only the top level of spam_2/ is searched (recursive = FALSE).
file.remove(
  list.files(path = "spam_2/", pattern = "cmds", full.names = TRUE, recursive = FALSE)
)
## [1] TRUE
# Download the 2003-02-28 SpamAssassin "easy_ham_2" archive, unpack it relative
# to the working directory, and delete every file in easy_ham_2/ whose name
# matches "cmds".
ham_compress <- "http://spamassassin.apache.org/old/publiccorpus/20030228_easy_ham_2.tar.bz2"
download.file(url = ham_compress, destfile = "ham_compress_20030228.tar.bz2")
# Decompress to ham_compress_20030228.tar; keep the .bz2 (remove = FALSE) and
# replace any .tar left over from an earlier run (overwrite = TRUE).
bunzip2("ham_compress_20030228.tar.bz2", overwrite = TRUE, remove = FALSE)
untar(tarfile = "ham_compress_20030228.tar")
# Only the top level of easy_ham_2/ is searched (recursive = FALSE).
file.remove(
  list.files(path = "easy_ham_2/", pattern = "cmds", full.names = TRUE, recursive = FALSE)
)
## [1] TRUE
# Build a balanced, shuffled corpus of 500 spam and 500 ham messages.
#
# Seed the RNG first: without a fixed seed every run draws a different sample
# and a different shuffle, so no result computed from this corpus can be
# reproduced. The seed value itself is arbitrary.
set.seed(20030228)

# One document per file in each extracted directory, read as UTF-8.
spamfiles_Directory <- VCorpus(DirSource(directory = "spam_2/", encoding = "UTF-8"))
hamfiles_Directory <- VCorpus(DirSource(directory = "easy_ham_2/", encoding = "UTF-8"))

# Draw 500 documents from each class so both are equally represented.
spamfiles_sample <- sample(spamfiles_Directory, 500)
hamfiles_sample <- sample(hamfiles_Directory, 500)

# Tag every sampled document with its class in a "type" metadata field.
meta(spamfiles_sample, tag = "type") <- "spam"
meta(hamfiles_sample, tag = "type") <- "ham"

# Merge the two samples, then shuffle so spam and ham are interleaved rather
# than stored as two contiguous runs.
final_files <- c(spamfiles_sample, hamfiles_sample, recursive = TRUE)
final_sample <- sample(final_files)
# Normalise the message text before building the document-term matrix.
#
# Order matters here:
#   1. Lower-case first so stop-word matching is case-insensitive.
#   2. Remove stop words while apostrophes are still present; the English
#      stop-word list contains contractions such as "don't" and "i'm", which
#      can no longer match once punctuation has been stripped.
#   3. Strip punctuation and digits.
#   4. Stem the remaining words.
#   5. Collapse whitespace last, so the gaps left by the removals above are
#      cleaned up as well.
final_sample <- tm_map(final_sample, content_transformer(tolower))
final_sample <- tm_map(final_sample, removeWords, words = stopwords("en"))
final_sample <- tm_map(final_sample, removePunctuation)
final_sample <- tm_map(final_sample, removeNumbers)
final_sample <- tm_map(final_sample, stemDocument)
final_sample <- tm_map(final_sample, stripWhitespace)
# Turn the cleaned corpus into a document-term matrix and wrap it, together
# with the spam/ham labels, in a container split 80% training / 20% test.
d_matrix <- DocumentTermMatrix(final_sample)
# Prune the sparsest terms (sparsity threshold 0.95).
d_matrix <- removeSparseTerms(d_matrix, 0.95)
# Total count of each remaining term across all documents.
dtm_freq_words <- colSums(as.matrix(d_matrix))

# Fetch the labels by column name rather than by position, so the right
# column is used even if the corpus carries additional per-document metadata.
meta_spam <- unlist(meta(final_sample)[["type"]])

# Use whole-number split points: 0.8 * n is fractional whenever n is not a
# multiple of 5, which would yield fractional row indices for the test range.
n_docs <- length(meta_spam)
n_train <- floor(0.8 * n_docs)
stopifnot(n_train >= 1, n_train < n_docs)

model_container <- create_container(
  d_matrix,
  labels = meta_spam,
  trainSize = seq_len(n_train),
  testSize = seq(n_train + 1, n_docs),
  virgin = FALSE
)
# Fit one model per algorithm on the training rows of the shared container.
training_model_svm <- train_model(container = model_container, algorithm = "SVM")
training_model_rf <- train_model(container = model_container, algorithm = "RF")
training_model_maxent <- train_model(container = model_container, algorithm = "MAXENT")
training_model_tree <- train_model(container = model_container, algorithm = "TREE")
training_model_boosting <- train_model(container = model_container, algorithm = "BOOSTING")
training_model_bagging <- train_model(container = model_container, algorithm = "BAGGING")
# Apply each fitted model to the test rows of the container; every result
# holds a predicted label and a probability per test document.
test_model_svm <- classify_model(container = model_container, model = training_model_svm)
test_model_rf <- classify_model(container = model_container, model = training_model_rf)
test_model_maxent <- classify_model(container = model_container, model = training_model_maxent)
test_model_tree <- classify_model(container = model_container, model = training_model_tree)
test_model_boost <- classify_model(container = model_container, model = training_model_boosting)
test_model_bagging <- classify_model(container = model_container, model = training_model_bagging)
# Summarise the SVM predictions: label counts and probability distribution.
describe(x = test_model_svm)
## test_model_svm
##
## 2 Variables 200 Observations
## ---------------------------------------------------------------------------
## SVM_LABEL
## n missing distinct
## 200 0 2
##
## Value ham spam
## Frequency 85 115
## Proportion 0.425 0.575
## ---------------------------------------------------------------------------
## SVM_PROB
## n missing distinct Info Mean Gmd .05 .10
## 200 0 200 1 0.9672 0.0537 0.8474 0.9136
## .25 .50 .75 .90 .95
## 0.9798 0.9955 0.9992 0.9997 0.9999
##
## lowest : 0.5028457 0.5114927 0.5621321 0.5645234 0.6341577
## highest: 0.9999183 0.9999265 0.9999638 0.9999814 0.9999901
## ---------------------------------------------------------------------------
# Summarise the random-forest predictions: label counts and probability distribution.
describe(x = test_model_rf)
## test_model_rf
##
## 2 Variables 200 Observations
## ---------------------------------------------------------------------------
## FORESTS_LABEL
## n missing distinct
## 200 0 2
##
## Value ham spam
## Frequency 88 112
## Proportion 0.44 0.56
## ---------------------------------------------------------------------------
## FORESTS_PROB
## n missing distinct Info Mean Gmd .05 .10
## 200 0 50 0.991 0.9357 0.08928 0.6950 0.7595
## .25 .50 .75 .90 .95
## 0.9350 0.9800 0.9950 1.0000 1.0000
##
## lowest : 0.500 0.510 0.550 0.580 0.615, highest: 0.980 0.985 0.990 0.995 1.000
## ---------------------------------------------------------------------------
# Summarise the maximum-entropy predictions: label counts and probability distribution.
describe(x = test_model_maxent)
## test_model_maxent
##
## 2 Variables 200 Observations
## ---------------------------------------------------------------------------
## MAXENTROPY_LABEL
## n missing distinct
## 200 0 2
##
## Value ham spam
## Frequency 87 113
## Proportion 0.435 0.565
## ---------------------------------------------------------------------------
## MAXENTROPY_PROB
## n missing distinct Info Mean Gmd .05 .10
## 200 0 101 0.899 0.9948 0.01021 0.9995 1.0000
## .25 .50 .75 .90 .95
## 1.0000 1.0000 1.0000 1.0000 1.0000
##
## lowest : 0.6521422 0.7800613 0.8558776 0.8605409 0.8934212
## highest: 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000
##
## Value 0.650 0.780 0.855 0.860 0.895 0.955 0.975 0.995 1.000
## Frequency 1 1 1 1 1 1 1 2 191
## Proportion 0.005 0.005 0.005 0.005 0.005 0.005 0.005 0.010 0.955
##
## For the frequency table, variable is rounded to the nearest 0.005
## ---------------------------------------------------------------------------
# Summarise the decision-tree predictions: label counts and probability distribution.
describe(x = test_model_tree)
## test_model_tree
##
## 2 Variables 200 Observations
## ---------------------------------------------------------------------------
## TREE_LABEL
## n missing distinct
## 200 0 2
##
## Value ham spam
## Frequency 87 113
## Proportion 0.435 0.565
## ---------------------------------------------------------------------------
## TREE_PROB
## n missing distinct Info Mean Gmd
## 200 0 8 0.617 0.9705 0.05184
##
## lowest : 0.5555556 0.7600000 0.8125000 0.8333333 0.8709677
## highest: 0.8333333 0.8709677 0.9642857 0.9793103 1.0000000
##
## Value 0.5555556 0.7600000 0.8125000 0.8333333 0.8709677 0.9642857
## Frequency 4 4 6 4 4 11
## Proportion 0.020 0.020 0.030 0.020 0.020 0.055
##
## Value 0.9793103 1.0000000
## Frequency 22 145
## Proportion 0.110 0.725
## ---------------------------------------------------------------------------
# Summarise the boosting predictions: label counts and probability distribution.
describe(x = test_model_boost)
## test_model_boost
##
## 2 Variables 200 Observations
## ---------------------------------------------------------------------------
## LOGITBOOST_LABEL
## n missing distinct
## 200 0 2
##
## Value ham spam
## Frequency 88 112
## Proportion 0.44 0.56
## ---------------------------------------------------------------------------
## LOGITBOOST_PROB
## n missing distinct Info Mean Gmd .05 .10
## 200 0 29 0.995 0.9927 0.01439 0.9968 0.9997
## .25 .50 .75 .90 .95
## 1.0000 1.0000 1.0000 1.0000 1.0000
##
## lowest : 0.5000000 0.8807971 0.8807971 0.9820138 0.9975274
## highest: 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000
##
## Value 0.500 0.881 0.982 0.998 1.000
## Frequency 2 3 5 5 185
## Proportion 0.010 0.015 0.025 0.025 0.925
##
## For the frequency table, variable is rounded to the nearest 0.001
## ---------------------------------------------------------------------------
# Summarise the bagging predictions: label counts and probability distribution.
describe(x = test_model_bagging)
## test_model_bagging
##
## 2 Variables 200 Observations
## ---------------------------------------------------------------------------
## BAGGING_LABEL
## n missing distinct
## 200 0 2
##
## Value ham spam
## Frequency 89 111
## Proportion 0.445 0.555
## ---------------------------------------------------------------------------
## BAGGING_PROB
## n missing distinct Info Mean Gmd .05 .10
## 200 0 11 0.634 0.9584 0.06891 0.798 0.840
## .25 .50 .75 .90 .95
## 0.960 1.000 1.000 1.000 1.000
##
## lowest : 0.52 0.56 0.64 0.68 0.76, highest: 0.84 0.88 0.92 0.96 1.00
##
## Value 0.52 0.56 0.64 0.68 0.76 0.80 0.84 0.88 0.92 0.96
## Frequency 1 1 3 3 2 7 4 10 15 11
## Proportion 0.005 0.005 0.015 0.015 0.010 0.035 0.020 0.050 0.075 0.055
##
## Value 1.00
## Frequency 143
## Proportion 0.715
## ---------------------------------------------------------------------------
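The six classifiers produced very similar predictions on the 200 test messages: each labelled between 85 and 89 of them as ham and between 111 and 115 as spam, so the results differ only marginally.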