** Assignment 607_Homework: R_Week_10a_Hmk_607_Text_Mining_Daniel_Thonn **
Summary of Assignment This assignment involves mining text data from example ham and spam files
This Assignment requires the following:
1). R-Studio
The following R packages are used: 1. stringr 2. SnowballC 3. RTextTools 4. tm. The following packages are listed but commented out (not loaded): tidyr, dplyr, ggplot2, httr, tidyjson, data.table.
Steps to reproduce: 1) Download the ham and spam archive files from https://spamassassin.apache.org/publiccorpus/ into "C:/mydata". 2) Unzip the two files (each one has to be unzipped twice) so that the folders easy_ham and spam_2 exist in "C:/mydata". 3) Run the R-Studio file: R_Week_10a_Hmk_607_Text_Mining_Daniel_Thonn.Rmd
Setting up and Preparing the Environment
Load the libraries needed
#install.packages("stringr")
library(stringr)
#install.packages("SnowballC")
library(SnowballC)
## Warning: package 'SnowballC' was built under R version 3.3.2
#install.packages("RTextTools")
suppressWarnings(library(RTextTools))
## Loading required package: SparseM
##
## Attaching package: 'SparseM'
## The following object is masked from 'package:base':
##
## backsolve
##
## Attaching package: 'RTextTools'
## The following objects are masked from 'package:SnowballC':
##
## getStemLanguages, wordStem
#install.packages("tm")
library(tm)
## Warning: package 'tm' was built under R version 3.3.2
## Loading required package: NLP
## Warning: package 'NLP' was built under R version 3.3.2
##
#install.packages("tidyr")
#library(tidyr)
#install.packages("dplyr")
#library(dplyr)
#install.packages("ggplot2")
#library(ggplot2)
#install.packages("httr")
#library(httr)
#install.packages("tidyjson")
#library(tidyjson)
#install.packages("data.table")
#library(data.table)
Obtain data from: https://spamassassin.apache.org/publiccorpus/
# Root folder that holds the unpacked SpamAssassin corpora.
# The corpus paths are built from it explicitly instead of calling setwd():
# changing the working directory is a global side effect, and relative paths
# stop resolving as soon as the working directory is different.
data_dir <- "C:/mydata"
easyham_path <- file.path(data_dir, "easy_ham")
spam_path <- file.path(data_dir, "spam_2")
# load Corpus for ham: read, as plain text, every file in easyham_path whose
# name matches the pattern "\\d+"
corpus_ham <- Corpus(
  DirSource(directory = easyham_path, pattern = "\\d+"),
  readerControl = list(reader = readPlain)
)
# number of ham documents that were read
length(corpus_ham)
## [1] 2500
# [1] 2500
# head(corpus_ham)
# show the metadata stored on the first ham document
first_ham_doc <- corpus_ham[[1]]
meta(first_ham_doc)
## author : character(0)
## datetimestamp: 2016-11-07 01:22:36
## description : character(0)
## heading : character(0)
## id : 00001.7c53336b37003a9286aba55d2945844c
## language : en
## origin : character(0)
# build the spam corpus: read, as plain text, every file in spam_path whose
# name matches the pattern "\\d+"
spam_source <- DirSource(directory = spam_path, pattern = "\\d+")
corpus_spam <- Corpus(spam_source, readerControl = list(reader = readPlain))
# number of spam documents that were read
length(corpus_spam)
## [1] 1396
# [1] 1396
# check meta tags of the first document in corpus_spam
meta(corpus_spam[[1]])
## author : character(0)
## datetimestamp: 2016-11-07 01:22:43
## description : character(0)
## heading : character(0)
## id : 00001.317e78fa8ee2f54cd4890fdc09ba8176
## language : en
## origin : character(0)
Add meta tags for ham and spam documents
# label every ham document with emailtype = "ham"
meta(corpus_ham, tag = "emailtype") <- "ham"
# preview the corpus-level metadata to confirm the new tag is present
ham_meta <- meta(corpus_ham)
head(ham_meta)
## emailtype
## 1 ham
## 2 ham
## 3 ham
## 4 ham
## 5 ham
## 6 ham
# label every spam document with emailtype = "spam"
meta(corpus_spam, tag = "emailtype") <- "spam"
# check meta tags of corpus_spam
head(meta(corpus_spam))
## emailtype
## 1 spam
## 2 spam
## 3 spam
## 4 spam
## 5 spam
## 6 spam
# combine both corpora into one (ham documents first, then spam) and
# report the total number of documents
corpus_all <- c(corpus_ham, corpus_spam)
length(x = corpus_all)
## [1] 3896
# [1] 3896
Check the TermDocumentMatrix and Cleanup
# tdm1 <- TermDocumentMatrix(corpus_all)
# tdm1
# remove numbers
corpus_all2 <- tm_map(corpus_all, removeNumbers)
# tdm2 <- TermDocumentMatrix(corpus_all2)
# tdm2
# remove stop words
# removeWords() matches case-sensitively and stopwords("en") is all lower
# case, so the text is lower-cased first. Without this step capitalised stop
# words ("The", "And", ...) are not removed and come back as ordinary terms
# once DocumentTermMatrix() applies its default lower-casing.
corpus_lower <- tm_map(corpus_all2, content_transformer(tolower))
corpus_all3 <- tm_map(corpus_lower, removeWords, words = stopwords("en"))
# tdm3 <- TermDocumentMatrix(corpus_all3)
# tdm3
# stem the terms
corpus_all4 <- tm_map(corpus_all3, stemDocument)
Convert to a Document Term Matrix
# build the document-term matrix from the cleaned, stemmed corpus
dtm1 <- DocumentTermMatrix(x = corpus_all4)
# print the matrix summary (dimensions, sparsity, weighting)
dtm1
## <<DocumentTermMatrix (documents: 3896, terms: 126620)>>
## Non-/sparse entries: 754821/492556699
## Sparsity : 100%
## Maximal term length: 977
## Weighting : term frequency (tf)
# remove sparse terms: the allowed sparsity is 1 - 10 / (number of
# documents), i.e. terms present in fewer than roughly 10 documents go
n_docs_all <- length(corpus_all4)
sparsity_cutoff <- 1 - (10 / n_docs_all)
dtm2 <- removeSparseTerms(dtm1, sparsity_cutoff)
# NOTE(review): dtm2 is not referenced again in the visible code; the models
# further down are built from dtm3 -- confirm whether that is intended.
dtm2
## <<DocumentTermMatrix (documents: 3896, terms: 6711)>>
## Non-/sparse entries: 544214/25601842
## Sparsity : 98%
## Maximal term length: 90
## Weighting : term frequency (tf)
Create sample, DocumentTermMatrix, and Container for testing and modeling
# Fix the RNG seed so that the 1000-document sample -- and with it the
# train/test split and every accuracy figure reported below -- can be
# reproduced from run to run.
set.seed(607)
sample1 <- sample(corpus_all4, 1000)
# preview the emailtype labels of the first sampled documents
head(meta(sample1))
## emailtype
## 767 ham
## 304 ham
## 2694 spam
## 1256 ham
## 1173 ham
## 2525 spam
# confirm the sample size
length(x = sample1)
## [1] 1000
# document-term matrix built from the sampled documents only
dtm3 <- DocumentTermMatrix(x = sample1)
dtm3
## <<DocumentTermMatrix (documents: 1000, terms: 50190)>>
## Non-/sparse entries: 199162/49990838
## Sparsity : 100%
## Maximal term length: 273
## Weighting : term frequency (tf)
# pull the emailtype label of every sampled document into a vector
sample_meta <- meta(sample1, "emailtype")
type1 <- unlist(sample_meta[, 1])
# print all labels
type1
## [1] "ham" "ham" "spam" "ham" "ham" "spam" "ham" "spam" "ham"
## [10] "ham" "spam" "spam" "ham" "ham" "spam" "spam" "ham" "spam"
## [19] "ham" "ham" "ham" "ham" "ham" "spam" "spam" "ham" "ham"
## [28] "spam" "ham" "spam" "spam" "ham" "ham" "ham" "ham" "ham"
## [37] "ham" "spam" "ham" "ham" "ham" "ham" "spam" "spam" "ham"
## [46] "ham" "spam" "ham" "ham" "ham" "ham" "ham" "ham" "spam"
## [55] "ham" "ham" "spam" "spam" "ham" "ham" "ham" "ham" "ham"
## [64] "spam" "ham" "ham" "ham" "ham" "spam" "spam" "spam" "ham"
## [73] "ham" "spam" "ham" "spam" "ham" "spam" "spam" "ham" "ham"
## [82] "ham" "spam" "spam" "ham" "ham" "spam" "spam" "ham" "spam"
## [91] "ham" "spam" "spam" "ham" "ham" "ham" "spam" "ham" "ham"
## [100] "ham" "ham" "ham" "spam" "ham" "ham" "ham" "ham" "spam"
## [109] "spam" "ham" "ham" "ham" "ham" "spam" "spam" "ham" "ham"
## [118] "ham" "spam" "spam" "ham" "ham" "ham" "ham" "ham" "spam"
## [127] "ham" "spam" "ham" "ham" "ham" "ham" "ham" "ham" "spam"
## [136] "spam" "ham" "spam" "spam" "ham" "spam" "ham" "spam" "ham"
## [145] "spam" "ham" "ham" "ham" "ham" "spam" "ham" "ham" "ham"
## [154] "ham" "ham" "ham" "ham" "ham" "spam" "spam" "ham" "ham"
## [163] "ham" "spam" "spam" "ham" "ham" "spam" "spam" "spam" "ham"
## [172] "ham" "spam" "ham" "spam" "ham" "spam" "spam" "ham" "ham"
## [181] "spam" "spam" "spam" "ham" "spam" "ham" "ham" "ham" "spam"
## [190] "spam" "spam" "spam" "ham" "spam" "ham" "ham" "ham" "ham"
## [199] "ham" "spam" "ham" "ham" "ham" "spam" "ham" "ham" "ham"
## [208] "ham" "ham" "ham" "spam" "ham" "ham" "ham" "ham" "ham"
## [217] "spam" "ham" "ham" "ham" "ham" "ham" "ham" "ham" "ham"
## [226] "spam" "spam" "ham" "spam" "ham" "spam" "ham" "spam" "ham"
## [235] "ham" "ham" "spam" "spam" "spam" "ham" "spam" "ham" "ham"
## [244] "ham" "ham" "spam" "ham" "ham" "spam" "ham" "ham" "spam"
## [253] "spam" "ham" "spam" "ham" "spam" "ham" "spam" "ham" "spam"
## [262] "ham" "spam" "ham" "ham" "spam" "ham" "spam" "ham" "ham"
## [271] "ham" "ham" "ham" "ham" "spam" "ham" "ham" "ham" "ham"
## [280] "ham" "ham" "ham" "ham" "ham" "ham" "ham" "ham" "ham"
## [289] "ham" "spam" "ham" "spam" "ham" "ham" "ham" "spam" "spam"
## [298] "spam" "ham" "spam" "spam" "ham" "spam" "ham" "ham" "ham"
## [307] "ham" "ham" "ham" "ham" "ham" "ham" "spam" "ham" "ham"
## [316] "ham" "ham" "ham" "spam" "ham" "spam" "ham" "spam" "spam"
## [325] "ham" "spam" "ham" "ham" "spam" "spam" "ham" "ham" "ham"
## [334] "ham" "ham" "ham" "ham" "ham" "spam" "spam" "ham" "spam"
## [343] "ham" "ham" "spam" "ham" "ham" "ham" "ham" "ham" "spam"
## [352] "ham" "ham" "spam" "ham" "ham" "spam" "ham" "ham" "spam"
## [361] "ham" "ham" "spam" "ham" "ham" "ham" "spam" "spam" "spam"
## [370] "spam" "ham" "ham" "ham" "ham" "spam" "ham" "ham" "ham"
## [379] "spam" "spam" "ham" "spam" "ham" "ham" "spam" "spam" "ham"
## [388] "ham" "spam" "spam" "ham" "ham" "spam" "ham" "ham" "spam"
## [397] "ham" "ham" "ham" "spam" "ham" "ham" "ham" "spam" "ham"
## [406] "ham" "ham" "ham" "ham" "spam" "ham" "ham" "spam" "ham"
## [415] "ham" "spam" "spam" "ham" "spam" "ham" "spam" "ham" "ham"
## [424] "ham" "ham" "ham" "ham" "ham" "ham" "spam" "ham" "spam"
## [433] "ham" "spam" "spam" "ham" "spam" "ham" "spam" "ham" "ham"
## [442] "spam" "ham" "ham" "ham" "ham" "ham" "spam" "ham" "spam"
## [451] "spam" "ham" "ham" "ham" "ham" "ham" "ham" "spam" "ham"
## [460] "ham" "ham" "ham" "ham" "spam" "ham" "spam" "spam" "spam"
## [469] "ham" "ham" "spam" "ham" "ham" "spam" "spam" "ham" "spam"
## [478] "ham" "ham" "spam" "spam" "spam" "ham" "ham" "ham" "spam"
## [487] "ham" "spam" "spam" "ham" "ham" "ham" "spam" "ham" "ham"
## [496] "ham" "ham" "spam" "ham" "ham" "ham" "ham" "spam" "ham"
## [505] "ham" "spam" "ham" "ham" "ham" "spam" "spam" "spam" "ham"
## [514] "spam" "ham" "spam" "ham" "ham" "spam" "spam" "spam" "ham"
## [523] "ham" "ham" "ham" "spam" "spam" "spam" "spam" "spam" "spam"
## [532] "ham" "spam" "spam" "ham" "ham" "spam" "ham" "ham" "spam"
## [541] "ham" "spam" "ham" "ham" "ham" "ham" "spam" "ham" "ham"
## [550] "ham" "ham" "spam" "ham" "spam" "spam" "spam" "ham" "ham"
## [559] "ham" "ham" "spam" "ham" "ham" "ham" "ham" "ham" "ham"
## [568] "ham" "ham" "spam" "ham" "ham" "spam" "ham" "spam" "ham"
## [577] "ham" "spam" "ham" "spam" "ham" "ham" "ham" "spam" "spam"
## [586] "ham" "ham" "spam" "ham" "ham" "ham" "spam" "ham" "spam"
## [595] "ham" "ham" "ham" "spam" "ham" "spam" "spam" "ham" "ham"
## [604] "ham" "ham" "ham" "spam" "ham" "ham" "ham" "ham" "spam"
## [613] "ham" "spam" "ham" "ham" "ham" "spam" "ham" "spam" "ham"
## [622] "spam" "ham" "ham" "ham" "spam" "ham" "spam" "spam" "spam"
## [631] "ham" "spam" "spam" "spam" "spam" "spam" "ham" "spam" "spam"
## [640] "spam" "ham" "ham" "ham" "ham" "ham" "ham" "ham" "ham"
## [649] "ham" "ham" "ham" "spam" "ham" "ham" "spam" "ham" "ham"
## [658] "spam" "ham" "spam" "spam" "ham" "ham" "ham" "ham" "ham"
## [667] "ham" "ham" "ham" "ham" "ham" "ham" "spam" "ham" "ham"
## [676] "spam" "spam" "spam" "spam" "spam" "ham" "ham" "ham" "spam"
## [685] "ham" "ham" "ham" "ham" "ham" "spam" "spam" "ham" "ham"
## [694] "spam" "ham" "spam" "ham" "spam" "ham" "ham" "ham" "ham"
## [703] "spam" "spam" "ham" "ham" "ham" "spam" "ham" "ham" "ham"
## [712] "spam" "ham" "spam" "spam" "ham" "spam" "ham" "ham" "ham"
## [721] "ham" "ham" "spam" "spam" "spam" "spam" "ham" "ham" "spam"
## [730] "ham" "ham" "spam" "ham" "ham" "spam" "spam" "spam" "ham"
## [739] "ham" "ham" "ham" "spam" "ham" "ham" "ham" "spam" "spam"
## [748] "spam" "ham" "spam" "ham" "spam" "ham" "spam" "spam" "spam"
## [757] "ham" "ham" "ham" "ham" "spam" "ham" "spam" "ham" "spam"
## [766] "spam" "ham" "ham" "ham" "spam" "ham" "spam" "ham" "ham"
## [775] "ham" "ham" "spam" "spam" "ham" "spam" "ham" "ham" "spam"
## [784] "ham" "spam" "ham" "ham" "ham" "ham" "spam" "ham" "spam"
## [793] "ham" "ham" "ham" "ham" "spam" "ham" "ham" "ham" "spam"
## [802] "ham" "ham" "spam" "ham" "ham" "ham" "ham" "ham" "spam"
## [811] "ham" "spam" "ham" "spam" "spam" "spam" "ham" "ham" "spam"
## [820] "ham" "ham" "ham" "ham" "spam" "spam" "ham" "ham" "spam"
## [829] "ham" "ham" "ham" "spam" "ham" "spam" "ham" "spam" "ham"
## [838] "ham" "ham" "ham" "ham" "ham" "ham" "spam" "spam" "ham"
## [847] "ham" "ham" "ham" "spam" "ham" "ham" "ham" "ham" "ham"
## [856] "ham" "ham" "ham" "spam" "ham" "spam" "ham" "ham" "spam"
## [865] "ham" "ham" "ham" "spam" "spam" "spam" "spam" "ham" "spam"
## [874] "spam" "ham" "ham" "spam" "ham" "spam" "spam" "spam" "ham"
## [883] "ham" "spam" "ham" "spam" "ham" "spam" "ham" "spam" "ham"
## [892] "ham" "ham" "ham" "ham" "ham" "ham" "spam" "ham" "ham"
## [901] "ham" "ham" "ham" "ham" "ham" "ham" "ham" "spam" "ham"
## [910] "ham" "ham" "spam" "ham" "spam" "spam" "ham" "ham" "spam"
## [919] "spam" "spam" "spam" "ham" "spam" "ham" "ham" "ham" "ham"
## [928] "ham" "ham" "spam" "ham" "ham" "ham" "spam" "spam" "spam"
## [937] "ham" "ham" "ham" "ham" "ham" "spam" "ham" "spam" "ham"
## [946] "ham" "spam" "ham" "ham" "ham" "ham" "ham" "spam" "spam"
## [955] "spam" "ham" "ham" "ham" "ham" "ham" "spam" "spam" "ham"
## [964] "ham" "ham" "spam" "ham" "spam" "spam" "ham" "ham" "ham"
## [973] "ham" "spam" "ham" "ham" "spam" "spam" "ham" "ham" "ham"
## [982] "spam" "ham" "ham" "spam" "spam" "ham" "ham" "ham" "ham"
## [991] "ham" "ham" "ham" "ham" "spam" "ham" "ham" "ham" "ham"
## [1000] "ham"
# wrap the sample matrix and its labels in an RTextTools container:
# documents 1-400 form the training set, the remaining ones the test set
n_sampled <- length(type1)
container1 <- create_container(
  dtm3,
  labels = type1,
  trainSize = 1:400,
  testSize = 401:n_sampled,
  virgin = FALSE
)
Create Training Models
# fit one model per algorithm on the training part of container1
svm_model_1      <- train_model(container1, algorithm = "SVM")
boosting_model_1 <- train_model(container1, algorithm = "BOOSTING")
glmnet_model_1   <- train_model(container1, algorithm = "GLMNET")
maxent_model_1   <- train_model(container1, algorithm = "MAXENT")
# apply every trained model to the test part of container1
svm_classified_1    <- classify_model(container1, model = svm_model_1)
# NOTE: despite its name, rf_classified_1 holds the predictions of the
# BOOSTING model; no random forest is trained in this script
rf_classified_1     <- classify_model(container1, model = boosting_model_1)
glmnet_classified_1 <- classify_model(container1, model = glmnet_model_1)
maxent_classified_1 <- classify_model(container1, model = maxent_model_1)
# create dataframes for analysis: the true label of every test document
# (positions 401 onwards in type1) next to the label predicted by each
# model, taken from column 1 of the corresponding classify_model() result
test_rows <- 401:length(type1)
classification_DF <- data.frame(
  label = type1[test_rows],
  svm = svm_classified_1[, 1],
  # the column keeps its original name "rf" but holds BOOSTING predictions
  rf = rf_classified_1[, 1],
  glmnet = glmnet_classified_1[, 1],
  maxent = maxent_classified_1[, 1],
  # spelled out as FALSE: F is an ordinary variable that can be reassigned
  stringsAsFactors = FALSE
)
# preview results
head(classification_DF)
## label svm rf glmnet maxent
## 1 ham ham ham ham ham
## 2 ham ham ham ham ham
## 3 ham ham ham ham ham
## 4 spam spam spam spam spam
## 5 ham ham ham ham ham
## 6 ham ham ham ham ham
Review Results
## Support Vector Machine results: share of test documents whose predicted
## label differs from (FALSE) or matches (TRUE) the true label
prop.table(
  table(classification_DF[["label"]] == classification_DF[["svm"]])
)
##
## FALSE TRUE
## 0.01333333 0.98666667
#FALSE TRUE
#0.005 0.995
## Boosting results (stored in the column named "rf"; the model behind it is
## the BOOSTING model, not a random forest): share of test documents whose
## predicted label differs from (FALSE) or matches (TRUE) the true label
prop.table(
  table(classification_DF[["label"]] == classification_DF[["rf"]])
)
##
## FALSE TRUE
## 0.006666667 0.993333333
# FALSE TRUE
#0.001666667 0.998333333
## glmnet results: share of test documents whose predicted label differs
## from (FALSE) or matches (TRUE) the true label
prop.table(
  table(classification_DF[["label"]] == classification_DF[["glmnet"]])
)
##
## FALSE TRUE
## 0.02166667 0.97833333
#FALSE TRUE
#0.005 0.995
## Max-Entropy results: share of test documents whose predicted label
## differs from (FALSE) or matches (TRUE) the true label
prop.table(
  table(classification_DF[["label"]] == classification_DF[["maxent"]])
)
##
## FALSE TRUE
## 0.01 0.99
#FALSE TRUE
#0.005 0.995
Conclusion:
Identifying spam was successful, with accuracy higher than 95% for each model. This was for the easy spam files. Further work would apply the same techniques to harder, more difficult-to-identify spam files and iterate until the best results are achieved.
END