It is often useful to classify new “test” documents using a set of already classified “training” documents. A common example is using a corpus of labeled spam and ham (non-spam) e-mails to predict whether or not a new document is spam.
For this project, you can start with a spam/ham dataset, then predict the class of new documents (either withheld from the training dataset or from another source such as your own spam folder). One example corpus: https://spamassassin.apache.org/publiccorpus/
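If you prefer to pull the corpus straight from SpamAssassin rather than the GitHub mirror used below, a minimal sketch in base R follows. The archive file names are assumptions, so check the publiccorpus index for the exact names before running it.
#Minimal sketch: download and unpack the SpamAssassin corpus locally.
#The archive names below are assumptions; check the publiccorpus index page.
corpus_base <- "https://spamassassin.apache.org/publiccorpus/"
ham_archive <- "20021010_easy_ham.tar.bz2"   #assumed file name
spam_archive <- "20050311_spam_2.tar.bz2"    #assumed file name
download.file(paste0(corpus_base, ham_archive), ham_archive, mode = "wb")
download.file(paste0(corpus_base, spam_archive), spam_archive, mode = "wb")
untar(ham_archive)   #extracts an easy_ham/ directory
untar(spam_archive)  #extracts a spam_2/ directory
ham_files <- list.files("easy_ham", full.names = TRUE)
spam_files <- list.files("spam_2", full.names = TRUE)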
library(RCurl)
library(tm)
library(XML)
library(tidytext)
library(dplyr)
library(stringr)
library(wordcloud)
library(ggplot2)
library(rpart)
#Get the 2500 easy_ham URLs from the cmds file on GitHub
ham_list<-"https://raw.githubusercontent.com/czhu505/W10_607/master/easy_ham/cmds"
ham<- readLines(ham_list)
easy_ham<- sapply(strsplit(ham," "),"[[",2)
ham_path <- "https://github.com/czhu505/W10_607/tree/master/easy_ham/"
#create an empty vector to store the ham URLs
ham_url=c()
for(i in 1:length(easy_ham)){
ham_url[i] <- paste0(ham_path,easy_ham[i])
}
#Observe the ham folder
head(ham_url,3)
## [1] "https://github.com/czhu505/W10_607/tree/master/easy_ham/00001.7c53336b37003a9286aba55d2945844c"
## [2] "https://github.com/czhu505/W10_607/tree/master/easy_ham/00002.9c4069e25e1ef370c078db7ee85ff9ac"
## [3] "https://github.com/czhu505/W10_607/tree/master/easy_ham/00003.860e3c3cee1b42ead714c5c874fe25f7"
#Get the 1400 spam_2 URLs from the cmds file on GitHub
spam_list<-"https://raw.githubusercontent.com/czhu505/W10_607/master/spam_2/cmds"
spam<- readLines(spam_list)
spam_2<- sapply(strsplit(spam," "),"[[",2)
#correct a mistaken address in the cmds file
spam_2[7]<-"00007.acefeee792b5298f8fee175f9f65c453"
spam_path<- "https://raw.githubusercontent.com/czhu505/W10_607/master/spam_2/"
#create an empty vector to store the spam URLs
spam_url=c()
for(i in 1:length(spam_2)){
spam_url[i] <- paste0(spam_path,spam_2[i])
}
#Observe the spam folder
head(spam_url,3)
## [1] "https://raw.githubusercontent.com/czhu505/W10_607/master/spam_2/00001.317e78fa8ee2f54cd4890fdc09ba8176"
## [2] "https://raw.githubusercontent.com/czhu505/W10_607/master/spam_2/00002.9438920e9a55591b18e60d1ed37d992b"
## [3] "https://raw.githubusercontent.com/czhu505/W10_607/master/spam_2/00003.590eff932f8704d8b0fcbe69d023b54d"
#create an empty vector for the ham data set
ham.data=c()
ham=c()
#read in the first 50 ham files one by one and store them in the ham data set
for(i in 1:50) {
links <- readLines(ham_url[i])
#collapse each file into a single string
ham<- paste(links, collapse = ' ')
ham.text<-data.frame(Spam=0,text=ham,stringsAsFactors = FALSE)
#append this file to the ham data set, one row per file
ham.data<- rbind(ham.data,ham.text)
}
#create an empty vector for the spam data set
spam.data=c()
spam=c()
#read in the first 50 spam files one by one and store them in the spam data set
for(i in 1:50) {
links <- readLines(spam_url[i])
#collapse each file into a single string
spam <- paste(links, collapse = ' ')
spam.text<-data.frame(Spam=1,text=spam,stringsAsFactors = FALSE)
#append this file to the spam data set, one row per file
spam.data<- rbind(spam.data,spam.text)
}
#build a tm corpus from a character vector and apply basic cleaning
cleanCorpus<-function(text){
corpus <-Corpus(VectorSource(text))
corpus.tmp<-tm_map(corpus,removePunctuation)
corpus.tmp<-tm_map(corpus.tmp,stripWhitespace)
corpus.tmp<-tm_map(corpus.tmp,content_transformer(tolower))
corpus.tmp<-tm_map(corpus.tmp,removeWords,stopwords("english"))
return(corpus.tmp)
}
ham.corpus<-cleanCorpus(ham.data$text)
ham.dtm = DocumentTermMatrix(ham.corpus)
ham.dtm
## <<DocumentTermMatrix (documents: 50, terms: 7063)>>
## Non-/sparse entries: 71033/282117
## Sparsity : 80%
## Maximal term length: 261
## Weighting : term frequency (tf)
ham.spdtm = removeSparseTerms(ham.dtm, 0.95)
ham.spdtm
## <<DocumentTermMatrix (documents: 50, terms: 2047)>>
## Non-/sparse entries: 65362/36988
## Sparsity : 36%
## Maximal term length: 114
## Weighting : term frequency (tf)
spam.corpus<-cleanCorpus(spam.data$text)
spam.dtm = DocumentTermMatrix(spam.corpus)
spam.dtm
## <<DocumentTermMatrix (documents: 50, terms: 8091)>>
## Non-/sparse entries: 14497/390053
## Sparsity : 96%
## Maximal term length: 251
## Weighting : term frequency (tf)
spam.spdtm = removeSparseTerms(spam.dtm, 0.95)
spam.spdtm
## <<DocumentTermMatrix (documents: 50, terms: 934)>>
## Non-/sparse entries: 6381/40319
## Sparsity : 86%
## Maximal term length: 38
## Weighting : term frequency (tf)
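Before plotting, the most common terms in each sparse matrix can be listed directly with tm's findFreqTerms(); the frequency threshold of 100 used below is arbitrary.
#terms appearing at least 100 times across the 50 documents (threshold is arbitrary)
findFreqTerms(ham.spdtm, lowfreq = 100)
findFreqTerms(spam.spdtm, lowfreq = 100)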
#helper to draw a word cloud from a data frame of words and frequencies
makeWordcloud <- function(content)
{
wordcloud(words = content$word, freq = content$freq, min.freq = 20,
max.words=100, random.order=FALSE, rot.per=0.35,scale=c(2.5, .01),
colors=brewer.pal(8, "Dark2"))
}
ham.Sparse = as.data.frame(as.matrix(ham.spdtm))
colnames(ham.Sparse) = make.names(colnames(ham.Sparse))
ham.freq<-sort(colSums(ham.Sparse), decreasing=TRUE)
head(ham.freq,30)
## div blobcodeinner
## 5120 4133
## classblobcode classblobnum
## 4133 4133
## jslinenumber meta
## 4133 2300
## jsfileline span
## 1456 1355
## svg ariahiddentrue
## 1050 1000
## classocticon fillruleevenodd
## 1000 1000
## version11 viewbox0
## 1000 1000
## height16 link
## 945 852
## button X1993
## 760 550
## datagaclickfooter classjsselectednavigationitem
## 550 500
## aug btnsm
## 484 450
## classmr3a nav
## 450 450
## X2002 jsfilelinereceived
## 431 361
## typebutton repository
## 355 351
## signed reldnsprefetch
## 351 350
ham.df<-data.frame(word = names(ham.freq),freq=ham.freq)
spam.Sparse = as.data.frame(as.matrix(spam.spdtm))
colnames(spam.Sparse) = make.names(colnames(spam.Sparse))
spam.freq<-sort(colSums(spam.Sparse),decreasing=TRUE)
spam.df<-data.frame(word = names(spam.freq),freq=spam.freq)
The wordclouds show a clear difference between the two classes: in the ham set a handful of terms have very high frequency while the rest are comparatively rare, whereas in the spam set the frequencies are spread much more evenly. (The dominant ham terms such as div and blobcodeinner appear to be GitHub page markup rather than e-mail text, a consequence of ham_url pointing at GitHub's HTML tree view rather than the raw files.)
#par(mfrow = c(1, 2))
makeWordcloud(ham.df)
makeWordcloud(spam.df)
#bar chart of the ten most frequent words, shaded by count
createBarChart <- function(d, xCol, yCol, title)
{
ggplot(d[1:10,], aes(x = reorder(get(xCol),-get(yCol)), y = get(yCol))) +
geom_bar(stat = "identity") +
geom_bar(stat = "identity", aes(fill = get(yCol))) +
scale_fill_gradient(low = "pink", high = "brown") +
theme_bw() +
ylab("Count") +
xlab("Word") +
ggtitle(title) +
theme(
plot.title = element_text(size = rel(1.5), color = "black", face = "bold", hjust = 0.5),
plot.margin = unit(c(.5,1,1,1), "cm"),
axis.title.x = element_text(color="black", size=rel(1.3), face="bold"),
axis.title.y = element_text(color="black", size=rel(1.3), face="bold"),
axis.text.x = element_text(color="black", size=10, angle = 45, hjust = 1),
axis.text.y = element_text(color="black", size=10),
legend.title=element_blank(),
legend.position="none")
}
#par(mfrow = c(1, 2))
createBarChart(ham.df,"word","freq","Top Ten Ham Words")
createBarChart(spam.df,"word","freq","Top Ten Spam Words")
#ham.freq["spam"] <-0
#spam.freq["spam"] <-1
#ham.freq= data.frame(ham.freq)
#colnames(ham.freq)<-c("freq","spam")
#spam.freq= data.frame(spam.freq)
#colnames(spam.freq)<-c("freq","spam")
#alldata<-dplyr::bind_rows(ham.freq, spam.freq)
#set.seed(50)
#spl = sample.split(alldata$spam, 0.7)
#train = subset(alldata, spl == TRUE)
#test = subset(alldata, spl == FALSE)
#alg= glm(spam~freq, data=train, family="binomial")
#summary(alg)
#Test.pred= predict(alg, newdata=test, type="response")
#table(test$spam, Test.pred > 0.5)
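The commented-out block above sketches a 70/30 split and a logistic regression on word-level counts, but it models individual words rather than documents and relies on sample.split() from caTools, which is not loaded here. Below is a minimal document-level sketch, not the original model: it rebuilds one document-term matrix from the combined ham and spam rows and fits a classification tree with rpart (loaded above but unused so far). All object names in it (all.data, all.df, train.idx, tree.fit, test.pred) are new names introduced for this sketch.
#Combine the 50 ham and 50 spam documents into one labeled data frame
all.data <- rbind(ham.data, spam.data)
all.corpus <- cleanCorpus(all.data$text)
all.dtm <- removeSparseTerms(DocumentTermMatrix(all.corpus), 0.95)
all.df <- as.data.frame(as.matrix(all.dtm))
colnames(all.df) <- make.names(colnames(all.df))
all.df$Spam <- factor(all.data$Spam)
#70/30 train/test split with base R, avoiding the caTools dependency
set.seed(50)
train.idx <- sample(nrow(all.df), round(0.7 * nrow(all.df)))
train <- all.df[train.idx, ]
test <- all.df[-train.idx, ]
#Fit a classification tree and evaluate it on the held-out documents
tree.fit <- rpart(Spam ~ ., data = train, method = "class")
test.pred <- predict(tree.fit, newdata = test, type = "class")
table(actual = test$Spam, predicted = test.pred)
A logistic regression in the spirit of the commented-out glm() call could be swapped in, but with one predictor per term it would need some form of feature selection or regularization; the tree handles the wide matrix directly.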