I use data from the [UCI Machine Learning Repository](http://archive.ics.uci.edu/ml/).
library(tidyverse)
getwd()
[1] "C:/Users/edward cooper/Google Drive/Learn R"
sms=read.csv("sms_spam.csv",stringsAsFactors = FALSE)
str(sms)
'data.frame': 5574 obs. of 2 variables:
$ type: chr "ham" "ham" "spam" "ham" ...
$ text: chr "Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat..." "Ok lar... Joking wif u oni..." "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C"| __truncated__ "U dun say so early hor... U c already then say..." ...
sms$type=sms$type%>%factor
str(sms)
'data.frame': 5574 obs. of 2 variables:
$ type: Factor w/ 2 levels "ham","spam": 1 1 2 1 1 2 1 1 2 2 ...
$ text: chr "Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat..." "Ok lar... Joking wif u oni..." "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C"| __truncated__ "U dun say so early hor... U c already then say..." ...
The first step in processing text data is to create a corpus, a collection of text documents. I am most interested in how we could do this with the recently developed tidytext package.
library(tidytext)
sms_tidy=sms%>%unnest_tokens(output=word,input=text,token="words")%>%anti_join(stop_words,by="word")
sms_tidy%>%head(20)
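For comparison, the classic workflow would build an explicit corpus object with the tm package. A rough sketch of my own, not from the original analysis, doing roughly the same cleanup as unnest_tokens plus anti_join:
library(tm)
# build a corpus: one document per SMS message
sms_corpus=VCorpus(VectorSource(sms$text))
# standard cleanup: lowercase, strip punctuation, drop stop words
sms_corpus=tm_map(sms_corpus,content_transformer(tolower))
sms_corpus=tm_map(sms_corpus,removePunctuation)
sms_corpus=tm_map(sms_corpus,removeWords,stopwords("en"))
# the classic representation: a sparse document-term matrix
sms_dtm=DocumentTermMatrix(sms_corpus)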
#install.packages("wordcloud")
library(wordcloud)
sms_sum=sms_tidy%>%group_by(word)%>%summarise(n=n())%>%arrange(desc(n))
sms_sum%>%head
wordcloud(words=sms_sum$word,freq=sms_sum$n,min.freq = 50,random.order = FALSE,random.color = FALSE)
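Because unnest_tokens carries the type column along, it is also easy to draw one cloud per class and eyeball how the two vocabularies differ. This is my own addition, not in the original analysis:
# spam words only; min.freq lowered since spam is the smaller class
sms_tidy%>%filter(type=="spam")%>%count(word,sort=TRUE)%>%with(wordcloud(word,n,min.freq=10,random.order=FALSE))
sms_tidy%>%filter(type=="ham")%>%count(word,sort=TRUE)%>%with(wordcloud(word,n,min.freq=50,random.order=FALSE))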
# create a similar document-term matrix by hand, one column per frequent word
# keep only words appearing more than five times; reuses the counts from above
# (renamed from names to avoid shadowing base::names)
top_words=sms_sum%>%filter(n>5)
col_dim=dim(top_words)[1]
row_dim=dim(sms)[1]
library(foreach)
library(stringr)
DTM=foreach(i=1:col_dim,.combine = cbind)%do%{
ifelse(str_detect(sms$text,top_words$word[i]),"yes","no")%>%factor()
}
colnames(DTM)=top_words$word
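One caveat: str_detect treats each word as a case-sensitive regular expression, so "call" also fires inside "recall" but misses "Call". A stricter variant of my own lowercases the text and anchors on word boundaries:
# match whole lowercase words only
DTM_strict=foreach(i=1:col_dim,.combine = cbind)%do%{
ifelse(str_detect(str_to_lower(sms$text),paste0("\\b",top_words$word[i],"\\b")),"yes","no")%>%factor()
}
Tokens containing regex metacharacters would still need escaping before being pasted into the pattern.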
summary(DTM[,1:7])
call 2 ur â gt 4
Min. :1.000 Min. :1.000 Min. :1.00 Min. :1.000 Min. :1.000 Min. :1.00
1st Qu.:1.000 1st Qu.:1.000 1st Qu.:1.00 1st Qu.:1.000 1st Qu.:1.000 1st Qu.:1.00
Median :1.000 Median :1.000 Median :1.00 Median :1.000 Median :1.000 Median :1.00
Mean :1.084 Mean :1.159 Mean :1.25 Mean :1.009 Mean :1.052 Mean :1.12
3rd Qu.:1.000 3rd Qu.:1.000 3rd Qu.:1.00 3rd Qu.:1.000 3rd Qu.:1.000 3rd Qu.:1.00
Max. :2.000 Max. :2.000 Max. :2.00 Max. :2.000 Max. :2.000 Max. :2.00
lt
Min. :1.00
1st Qu.:1.00
Median :1.00
Mean :1.06
3rd Qu.:1.00
Max. :2.00
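Note the numeric summaries: every column ranges over 1 and 2 because cbind coerced the yes/no factors back to their underlying integer codes, so a naive Bayes fit would treat these as numeric predictors rather than categorical ones. To keep genuine factor columns, the matrix can be converted back; a sketch of mine, assuming code 1 = "no" and 2 = "yes" (alphabetical factor order):
# rebuild yes/no factors from the integer codes
DTM_df=as.data.frame(DTM)
DTM_df[]=lapply(DTM_df,factor,levels=1:2,labels=c("no","yes"))
colnames(DTM_df)=top_words$word   # restore the word names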
library(caret)
# stratify the 80/20 split on the class labels, not on a predictor column
index=createDataPartition(sms$type,p=0.8,list=FALSE)
sms_train=DTM[index,]
sms_test=DTM[-index,]
sms_train_cl=sms$type[index]
sms_test_cl=sms$type[-index]
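The fitting step is missing from the original post; given the threshold argument passed to predict below, sms_cl is presumably an e1071 naive Bayes model, fit along these lines:
library(e1071)
# train naive Bayes on the document-term features and the class labels
sms_cl=naiveBayes(sms_train,sms_train_cl)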
sms_pred=predict(sms_cl,sms_test,type="class",threshold = 0.1)
confusionMatrix(sms_pred, sms_test_cl)
Confusion Matrix and Statistics
Reference
Prediction ham spam
ham 902 38
spam 53 121
Accuracy : 0.9183
95% CI : (0.9007, 0.9337)
No Information Rate : 0.8573
P-Value [Acc > NIR] : 2.881e-10
Kappa : 0.6788
Mcnemar's Test P-Value : 0.1422
Sensitivity : 0.9445
Specificity : 0.7610
Pos Pred Value : 0.9596
Neg Pred Value : 0.6954
Prevalence : 0.8573
Detection Rate : 0.8097
Detection Prevalence : 0.8438
Balanced Accuracy : 0.8528
'Positive' Class : ham
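A specificity of 0.76 means roughly a quarter of spam still slips through as ham. One standard knob to try, assuming the e1071 fit above, is Laplace smoothing, which keeps rare words from zeroing out a class posterior; whether it actually helps here would need to be checked:
# refit with Laplace smoothing and compare
sms_cl2=naiveBayes(sms_train,sms_train_cl,laplace=1)
sms_pred2=predict(sms_cl2,sms_test,type="class")
confusionMatrix(sms_pred2,sms_test_cl)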