library(tidyverse)
Loading tidyverse: ggplot2
Loading tidyverse: tibble
Loading tidyverse: tidyr
Loading tidyverse: readr
Loading tidyverse: purrr
Loading tidyverse: dplyr
Conflicts with tidy packages ------------------------------------------------------------------------------------
filter(): dplyr, stats
lag(): dplyr, stats
# read the raw SMS data; keep the text as character rather than factor
sms = read.csv("sms_spam.txt", stringsAsFactors = FALSE)
str(sms)
'data.frame': 5574 obs. of 2 variables:
$ type: chr "ham" "ham" "spam" "ham" ...
$ text: chr "Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat..." "Ok lar... Joking wif u oni..." "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C"| __truncated__ "U dun say so early hor... U c already then say..." ...
# convert the class label to a factor
sms$type = sms$type %>% factor()
str(sms)
'data.frame': 5574 obs. of 2 variables:
$ type: Factor w/ 2 levels "ham","spam": 1 1 2 1 1 2 1 1 2 2 ...
$ text: chr "Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat..." "Ok lar... Joking wif u oni..." "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C"| __truncated__ "U dun say so early hor... U c already then say..." ...
The first step in processing text data is to create a corpus, which is a collection of text documents.
Instead of following the steps in Machine Learning with R, we are more interested in how to do this with the recently developed tidytext package.
library(tidytext)
# tokenize each message into words and drop English stop words
sms_tidy = sms %>%
  unnest_tokens(output = word, input = text, token = "words") %>%
  anti_join(stop_words, by = "word")
sms_tidy %>% head(20)
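As an aside, the tidy counts can also be cast directly into a document-term matrix with tidytext's cast_dtm(). This is only a sketch: it assumes the tm package is installed and adds a row id so that each message counts as its own document.
# count words per message and cast to a DocumentTermMatrix (requires the tm package)
sms_dtm = sms %>%
  mutate(doc = row_number()) %>%
  unnest_tokens(word, text) %>%
  anti_join(stop_words, by = "word") %>%
  count(doc, word) %>%
  cast_dtm(document = doc, term = word, value = n)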
# build a similar document-term-style matrix by hand: keep the words that appear more than 5 times
word_counts = sms_tidy %>%
  count(word, sort = TRUE) %>%
  filter(n > 5)
col_dim = dim(word_counts)[1]
row_dim = dim(sms)[1]
library(foreach)
library(stringr)
# flag, for every message, whether each frequent word occurs in it
# (note: str_detect() matches substrings and is case-sensitive)
DTM = foreach(i = 1:col_dim, .combine = cbind) %do% {
  ifelse(str_detect(sms$text, word_counts$word[i]), "yes", "no") %>% factor()
}
colnames(DTM) = word_counts$word
summary(DTM[,1:7])
      call             2               ur             â               gt              4             lt
 Min.   :1.000   Min.   :1.000   Min.   :1.00   Min.   :1.000   Min.   :1.000   Min.   :1.00   Min.   :1.00
 1st Qu.:1.000   1st Qu.:1.000   1st Qu.:1.00   1st Qu.:1.000   1st Qu.:1.000   1st Qu.:1.00   1st Qu.:1.00
 Median :1.000   Median :1.000   Median :1.00   Median :1.000   Median :1.000   Median :1.00   Median :1.00
 Mean   :1.084   Mean   :1.159   Mean   :1.25   Mean   :1.009   Mean   :1.052   Mean   :1.12   Mean   :1.06
 3rd Qu.:1.000   3rd Qu.:1.000   3rd Qu.:1.00   3rd Qu.:1.000   3rd Qu.:1.000   3rd Qu.:1.00   3rd Qu.:1.00
 Max.   :2.000   Max.   :2.000   Max.   :2.00   Max.   :2.000   Max.   :2.000   Max.   :2.00   Max.   :2.00
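Note that the summary reports numeric statistics because cbind() inside foreach collapsed the factors to their integer codes (1 = "no", 2 = "yes"). If the classifier fit later is e1071's naiveBayes(), which treats numeric columns as Gaussian, a sketch for turning the matrix back into factor columns would be:
DTM_df = as.data.frame(DTM)
# relabel the integer codes with the original "no"/"yes" levels
DTM_df[] = lapply(DTM_df, factor, levels = c(1, 2), labels = c("no", "yes"))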
library(caret)
# stratified 80/20 train/test split on the class label
index = createDataPartition(sms$type, p = 0.8, list = FALSE)
sms_train = DTM[index, ]
sms_test = DTM[-index, ]
sms_train_cl = sms$type[index]
sms_test_cl = sms$type[-index]
DTM %>% dim
[1] 5574 1255
sms_train %>% dim
[1] 4460 1255
sms_test %>% dim
[1] 1114 1255
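The model-fitting step itself is missing here. Since predict() below takes type = "class" and a threshold argument, the classifier is presumably e1071's naiveBayes(), so a plausible (assumed) training call would be:
library(e1071)
# fit a naive Bayes classifier on the training features and labels; the laplace value is an assumption
sms_cl = naiveBayes(sms_train, sms_train_cl, laplace = 1)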
# predict the class of each held-out message
sms_pred = predict(sms_cl, sms_test, type = "class", threshold = 0.35)
table(sms_pred, sms_test_cl)
        sms_test_cl
sms_pred  ham spam
    ham   953   63
    spam   20   78
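From the table, about (953 + 78) / 1114 ≈ 92.5% of the test messages are classified correctly; 63 spam messages slip through as ham and 20 ham messages are flagged as spam. caret's confusionMatrix() reports the same table along with accuracy, kappa, sensitivity, and specificity:
# treat "spam" as the positive class when computing sensitivity/specificity
confusionMatrix(sms_pred, sms_test_cl, positive = "spam")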