#install.packages("sos")
#install.packages("tm")
#install.packages("wordcloud")
install.packages("e1071")
## Installing package into '/home/rstudio-user/R/x86_64-pc-linux-gnu-library/4.0'
## (as 'lib' is unspecified)
library("tm")
## Loading required package: NLP
library("wordcloud")
## Loading required package: RColorBrewer
library("e1071")
SMSSpamCollection <- read.delim("/cloud/project/Data/SMSSpamCollection.txt", header=FALSE)
## Warning in scan(file = file, what = what, sep = sep, quote = quote, dec = dec, :
## EOF within quoted string
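# The warning above means read.delim() stopped at an unmatched quotation mark
# inside a message, so the file was not read all the way to the end. A sketch
# of one way to avoid this (not run here) is to turn off quote handling:
#SMSSpamCollection <- read.delim("/cloud/project/Data/SMSSpamCollection.txt",
#                                header = FALSE, quote = "", stringsAsFactors = FALSE)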
str(SMSSpamCollection)
## 'data.frame':    3184 obs. of  2 variables:
##  $ V1: chr  "ham" "ham" "spam" "ham" ...
##  $ V2: chr  "Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat..." "Ok lar... Joking wif u oni..." "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question("| __truncated__ "U dun say so early hor... U c already then say..." ...
#View(SMSSpamCollection)
#SMSSpamCollection$V1 
names(SMSSpamCollection)[1] <- "type"
names(SMSSpamCollection)[2] <- "text"

Convert type into a factor with two levels, ham and spam.

SMSSpamCollection$type <- as.factor(SMSSpamCollection$type)
str(SMSSpamCollection$type)
##  Factor w/ 2 levels "ham","spam": 1 1 2 1 1 2 1 1 2 2 ...
table(SMSSpamCollection$type) # count how many messages fall in each class
## 
##  ham spam 
## 2746  438

Build a corpus: VectorSource() treats the text column as a vector of documents.

sms_corpus <- Corpus(VectorSource(SMSSpamCollection$text))
sms_corpus # view a summary of the corpus
## <<SimpleCorpus>>
## Metadata:  corpus specific: 1, document level (indexed): 0
## Content:  documents: 3184

Inspect the first few documents of sms_corpus.

inspect(sms_corpus[1:3])
## <<SimpleCorpus>>
## Metadata:  corpus specific: 1, document level (indexed): 0
## Content:  documents: 3
## 
## [1] Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...                                            
## [2] Ok lar... Joking wif u oni...                                                                                                                              
## [3] Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's

corpus_clean is a cleaned copy of the corpus, produced by applying transformations with the tm_map() function.

corpus_clean <- tm_map(sms_corpus, tolower) # transformation to lowercase 
## Warning in tm_map.SimpleCorpus(sms_corpus, tolower): transformation drops
## documents
corpus_clean <- tm_map(corpus_clean, removeNumbers) # remove numbers
## Warning in tm_map.SimpleCorpus(corpus_clean, removeNumbers): transformation
## drops documents
corpus_clean <- tm_map(corpus_clean, removeWords, stopwords()) # remove stop words
## Warning in tm_map.SimpleCorpus(corpus_clean, removeWords, stopwords()):
## transformation drops documents
corpus_clean <- tm_map(corpus_clean, removePunctuation) # remove punctuation
## Warning in tm_map.SimpleCorpus(corpus_clean, removePunctuation): transformation
## drops documents
corpus_clean <- tm_map(corpus_clean, stripWhitespace) # collapse extra whitespace
## Warning in tm_map.SimpleCorpus(corpus_clean, stripWhitespace): transformation
## drops documents
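
The "transformation drops documents" warnings come from running tm_map() on the default SimpleCorpus; here they are harmless, and the inspect() output below confirms the documents are still present. A commonly suggested alternative (a sketch, assuming the same text vector) is to build a VCorpus and wrap base functions such as tolower in content_transformer():

#sms_vcorpus <- VCorpus(VectorSource(SMSSpamCollection$text))
#corpus_clean_v <- tm_map(sms_vcorpus, content_transformer(tolower))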

To look at the contents of the corpus, we can use the inspect() function.

inspect(corpus_clean[1:5])
## <<SimpleCorpus>>
## Metadata:  corpus specific: 1, document level (indexed): 0
## Content:  documents: 5
## 
## [1] go jurong point crazy available bugis n great world la e buffet cine got amore wat                     
## [2] ok lar joking wif u oni                                                                                
## [3] free entry wkly comp win fa cup final tkts st may text fa receive entry questionstd txt ratetcs apply s
## [4] u dun say early hor u c already say                                                                    
## [5] nah think goes usf lives around though

Create a document-term matrix.

sms_dtm <- DocumentTermMatrix(corpus_clean)
sms_dtm[1:5,1:5]
## <<DocumentTermMatrix (documents: 5, terms: 5)>>
## Non-/sparse entries: 5/20
## Sparsity           : 80%
## Maximal term length: 9
## Weighting          : term frequency (tf)

Create the training and test data frames.

sms_traning <- SMSSpamCollection[1:2388,]
sms_test <- SMSSpamCollection[2389:3184,]

Create the training and test document-term matrices.

sms_dtm_train <- sms_dtm[1:2388,]
sms_dtm_test <- sms_dtm[2389:3184,]

Create the training and test corpora.

sms_corpus_train <- corpus_clean[1:2388]
sms_corpus_test <- corpus_clean[2389:3184]

To confirm that the subsets are representative of the complete set of SMS data, let’s compare the proportion of spam in the training and test data frames:

prop.table(table(sms_traning$type))
## 
##       ham      spam 
## 0.8639028 0.1360972
prop.table(table(sms_test$type))
## 
##       ham      spam 
## 0.8580402 0.1419598
typeof(corpus_clean)
## [1] "list"
#sum(corpus_clean) # Error in sum(corpus_clean) : invalid 'type' (list) of argument
corpus_clean
## <<SimpleCorpus>>
## Metadata:  corpus specific: 1, document level (indexed): 0
## Content:  documents: 3184
wordcloud(sms_corpus_train, min.freq = 30, random.order = FALSE)

spam <- subset(sms_traning, type == "spam")
ham <- subset(sms_traning, type == "ham")
wordcloud(spam$text, max.words = 40, scale = c(3, 0.5))
## Warning in tm_map.SimpleCorpus(corpus, tm::removePunctuation): transformation
## drops documents
## Warning in tm_map.SimpleCorpus(corpus, function(x) tm::removeWords(x,
## tm::stopwords())): transformation drops documents

wordcloud(ham$text, max.words = 40, scale = c(3, 0.5))
## Warning in tm_map.SimpleCorpus(corpus, tm::removePunctuation): transformation
## drops documents

## Warning in tm_map.SimpleCorpus(corpus, tm::removePunctuation): transformation
## drops documents

To reduce the number of features, we will eliminate any words that appear in fewer than three SMS messages, or in roughly 0.1 percent of the training records. The findFreqTerms() function takes a document-term matrix and returns a character vector containing the words that appear at least the specified number of times.

view_find_Freq_terms <- findFreqTerms(sms_dtm_train, 3)
ABC_sms <- list(dictionary = view_find_Freq_terms) # a control list with a dictionary entry replaces the older Dictionary()
sms_train <- DocumentTermMatrix(sms_corpus_train, ABC_sms)
sms_test <- DocumentTermMatrix(sms_corpus_test, ABC_sms)
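# An alternative to the dictionary control above (a sketch that should give
# equivalent matrices) is to keep only the frequent-term columns of the full
# train and test document-term matrices:
#sms_train <- sms_dtm_train[, view_find_Freq_terms]
#sms_test <- sms_dtm_test[, view_find_Freq_terms]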
# convert word counts into a two-level factor: "Yes" if the word appears, "No" otherwise
convert_counts <- function(x) {
 x <- ifelse(x > 0, 1, 0)
 x <- factor(x, levels = c(0, 1), labels = c("No", "Yes"))
 return(x)
}
sms_train_new <- apply(sms_train, MARGIN = 2, convert_counts)
sms_test_new <- apply(sms_test, MARGIN = 2, convert_counts)
sms_classifier <- naiveBayes(sms_train_new, sms_traning$type) # train the naive Bayes classifier
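
The classifier has been trained but not yet evaluated. A minimal sketch of how it could be applied to the held-out messages, assuming the objects created above (the true test labels are taken from SMSSpamCollection because sms_test now holds the document-term matrix):

sms_test_pred <- predict(sms_classifier, sms_test_new)
table(predicted = sms_test_pred, actual = SMSSpamCollection$type[2389:3184])
# naiveBayes() also accepts laplace = 1 for Laplace smoothing, so that words
# never seen in one class do not force a zero probability:
#sms_classifier2 <- naiveBayes(sms_train_new, sms_traning$type, laplace = 1)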