#install.packages("sos")
#install.packages("tm")
#install.packages("wordcloud")
install.packages("e1071")
## Installing package into '/home/rstudio-user/R/x86_64-pc-linux-gnu-library/4.0'
## (as 'lib' is unspecified)
library("tm")
## Loading required package: NLP
library("wordcloud")
## Loading required package: RColorBrewer
library("e1071")
# Load the tab-separated SMS file: column 1 is the label, column 2 the message.
# The messages contain unbalanced double quotes. With read.delim()'s default
# quote = "\"" this produces the "EOF within quoted string" warning and lets
# several records be swallowed into a single field, so rows are merged/lost.
# Disabling quote handling reads exactly one record per line.
# NOTE(review): the hard-coded row ranges used further down (1:2388 and
# 2389:3184) were chosen for the old, truncated row count - revisit them.
SMSSpamCollection <- read.delim(
  "/cloud/project/Data/SMSSpamCollection.txt",
  header = FALSE,
  quote = ""
)
## Warning in scan(file = file, what = what, sep = sep, quote = quote, dec = dec, :
## EOF within quoted string
# Show the structure (row count, column names and types) of the loaded data.
str(SMSSpamCollection)
## 'data.frame': 3184 obs. of 2 variables:
## $ V1: chr "ham" "ham" "spam" "ham" ...
## $ V2: chr "Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat..." "Ok lar... Joking wif u oni..." "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question("| __truncated__ "U dun say so early hor... U c already then say..." ...
#view(SMSSpamCollection)
#SMSSpamCollection$V1
# Give the two unnamed columns (V1, V2) descriptive names:
# the class label and the message body.
names(SMSSpamCollection)[1:2] <- c("type", "text")
Convert `type` to a factor with two levels: ham and spam.
# Encode the label column as a factor, then check its structure.
SMSSpamCollection[["type"]] <- as.factor(SMSSpamCollection[["type"]])
str(SMSSpamCollection[["type"]])
## Factor w/ 2 levels "ham","spam": 1 1 2 1 1 2 1 1 2 2 ...
# Count how many messages fall in each class.
table(SMSSpamCollection[["type"]])
##
## ham spam
## 2746 438
Create the corpus as a copy of the messages – `VectorSource()` treats the `text` variable as a vector.
# Build a tm corpus with one document per SMS message.
sms_corpus <- Corpus(
  VectorSource(SMSSpamCollection[["text"]])
)
# Print the corpus summary.
sms_corpus
## <<SimpleCorpus>>
## Metadata: corpus specific: 1, document level (indexed): 0
## Content: documents: 3184
View the first documents of `sms_corpus`.
# Print the first three raw messages in the corpus.
inspect(sms_corpus[1:3])
## <<SimpleCorpus>>
## Metadata: corpus specific: 1, document level (indexed): 0
## Content: documents: 3
##
## [1] Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...
## [2] Ok lar... Joking wif u oni...
## [3] Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's
`corpus_clean` is a copy of the corpus with cleaning transformations applied through the `tm_map()` function.
# Text-cleaning pipeline. Each tm_map() call applies one transformation to
# every document and the result is fed into the next step, so the order of
# the statements matters (e.g. lowercasing happens before stop-word removal).
# The "transformation drops documents" warnings below are the recorded console
# output for each step on this SimpleCorpus.
corpus_clean <- tm_map(sms_corpus, tolower) # convert all text to lowercase
## Warning in tm_map.SimpleCorpus(sms_corpus, tolower): transformation drops
## documents
corpus_clean <- tm_map(corpus_clean, removeNumbers) # remove digits
## Warning in tm_map.SimpleCorpus(corpus_clean, removeNumbers): transformation
## drops documents
corpus_clean <- tm_map(corpus_clean, removeWords, stopwords())
## Warning in tm_map.SimpleCorpus(corpus_clean, removeWords, stopwords()):
## transformation drops documents
# remove stop words (stopwords() is called with its default word list)
corpus_clean <- tm_map(corpus_clean, removePunctuation) # remove punctuation
## Warning in tm_map.SimpleCorpus(corpus_clean, removePunctuation): transformation
## drops documents
corpus_clean <- tm_map(corpus_clean, stripWhitespace) # tidy up leftover whitespace
## Warning in tm_map.SimpleCorpus(corpus_clean, stripWhitespace): transformation
## drops documents
To look at the contents of the corpus, we can use the inspect() function.
# Print the first five cleaned messages to verify the transformations.
inspect(corpus_clean[1:5])
## <<SimpleCorpus>>
## Metadata: corpus specific: 1, document level (indexed): 0
## Content: documents: 5
##
## [1] go jurong point crazy available bugis n great world la e buffet cine got amore wat
## [2] ok lar joking wif u oni
## [3] free entry wkly comp win fa cup final tkts st may text fa receive entry questionstd txt ratetcs apply s
## [4] u dun say early hor u c already say
## [5] nah think goes usf lives around though
Create the document-term matrix.
# Tokenise the cleaned corpus into a document-term matrix
# (rows = messages, columns = terms).
sms_dtm <- DocumentTermMatrix(x = corpus_clean)
# Summarise the top-left 5 x 5 corner of the matrix.
sms_dtm[seq_len(5), seq_len(5)]
## <<DocumentTermMatrix (documents: 5, terms: 5)>>
## Non-/sparse entries: 5/20
## Sparsity : 80%
## Maximal term length: 9
## Weighting : term frequency (tf)
Create the training and test sets.
# Split the raw data frame by position: the first 2388 rows are used for
# training, rows 2389 to 3184 for testing.
sms_traning <- SMSSpamCollection[seq_len(2388), ]
sms_test <- SMSSpamCollection[2389:3184, ]
Create the training and test document-term matrices.
# Split the document-term matrix with the same row ranges as the raw data
# frame and the corpus, so the three train/test objects stay aligned.
sms_dtm_train <- sms_dtm[1:2388, ]
# The test range must be 2389:3184 (colon). The previous `2389,3184,` passed
# three subscripts to a two-dimensional matrix instead of a row range.
sms_dtm_test <- sms_dtm[2389:3184, ]
Split the cleaned corpus into training and test sets.
# Split the cleaned corpus by position: documents 1 to 2388 for training,
# documents 2389 to 3184 for testing.
sms_corpus_train <- corpus_clean[seq_len(2388)]
sms_corpus_test <- corpus_clean[seq(2389, 3184)]
To confirm that the subsets are representative of the complete set of SMS data, let’s compare the proportion of spam in the training and test data frames:
# Share of ham vs. spam in the training rows.
prop.table(table(sms_traning[["type"]]))
##
## ham spam
## 0.8639028 0.1360972
# Share of ham vs. spam in the test rows.
prop.table(table(sms_test[["type"]]))
##
## ham spam
## 0.8580402 0.1419598
# Check the underlying storage type of the cleaned corpus.
typeof(corpus_clean)
## [1] "list"
# sum() cannot be applied to the corpus because it is a list:
#sum(corpus_clean) # Error in sum(corpus_clean) : invalid 'type' (list) of argument
# Print the corpus summary.
corpus_clean
## <<SimpleCorpus>>
## Metadata: corpus specific: 1, document level (indexed): 0
## Content: documents: 3184
# Word cloud of the cleaned training corpus: only words with frequency of at
# least 30, placed in non-random order.
wordcloud(
  words = sms_corpus_train,
  min.freq = 30,
  random.order = FALSE
)
# Separate the raw training messages by class.
spam <- subset(x = sms_traning, subset = type == "spam")
ham <- subset(x = sms_traning, subset = type == "ham")
# Word cloud of the raw spam messages, limited to 40 words.
wordcloud(
  words = spam[["text"]],
  max.words = 40,
  scale = c(3, 0.5)
)
## Warning in tm_map.SimpleCorpus(corpus, tm::removePunctuation): transformation
## drops documents
## Warning in tm_map.SimpleCorpus(corpus, function(x) tm::removeWords(x,
## tm::stopwords())): transformation drops documents
# Word cloud of the raw ham messages, limited to 40 words.
wordcloud(
  words = ham[["text"]],
  max.words = 40,
  scale = c(3, 0.5)
)
## Warning in tm_map.SimpleCorpus(corpus, tm::removePunctuation): transformation
## drops documents
## Warning in tm_map.SimpleCorpus(corpus, tm::removePunctuation): transformation
## drops documents
To reduce the number of features, we will eliminate any words that occur fewer than three times in the training data, or about 0.1 percent of the training records. The `findFreqTerms()` function takes a document-term matrix and returns a character vector containing the words appearing at least a specified number of times.
# Vocabulary: terms that occur at least 3 times in the training matrix.
view_find_Freq_terms <- findFreqTerms(sms_dtm_train, lowfreq = 3)
# tm no longer provides Dictionary(); the vocabulary has to be passed as the
# *named* `dictionary` entry of the control list. An unnamed list() element is
# not recognised as a dictionary, so no vocabulary filter was applied and the
# train and test matrices were not restricted to the same set of terms.
ABC_sms <- list(dictionary = view_find_Freq_terms)
# `sms_test` still holds the raw test data frame at this point and is reused
# below for the test matrix; keep its labels reachable for later evaluation.
if (is.data.frame(sms_test)) {
  sms_test_labels <- sms_test$type
}
# Rebuild the train/test matrices restricted to the shared vocabulary.
sms_train <- DocumentTermMatrix(sms_corpus_train, ABC_sms)
sms_test <- DocumentTermMatrix(sms_corpus_test, ABC_sms)
#' Turn term counts into a presence/absence factor
#'
#' @param x A vector of term counts.
#' @return A factor the same length as `x` with levels "No" (count not
#'   greater than 0) and "Yes" (count greater than 0).
convert_counts <- function(x) {
  factor(
    ifelse(x > 0, 1, 0),
    levels = c(0, 1),
    labels = c("No", "Yes")
  )
}
# Convert every term column of the document-term matrices into a "No"/"Yes"
# indicator. The training input must be the term matrix `sms_train`; the
# previous code passed the raw data frame `sms_traning` (label and text
# columns), so the classifier was not trained on term features at all.
sms_train_new <- apply(sms_train, MARGIN = 2, convert_counts)
sms_test_new <- apply(sms_test, MARGIN = 2, convert_counts)
# Fit Naive Bayes on the indicator features, using the training labels.
sms_classifier <- naiveBayes(sms_train_new, sms_traning$type)