Step 1-Collecting Data

To develop the Naive Bayes classifier, we will use data adapted from the SMS Spam Collection at http://www.dt.fee.unicamp.br/~tiago/smsspamcollection/.

This dataset includes the text of SMS messages along with a label indicating whether the message is unwanted. Junk messages are labeled spam, while legitimate messages are labeled ham.

Step 2-Exploring and preparing the data

# read the sms data into the sms data frame
sms_raw <- read.csv("sms_spam.csv", stringsAsFactors = FALSE)
# examine the structure of the sms data
str(sms_raw)
'data.frame':   5559 obs. of  2 variables:
 $ type: Factor w/ 2 levels "ham","spam": 1 1 1 2 2 1 1 1 2 1 ...
 $ text: chr  "Hope you are having a good week. Just checking in" "K..give back my thanks." "Am also doing in cbe only. But have to pay." "complimentary 4 STAR Ibiza Holiday or £10,000 cash needs your URGENT collection. 09066364349 NOW from Landline not to lose out!"| __truncated__ ...
# convert spam/ham to factor. Factor converts charactor vector to numerical like 0,1.
sms_raw$type <- factor(sms_raw$type)
# build a corpus using the text mining (tm) package
library(tm)
sms_corpus <- VCorpus(VectorSource(sms_raw$text))
# vectorSource takes apart email,it's a reader function.Vcourpus put into a nice structure.Vcorpus stores in memory as oppose to Pcorpus, stored in Disk
# Corpus is a collection of text documents
# examine the sms corpus
print(sms_corpus)
<<VCorpus>>
Metadata:  corpus specific: 0, document level (indexed): 0
Content:  documents: 5559
inspect(sms_corpus[1:2]) # summary of the 1st and 2nd SMS message in the corpus
<<VCorpus>>
Metadata:  corpus specific: 0, document level (indexed): 0
Content:  documents: 2

[[1]]
<<PlainTextDocument>>
Metadata:  7
Content:  chars: 49

[[2]]
<<PlainTextDocument>>
Metadata:  7
Content:  chars: 23
as.character(sms_corpus[[1]])
[1] "Hope you are having a good week. Just checking in"
lapply(sms_corpus[1:2], as.character) # lapply apply a procedure to each element in the data sturture
$`1`
[1] "Hope you are having a good week. Just checking in"

$`2`
[1] "K..give back my thanks."
# clean up the corpus using tm_map()
sms_corpus_clean <- tm_map(sms_corpus, content_transformer(tolower)) #tolower : transform all letters to lower case 
# show the difference between sms_corpus and corpus_clean
as.character(sms_corpus[[1]]) # as.charactor to view the actual text 
[1] "Hope you are having a good week. Just checking in"
as.character(sms_corpus_clean[[1]])
[1] "hope you are having a good week. just checking in"
sms_corpus_clean <- tm_map(sms_corpus_clean, removeNumbers)
# remove numbers
sms_corpus_clean <- tm_map(sms_corpus_clean, removeWords, stopwords()) # remove stop words like and, or the
sms_corpus_clean <- tm_map(sms_corpus_clean, removePunctuation) # remove punctuation
# tip: create a custom function to replace (rather than remove) punctuation
removePunctuation("hello...world")
[1] "helloworld"
replacePunctuation <- function(x) { gsub("[[:punct:]]+", " ", x) } # replace the punctuations in x with a space rather than remove
replacePunctuation("hello...world")
[1] "hello world"
# illustration of word stemming
library(SnowballC)  # snowwball put words into stems: eg: left to leave
wordStem(c("learn", "learned", "learning", "learns"))
[1] "learn" "learn" "learn" "learn"
sms_corpus_clean <- tm_map(sms_corpus_clean, stemDocument)
sms_corpus_clean <- tm_map(sms_corpus_clean, stripWhitespace) # eliminate unneeded whitespace
# create a document-term sparse matrix: DTM:rows indicates text messgaes and columns indicate words
sms_dtm <- DocumentTermMatrix(sms_corpus_clean) 
# alternative solution: create a document-term sparse matrix directly from the SMS corpus # NICE:do all the data cleaning in one step.
sms_dtm2 <- DocumentTermMatrix(sms_corpus, control = list(
  tolower = TRUE,
  removeNumbers = TRUE,
  stopwords = TRUE,
  removePunctuation = TRUE,
  stemming = TRUE
))
# alternative solution: using custom stop words function ensures identical result
sms_dtm3 <- DocumentTermMatrix(sms_corpus, control = list(
  tolower = TRUE,
  removeNumbers = TRUE,
  stopwords = function(x) { removeWords(x, stopwords()) },
  removePunctuation = TRUE,
  stemming = TRUE
))
# compare the result
sms_dtm
<<DocumentTermMatrix (documents: 5559, terms: 6518)>>
Non-/sparse entries: 42113/36191449
Sparsity           : 100%
Maximal term length: 40
Weighting          : term frequency (tf)
sms_dtm2
<<DocumentTermMatrix (documents: 5559, terms: 6909)>>
Non-/sparse entries: 43192/38363939
Sparsity           : 100%
Maximal term length: 40
Weighting          : term frequency (tf)
sms_dtm3
<<DocumentTermMatrix (documents: 5559, terms: 6518)>>
Non-/sparse entries: 42113/36191449
Sparsity           : 100%
Maximal term length: 40
Weighting          : term frequency (tf)
# creating training and test datasets
# Here I used dtm3 for my clean data as oppose to dtm was used in the lecture
sms_dtm_train <- sms_dtm3[1:4169, ]
sms_dtm_test  <- sms_dtm3[4170:5559, ]
# also save the labels
sms_train_labels <- sms_raw[1:4169, ]$type # label data by their types: ham or spam
sms_test_labels  <- sms_raw[4170:5559, ]$type
# check that the proportion of spam is similar in training vs testing
prop.table(table(sms_train_labels))
sms_train_labels
      ham      spam 
0.8647158 0.1352842 
prop.table(table(sms_test_labels))
sms_test_labels
      ham      spam 
0.8683453 0.1316547 
# word cloud visualization
library(wordcloud)
wordcloud(sms_corpus_clean, min.freq = 50, random.order = FALSE)

# random =false means the higher frequence word will appear closer to the center
# min.fre=50 means the word has to accur in the corpus=50/5559=1%.
# subset the training data into spam and ham groups
spam <- subset(sms_raw, type == "spam")
ham  <- subset(sms_raw, type == "ham")
wordcloud(spam$text, max.words = 40, scale = c(3, 0.5))

wordcloud(ham$text, max.words = 40, scale = c(3, 0.5))

sms_dtm_freq_train <- removeSparseTerms(sms_dtm_train, 0.999)
sms_dtm_freq_train
<<DocumentTermMatrix (documents: 4169, terms: 1101)>>
Non-/sparse entries: 24834/4565235
Sparsity           : 99%
Maximal term length: 19
Weighting          : term frequency (tf)
# indicator features for frequent words
sms_freq_words <- findFreqTerms(sms_dtm_train, 5)
str(sms_freq_words)
 chr [1:1136] "abiola" "abl" "abt" ...
# 5 represents at least 5 accurances
#findFreqTerms: takes a DTM and returns a character vector containing
# the words that appear for at least the specified number of times
# create DTMs with only the frequent terms
sms_dtm_freq_train <- sms_dtm_train[ , sms_freq_words]  # remove colums that does not appear at least 5 times
sms_dtm_freq_test <- sms_dtm_test[ , sms_freq_words] 
# convert counts to a factor
convert_counts <- function(x) {
  x <- ifelse(x > 0, "Yes", "No")
}
#The ifelse(x > 0, "Yes", "No") statement transforms the values in x, so that if the value is greater than 0, then it will be replaced by "Yes",otherwise it will be replaced by a "No" string. Lastly, the newly transformed x vector is returned.
# apply() convert_counts() to columns of train/test data
sms_train <- apply(sms_dtm_freq_train, MARGIN = 2, convert_counts)
sms_test  <- apply(sms_dtm_freq_test, MARGIN = 2, convert_counts)
# margin= 2 is to apply to column, margin=1 is to row.

Step 3- Training a model on the data

library(e1071)
sms_classifier <- naiveBayes(sms_train, sms_train_labels)
# what's in the classifier: conditional probability for each

Step 4-Evaluating model performance

sms_test_pred <- predict(sms_classifier, sms_test)
sms_test_pred
   [1] ham  ham  ham  ham  spam ham  ham  ham  ham 
  [10] spam ham  ham  ham  spam spam ham  ham  ham 
  [19] ham  ham  ham  ham  ham  ham  ham  ham  ham 
  [28] ham  ham  ham  ham  ham  ham  ham  spam ham 
  [37] ham  ham  ham  spam ham  ham  ham  ham  ham 
  [46] ham  ham  ham  ham  spam ham  ham  ham  ham 
  [55] ham  ham  ham  ham  ham  ham  ham  ham  ham 
  [64] ham  ham  ham  ham  ham  ham  ham  ham  ham 
  [73] ham  ham  ham  ham  ham  ham  ham  ham  spam
  [82] ham  ham  ham  ham  ham  ham  ham  ham  ham 
  [91] ham  ham  ham  ham  ham  ham  ham  spam ham 
 [100] ham  ham  ham  ham  ham  ham  ham  ham  ham 
 [109] ham  spam spam ham  ham  ham  ham  ham  ham 
 [118] ham  ham  ham  ham  ham  ham  ham  ham  spam
 [127] ham  ham  ham  ham  ham  ham  spam ham  ham 
 [136] ham  ham  ham  ham  ham  ham  ham  ham  ham 
 [145] ham  ham  spam ham  ham  ham  ham  ham  spam
 [154] ham  spam ham  ham  ham  spam ham  ham  ham 
 [163] ham  ham  spam ham  ham  ham  spam ham  ham 
 [172] ham  ham  ham  ham  ham  ham  ham  spam ham 
 [181] ham  ham  ham  ham  ham  ham  spam ham  ham 
 [190] ham  spam ham  ham  ham  ham  ham  ham  ham 
 [199] ham  spam spam ham  ham  ham  ham  ham  ham 
 [208] ham  ham  spam ham  ham  ham  ham  ham  ham 
 [217] spam ham  ham  ham  ham  ham  ham  ham  ham 
 [226] ham  ham  ham  ham  ham  ham  ham  ham  spam
 [235] ham  ham  ham  ham  spam ham  ham  ham  ham 
 [244] ham  ham  ham  ham  ham  ham  spam ham  ham 
 [253] ham  ham  ham  ham  ham  ham  ham  ham  ham 
 [262] ham  ham  ham  ham  ham  ham  ham  ham  spam
 [271] ham  ham  ham  ham  ham  ham  ham  ham  ham 
 [280] ham  ham  ham  ham  ham  ham  ham  ham  ham 
 [289] ham  ham  ham  spam ham  ham  ham  ham  ham 
 [298] spam ham  ham  ham  ham  ham  ham  ham  ham 
 [307] ham  spam ham  ham  ham  spam ham  ham  ham 
 [316] ham  ham  ham  ham  ham  ham  ham  ham  ham 
 [325] ham  ham  ham  spam ham  ham  ham  ham  ham 
 [334] ham  ham  ham  spam ham  ham  ham  ham  ham 
 [343] ham  ham  ham  ham  spam spam ham  ham  ham 
 [352] ham  ham  ham  ham  ham  ham  ham  ham  ham 
 [361] ham  ham  ham  ham  ham  ham  ham  spam ham 
 [370] ham  ham  ham  ham  ham  ham  ham  ham  ham 
 [379] spam ham  ham  ham  ham  spam ham  spam ham 
 [388] ham  ham  ham  ham  ham  ham  ham  ham  ham 
 [397] ham  ham  ham  ham  ham  ham  ham  ham  ham 
 [406] spam ham  ham  spam ham  ham  ham  spam spam
 [415] ham  ham  ham  ham  ham  ham  spam ham  ham 
 [424] ham  ham  ham  ham  ham  ham  ham  ham  ham 
 [433] ham  ham  spam ham  spam ham  ham  ham  ham 
 [442] ham  ham  ham  ham  ham  ham  spam ham  ham 
 [451] ham  spam ham  ham  ham  ham  ham  ham  ham 
 [460] ham  ham  ham  ham  spam ham  ham  ham  ham 
 [469] ham  ham  ham  ham  ham  ham  ham  ham  ham 
 [478] ham  spam ham  ham  ham  spam ham  ham  ham 
 [487] ham  ham  ham  spam ham  ham  ham  ham  ham 
 [496] ham  spam ham  ham  spam spam spam ham  ham 
 [505] ham  ham  ham  ham  spam ham  ham  ham  ham 
 [514] spam ham  ham  ham  ham  ham  ham  ham  ham 
 [523] ham  spam ham  ham  ham  ham  ham  ham  ham 
 [532] ham  ham  ham  ham  spam ham  spam ham  ham 
 [541] ham  ham  ham  ham  spam ham  ham  ham  ham 
 [550] ham  ham  ham  ham  ham  ham  ham  ham  ham 
 [559] ham  ham  ham  ham  ham  spam ham  ham  ham 
 [568] ham  ham  ham  ham  ham  ham  ham  ham  ham 
 [577] spam ham  ham  ham  ham  ham  ham  ham  ham 
 [586] ham  ham  ham  ham  spam ham  ham  ham  ham 
 [595] ham  ham  ham  ham  ham  ham  ham  ham  ham 
 [604] ham  ham  ham  ham  spam ham  spam ham  ham 
 [613] ham  ham  ham  ham  ham  ham  ham  ham  ham 
 [622] ham  ham  ham  spam ham  ham  ham  spam ham 
 [631] ham  ham  ham  ham  ham  ham  ham  ham  ham 
 [640] spam ham  ham  ham  ham  ham  ham  ham  ham 
 [649] ham  spam ham  ham  ham  ham  ham  ham  ham 
 [658] ham  ham  ham  ham  ham  ham  ham  ham  ham 
 [667] ham  ham  ham  ham  ham  ham  ham  ham  ham 
 [676] ham  spam ham  ham  ham  ham  ham  ham  ham 
 [685] ham  spam spam ham  ham  spam ham  ham  spam
 [694] ham  ham  ham  ham  ham  ham  ham  ham  ham 
 [703] ham  ham  ham  ham  ham  spam ham  ham  ham 
 [712] ham  ham  ham  ham  ham  spam ham  ham  ham 
 [721] ham  ham  ham  ham  ham  ham  ham  ham  ham 
 [730] ham  spam ham  spam ham  ham  spam ham  ham 
 [739] ham  ham  spam ham  spam ham  ham  ham  ham 
 [748] ham  spam ham  spam ham  ham  ham  ham  ham 
 [757] ham  ham  ham  ham  ham  ham  ham  ham  ham 
 [766] ham  ham  ham  ham  spam ham  ham  ham  ham 
 [775] ham  ham  ham  ham  ham  ham  spam ham  ham 
 [784] ham  ham  ham  ham  ham  ham  ham  ham  ham 
 [793] ham  ham  ham  ham  ham  ham  ham  ham  ham 
 [802] spam ham  ham  ham  ham  ham  ham  ham  ham 
 [811] ham  ham  ham  spam ham  ham  spam ham  spam
 [820] ham  ham  ham  ham  spam ham  spam ham  spam
 [829] ham  ham  ham  spam ham  ham  ham  ham  spam
 [838] ham  ham  ham  ham  ham  ham  ham  ham  ham 
 [847] ham  ham  ham  ham  ham  ham  ham  ham  ham 
 [856] ham  ham  spam ham  ham  spam ham  ham  ham 
 [865] ham  ham  ham  ham  ham  ham  ham  ham  ham 
 [874] ham  ham  ham  ham  ham  ham  ham  ham  ham 
 [883] ham  ham  ham  ham  ham  ham  ham  ham  ham 
 [892] ham  ham  ham  ham  spam spam ham  ham  spam
 [901] ham  ham  ham  ham  ham  ham  ham  ham  spam
 [910] ham  ham  ham  ham  spam ham  ham  ham  ham 
 [919] ham  ham  ham  ham  ham  ham  ham  ham  ham 
 [928] ham  ham  ham  spam ham  ham  ham  ham  ham 
 [937] ham  ham  ham  ham  ham  ham  ham  ham  ham 
 [946] ham  ham  ham  ham  ham  ham  ham  ham  ham 
 [955] ham  spam ham  ham  ham  ham  ham  ham  ham 
 [964] ham  ham  ham  spam ham  ham  ham  ham  ham 
 [973] ham  ham  ham  ham  ham  ham  ham  spam ham 
 [982] ham  ham  ham  ham  ham  ham  ham  ham  ham 
 [991] ham  ham  ham  spam ham  ham  ham  ham  ham 
[1000] spam
 [ reached getOption("max.print") -- omitted 390 entries ]
Levels: ham spam
library(gmodels)
CrossTable(sms_test_pred, sms_test_labels,
           prop.chisq = FALSE, prop.t = FALSE, prop.r = FALSE,
           dnn = c('predicted', 'actual'))

 
   Cell Contents
|-------------------------|
|                       N |
|           N / Col Total |
|-------------------------|

 
Total Observations in Table:  1390 

 
             | actual 
   predicted |       ham |      spam | Row Total | 
-------------|-----------|-----------|-----------|
         ham |      1201 |        30 |      1231 | 
             |     0.995 |     0.164 |           | 
-------------|-----------|-----------|-----------|
        spam |         6 |       153 |       159 | 
             |     0.005 |     0.836 |           | 
-------------|-----------|-----------|-----------|
Column Total |      1207 |       183 |      1390 | 
             |     0.868 |     0.132 |           | 
-------------|-----------|-----------|-----------|

 

Step 5: Improving model performancestep

sms_classifier2 <- naiveBayes(sms_train, sms_train_labels, laplace = 1)
sms_test_pred2 <- predict(sms_classifier2, sms_test)
CrossTable(sms_test_pred2, sms_test_labels,
           prop.chisq = FALSE, prop.t = FALSE, prop.r = FALSE,
           dnn = c('predicted', 'actual'))

 
   Cell Contents
|-------------------------|
|                       N |
|           N / Col Total |
|-------------------------|

 
Total Observations in Table:  1390 

 
             | actual 
   predicted |       ham |      spam | Row Total | 
-------------|-----------|-----------|-----------|
         ham |      1202 |        28 |      1230 | 
             |     0.996 |     0.153 |           | 
-------------|-----------|-----------|-----------|
        spam |         5 |       155 |       160 | 
             |     0.004 |     0.847 |           | 
-------------|-----------|-----------|-----------|
Column Total |      1207 |       183 |      1390 | 
             |     0.868 |     0.132 |           | 
-------------|-----------|-----------|-----------|

 
## change lplace =1 makes the model a lil bit beter (cuz FN and FP decrease)
## tuning parameter is to add 1.5 to each value is that to move them a lil away from 0
## Naive Bayes is just as slow as k-NN

Accuracy when laplace=0: Accuracy=(1201+153)/1390= 97%

Accuracy when laplace=1: Accuracy=(1202+155)/1390= 98%

---
title: "Filtering spam SMS messages using Naive Bayes"
output: html_notebook
---
Step 1-Collecting Data

To develop the Naive Bayes classifier, we will use data adapted from the SMS Spam
Collection at http://www.dt.fee.unicamp.br/~tiago/smsspamcollection/.

This dataset includes the text of SMS messages along with a label indicating
whether the message is unwanted. Junk messages are labeled spam, while
legitimate messages are labeled ham.

Step 2-Exploring and preparing the data
```{r}
# read the sms data into the sms data frame
sms_raw <- read.csv("sms_spam.csv", stringsAsFactors = FALSE)
```
```{r}
# examine the structure of the sms data
str(sms_raw)
```
```{r}
# convert spam/ham to factor. Factor converts charactor vector to numerical like 0,1.
sms_raw$type <- factor(sms_raw$type)
```
```{r}
# build a corpus using the text mining (tm) package
library(tm)
sms_corpus <- VCorpus(VectorSource(sms_raw$text))
# vectorSource takes apart email,it's a reader function.Vcourpus put into a nice structure.Vcorpus stores in memory as oppose to Pcorpus, stored in Disk
# Corpus is a collection of text documents
```

```{r}
# examine the sms corpus
print(sms_corpus)
inspect(sms_corpus[1:2]) # summary of the 1st and 2nd SMS message in the corpus

as.character(sms_corpus[[1]])
lapply(sms_corpus[1:2], as.character) # lapply apply a procedure to each element in the data sturture
```

```{r}
# clean up the corpus using tm_map()
sms_corpus_clean <- tm_map(sms_corpus, content_transformer(tolower)) #tolower : transform all letters to lower case 
```

```{r}
# show the difference between sms_corpus and corpus_clean
as.character(sms_corpus[[1]]) # as.charactor to view the actual text 
as.character(sms_corpus_clean[[1]])
sms_corpus_clean <- tm_map(sms_corpus_clean, removeNumbers)
# remove numbers
sms_corpus_clean <- tm_map(sms_corpus_clean, removeWords, stopwords()) # remove stop words like and, or the
sms_corpus_clean <- tm_map(sms_corpus_clean, removePunctuation) # remove punctuation
```

```{r}
# tip: create a custom function to replace (rather than remove) punctuation
removePunctuation("hello...world")
replacePunctuation <- function(x) { gsub("[[:punct:]]+", " ", x) } # replace the punctuations in x with a space rather than remove
replacePunctuation("hello...world")
```

```{r}
# illustration of word stemming
library(SnowballC)  # snowwball put words into stems: eg: left to leave
wordStem(c("learn", "learned", "learning", "learns"))

sms_corpus_clean <- tm_map(sms_corpus_clean, stemDocument)

sms_corpus_clean <- tm_map(sms_corpus_clean, stripWhitespace) # eliminate unneeded whitespace
```
```{r}
# create a document-term sparse matrix: DTM:rows indicates text messgaes and columns indicate words
sms_dtm <- DocumentTermMatrix(sms_corpus_clean) 
```

```{r}
# alternative solution: create a document-term sparse matrix directly from the SMS corpus # NICE:do all the data cleaning in one step.
sms_dtm2 <- DocumentTermMatrix(sms_corpus, control = list(
  tolower = TRUE,
  removeNumbers = TRUE,
  stopwords = TRUE,
  removePunctuation = TRUE,
  stemming = TRUE
))
```

```{r}
# alternative solution: using custom stop words function ensures identical result
sms_dtm3 <- DocumentTermMatrix(sms_corpus, control = list(
  tolower = TRUE,
  removeNumbers = TRUE,
  stopwords = function(x) { removeWords(x, stopwords()) },
  removePunctuation = TRUE,
  stemming = TRUE
))

```

```{r}
# compare the result
sms_dtm
sms_dtm2
sms_dtm3
```
```{r}
# creating training and test datasets
# Here I used dtm3 for my clean data as oppose to dtm was used in the lecture
sms_dtm_train <- sms_dtm3[1:4169, ]
sms_dtm_test  <- sms_dtm3[4170:5559, ]
```

```{r}
# also save the labels
sms_train_labels <- sms_raw[1:4169, ]$type # label data by their types: ham or spam
sms_test_labels  <- sms_raw[4170:5559, ]$type
```

```{r}
# check that the proportion of spam is similar in training vs testing
prop.table(table(sms_train_labels))
prop.table(table(sms_test_labels))
```
```{r}
# word cloud visualization
library(wordcloud)
wordcloud(sms_corpus_clean, min.freq = 50, random.order = FALSE)
# random =false means the higher frequence word will appear closer to the center
# min.fre=50 means the word has to accur in the corpus=50/5559=1%.

# subset the training data into spam and ham groups
spam <- subset(sms_raw, type == "spam")
ham  <- subset(sms_raw, type == "ham")

wordcloud(spam$text, max.words = 40, scale = c(3, 0.5))
wordcloud(ham$text, max.words = 40, scale = c(3, 0.5))

sms_dtm_freq_train <- removeSparseTerms(sms_dtm_train, 0.999)
sms_dtm_freq_train
```
```{r}
# indicator features for frequent words
sms_freq_words <- findFreqTerms(sms_dtm_train, 5)
str(sms_freq_words)
# 5 represents at least 5 accurances
#findFreqTerms: takes a DTM and returns a character vector containing
# the words that appear for at least the specified number of times
```

```{r}
# create DTMs with only the frequent terms
sms_dtm_freq_train <- sms_dtm_train[ , sms_freq_words]  # remove colums that does not appear at least 5 times
sms_dtm_freq_test <- sms_dtm_test[ , sms_freq_words] 

```

```{r}
# convert counts to a factor
convert_counts <- function(x) {
  x <- ifelse(x > 0, "Yes", "No")
}
#The ifelse(x > 0, "Yes", "No") statement transforms the values in x, so that if the value is greater than 0, then it will be replaced by "Yes",otherwise it will be replaced by a "No" string. Lastly, the newly transformed x vector is returned.
```

```{r}
# apply() convert_counts() to columns of train/test data
sms_train <- apply(sms_dtm_freq_train, MARGIN = 2, convert_counts)
sms_test  <- apply(sms_dtm_freq_test, MARGIN = 2, convert_counts)
# margin= 2 is to apply to column, margin=1 is to row.
```

Step 3- Training a model on the data
```{r}
library(e1071)
sms_classifier <- naiveBayes(sms_train, sms_train_labels)
# what's in the classifier: conditional probability for each
```

Step 4-Evaluating model performance
```{r}
sms_test_pred <- predict(sms_classifier, sms_test)

sms_test_pred

library(gmodels)
CrossTable(sms_test_pred, sms_test_labels,
           prop.chisq = FALSE, prop.t = FALSE, prop.r = FALSE,
           dnn = c('predicted', 'actual'))

```
Step 5: Improving model performancestep
```{r}
sms_classifier2 <- naiveBayes(sms_train, sms_train_labels, laplace = 1)
sms_test_pred2 <- predict(sms_classifier2, sms_test)
CrossTable(sms_test_pred2, sms_test_labels,
           prop.chisq = FALSE, prop.t = FALSE, prop.r = FALSE,
           dnn = c('predicted', 'actual'))
## change lplace =1 makes the model a lil bit beter (cuz FN and FP decrease)
## tuning parameter is to add 1.5 to each value is that to move them a lil away from 0
## Naive Bayes is just as slow as k-NN
```
Accuracy when laplace=0:
Accuracy=(1201+153)/1390= 97%

Accuracy when laplace=1:
Accuracy=(1202+155)/1390= 98%


