library(tm)
## Loading required package: NLP
library(SnowballC)
library(Matrix)
library(ggplot2)
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
##
## annotate
library(ROCR)
library(caret)
## Loading required package: lattice
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(glmnet) #logistic regression
## Loaded glmnet 4.0-2
library(e1071) #naive bayes
Compare Logistic Regression and Naive Bayes for text classification. The data set used is the SMS Spam Collection dataset from Kaggle.
set.seed(123)
# read the sms data into the sms data frame
sms_raw <- read.csv("sms_spam.csv", stringsAsFactors = FALSE)
# examine the structure of the sms data
str(sms_raw)
## 'data.frame': 5574 obs. of 2 variables:
## $ type: chr "ham" "ham" "spam" "ham" ...
## $ text: chr "Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat..." "Ok lar... Joking wif u oni..." "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question("| __truncated__ "U dun say so early hor... U c already then say..." ...
# convert spam/ham to factor.
sms_raw$type <- factor(sms_raw$type)
# create DTM using TM package
# note: lowercase first so that stopword removal catches "The", "You", etc.
sms_dtm <- Corpus(VectorSource(sms_raw$text)) %>%
  tm_map(content_transformer(tolower)) %>%
  tm_map(removeNumbers) %>%
  tm_map(removeWords, stopwords()) %>%
  tm_map(removePunctuation) %>%
  tm_map(stemDocument) %>%
  tm_map(stripWhitespace) %>%
  DocumentTermMatrix()
## Warning in tm_map.SimpleCorpus(., content_transformer(tolower)): transformation
## drops documents
## Warning in tm_map.SimpleCorpus(., removeNumbers): transformation drops documents
## Warning in tm_map.SimpleCorpus(., removeWords, stopwords()): transformation
## drops documents
## Warning in tm_map.SimpleCorpus(., removePunctuation): transformation drops
## documents
## Warning in tm_map.SimpleCorpus(., stemDocument): transformation drops documents
## Warning in tm_map.SimpleCorpus(., stripWhitespace): transformation drops
## documents
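These warnings come from tm's SimpleCorpus backend and are generally harmless here. Before splitting, a quick look at the resulting matrix is worthwhile; the checks below are a sketch using standard tm helpers:
# dimensions of the document-term matrix (documents x terms)
dim(sms_dtm)
# peek at the counts for a few documents and terms
inspect(sms_dtm[1:5, 1:8])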
## Logistic Regression

### Creating train and test data

# creating train and test data: 80/20 split over all rows
index <- sample(nrow(sms_raw), floor(nrow(sms_raw) * 0.8))
sms_train_matrix <- as.matrix(sms_dtm[index, ])
sms_test_matrix <- as.matrix(sms_dtm[-index, ])
sms_dtm_train <- Matrix(sms_train_matrix, sparse = T)
sms_dtm_test <- Matrix(sms_test_matrix, sparse = T)
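As an aside, as.matrix() materializes the full dense matrix before the sparse conversion. For a larger corpus, the sparse matrix can be built directly from the DTM's triplet representation; a minimal sketch (the helper name dtm_to_sparse is ours):
# build a dgCMatrix straight from the DTM triplets, skipping the dense copy
dtm_to_sparse <- function(dtm) {
  sparseMatrix(i = dtm$i, j = dtm$j, x = dtm$v,
               dims = dim(dtm), dimnames = dimnames(dtm))
}
# e.g. sms_dtm_train <- dtm_to_sparse(sms_dtm[index, ])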
# save the labels
sms_train_labels <- sms_raw[index, ]$type
sms_test_labels <- sms_raw[-index, ]$type
set.seed(200)
# check for the proportion of train and test
prop.table(table(sms_train_labels))
## sms_train_labels
## ham spam
## 0.8659771 0.1340229
prop.table(table(sms_test_labels))
## sms_test_labels
## ham spam
## 0.866016 0.133984
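The simple random split happens to preserve the ham/spam balance well here. For a guaranteed stratified split, caret (already loaded) provides createDataPartition; a sketch, where idx is a new name:
# stratified 80/20 split on the class label
idx <- createDataPartition(sms_raw$type, p = 0.8, list = FALSE)
# idx could then replace index in the subsetting steps above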
### Train model
# start the timer for the logistic regression fit
tm_start_lr <- proc.time()
# fit ridge logistic regression and predict on the test set
fit_ridge <- glmnet(sms_dtm_train, sms_train_labels, family = 'binomial', alpha = 0)
pred <- predict(fit_ridge, sms_dtm_test, type = 'response')
head(pred[,1:11])
## s0 s1 s2 s3 s4 s5 s6
## 4 0.1340229 0.1329362 0.1328841 0.1328297 0.1327728 0.1327131 0.1326506
## 6 0.1340229 0.1368628 0.1369984 0.1371396 0.1372875 0.1374424 0.1376046
## 11 0.1340229 0.1328027 0.1327443 0.1326833 0.1326195 0.1325526 0.1324825
## 15 0.1340229 0.1337423 0.1337284 0.1337137 0.1336983 0.1336821 0.1336651
## 22 0.1340229 0.1334361 0.1334076 0.1333778 0.1333465 0.1333137 0.1332792
## 23 0.1340229 0.1329871 0.1329374 0.1328855 0.1328311 0.1327741 0.1327144
## s7 s8 s9 s10
## 4 0.1325851 0.1325164 0.1324445 0.1323691
## 6 0.1377745 0.1379524 0.1381387 0.1383338
## 11 0.1324091 0.1323322 0.1322516 0.1321672
## 15 0.1336472 0.1336283 0.1336084 0.1335875
## 22 0.1332430 0.1332051 0.1331652 0.1331234
## 23 0.1326518 0.1325862 0.1325174 0.1324453
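Each column s0, s1, ... above is a prediction at one value of lambda along the regularization path; glmnet alone does not select a single model. A common way to pick lambda is cross-validation. A sketch (cv_fit, pred_cv, and the 0.5 cutoff are our additions):
# choose lambda by cross-validation, then predict once at lambda.min
cv_fit <- cv.glmnet(sms_dtm_train, sms_train_labels, family = 'binomial', alpha = 0)
pred_cv <- predict(cv_fit, sms_dtm_test, s = 'lambda.min', type = 'response')
# threshold the spam probability at 0.5 to get class labels
pred_class <- ifelse(pred_cv > 0.5, 'spam', 'ham')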
tm_lr <- proc.time() - tm_start_lr
tm_lr
## user system elapsed
## 5.71 0.82 7.25
## Naive Bayes
# creating training and test datasets
sms_naive_train <- sms_dtm[index, ]
sms_naive_test <- sms_dtm[-index, ]
# save the labels
sms_naive_train_labels <- sms_raw[index, ]$type
sms_naive_test_labels <- sms_raw[-index, ]$type
# check for the proportion of train and test
prop.table(table(sms_naive_train_labels))
## sms_naive_train_labels
## ham spam
## 0.8659771 0.1340229
prop.table(table(sms_naive_test_labels))
## sms_naive_test_labels
## ham spam
## 0.866016 0.133984
# save frequently-appearing terms to a character vector
sms_freq_words <- findFreqTerms(sms_naive_train, 5)
# create DTMs with only the frequent terms
sms_dtm_freq_train <- sms_naive_train[ , sms_freq_words]
sms_dtm_freq_test <- sms_naive_test[ , sms_freq_words]
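A quick check on how aggressive the frequency filter is (a sketch):
# number of terms appearing at least 5 times in the training data
length(sms_freq_words)
# documents x frequent terms
dim(sms_dtm_freq_train)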
# convert word counts to "Yes"/"No" indicators
convert_counts <- function(x) {
  ifelse(x > 0, "Yes", "No")
}
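For example, on a toy vector:
# quick check of the helper
convert_counts(c(0, 2, 5))
## [1] "No"  "Yes" "Yes"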
set.seed(220)
# apply convert_counts() to each column of the train/test DTMs
sms_train <- apply(sms_dtm_freq_train, MARGIN = 2, convert_counts)
sms_test <- apply(sms_dtm_freq_test, MARGIN = 2, convert_counts)
# start the timer for Naive Bayes
tm_start_nb <- proc.time()
# train the model and get posterior probabilities on the test set
sms_classifier <- naiveBayes(sms_train, sms_naive_train_labels)
sms_naive_pred <- predict(sms_classifier, sms_test, type = 'raw')
tm_nb <- proc.time() - tm_start_nb
tm_nb
## user system elapsed
## 7.96 0.24 8.57
The Naive Bayes classification algorithm is particularly useful for categorical data and text mining. Its foundation is Bayes' theorem: it computes the posterior probability of a class given the observed features. It is called "naive" because it assumes that all features (here, the words a message contains) are equally important and conditionally independent given the class, so the posterior factorizes as P(spam | w1, ..., wn) ∝ P(spam) · P(w1 | spam) · ... · P(wn | spam). Using Naive Bayes, we get a higher accuracy in predicting whether a message is spam based on the words it contains.
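That claim is easy to check directly. Below is a sketch of scoring both models with caret's confusionMatrix and ROCR's AUC (it reuses pred_cv from the cross-validation sketch above; the 0.5 cutoff is an assumption):
# logistic regression: classes from the cross-validated probabilities
lr_class <- factor(ifelse(pred_cv > 0.5, 'spam', 'ham'), levels = levels(sms_test_labels))
confusionMatrix(lr_class, sms_test_labels, positive = 'spam')
# Naive Bayes: posterior probability of 'spam' from type = 'raw'
nb_class <- factor(ifelse(sms_naive_pred[, 'spam'] > 0.5, 'spam', 'ham'),
                   levels = levels(sms_naive_test_labels))
confusionMatrix(nb_class, sms_naive_test_labels, positive = 'spam')
# AUC via ROCR (loaded above but otherwise unused)
performance(prediction(as.numeric(pred_cv), sms_test_labels), 'auc')@y.values[[1]]
performance(prediction(sms_naive_pred[, 'spam'], sms_naive_test_labels), 'auc')@y.values[[1]]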
But when we compare the processing time of the two models, we see little difference between Naive Bayes and Logistic Regression.