Q6 - KT02

## Loading required package: Matrix

## Loaded glmnet 4.0-2

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

## Loading required package: NLP

#Data Wrangling for LR

sms_dtm <- Corpus(VectorSource(sms_raw$text)) %>%
  tm_map(removeNumbers) %>%
  tm_map(stripWhitespace) %>%
  tm_map(removeWords, stopwords()) %>%
  tm_map(content_transformer(tolower)) %>%
  tm_map(removePunctuation) %>%
  tm_map(stemDocument) %>%
  DocumentTermMatrix()

## Warning in tm_map.SimpleCorpus(., removeNumbers): transformation drops documents

## Warning in tm_map.SimpleCorpus(., stripWhitespace): transformation drops
## documents

## Warning in tm_map.SimpleCorpus(., removeWords, stopwords()): transformation
## drops documents

## Warning in tm_map.SimpleCorpus(., content_transformer(tolower)): transformation
## drops documents

## Warning in tm_map.SimpleCorpus(., removePunctuation): transformation drops
## documents

## Warning in tm_map.SimpleCorpus(., stemDocument): transformation drops documents

set.seed(243)
index = sample(5559, 5559*0.8)

sms_train_matrix <- as.matrix(sms_dtm[index, ])
sms_test_matrix <- as.matrix(sms_dtm[-index, ])

sms_dtm_train <- Matrix(sms_train_matrix, sparse = T)
sms_dtm_test  <- Matrix(sms_test_matrix, sparse = T)

# save the labels
sms_train_labels <- sms_raw[index, ]$type
sms_test_labels  <- sms_raw[-index, ]$type





LRstart.time <- Sys.time()
fit_glm <- glmnet(sms_dtm_train, sms_train_labels, family = 'binomial')
LRend.time <- Sys.time()
print(paste("Time taken when data is substantial for LR ::",LRend.time - LRstart.time))

## [1] "Time taken when data is substantial for LR :: 0.199887990951538"

LRPredstart.time <- Sys.time()
pred <- predict(fit_glm, sms_dtm_test, type = 'response')
LRPredend.time <- Sys.time()
print(paste("Time taken to predict when data is substantial for LR ::",LRPredend.time - LRPredstart.time))

## [1] "Time taken to predict when data is substantial for LR :: 0.0249860286712646"

# reducing the size of the data to 100

set.seed(2432)
index = sample(50, 50*0.8)

sms_train_matrix <- as.matrix(sms_dtm[index, ])
sms_test_matrix <- as.matrix(sms_dtm[-index, ])

sms_dtm_train <- Matrix(sms_train_matrix, sparse = T)
sms_dtm_test  <- Matrix(sms_test_matrix, sparse = T)

# save the labels
sms_train_labels <- sms_raw[index, ]$type
sms_test_labels  <- sms_raw[-index, ]$type


LRstart_2.time <- Sys.time()
fit_glm <- glmnet(sms_dtm_train, sms_train_labels, family = 'binomial')
LRend_2.time <- Sys.time()
print(paste("Time taken when data is reduced for LR ::",LRend_2.time - LRstart_2.time))

## [1] "Time taken when data is reduced for LR :: 0.0289850234985352"

LRPredstart_2.time <- Sys.time()
pred <- predict(fit_glm, sms_dtm_test, type = 'response')
LRPredend_2.time <- Sys.time()
print(paste("Time taken to predict when data is reduced for LR ::",LRPredstart_2.time - LRPredend_2.time))

## [1] "Time taken to predict when data is reduced for LR :: -0.0869498252868652"

#Data Wrangling for NB
sms_corpus <- VCorpus(VectorSource(sms_raw$text))  
lapply(sms_corpus[1:2], as.character)

## $`1`
## [1] "Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat..."
## 
## $`2`
## [1] "Ok lar... Joking wif u oni..."

replacePunctuation <- function(x) { gsub("[[:punct:]]+", " ", x) }

sms_dtm <- DocumentTermMatrix(sms_corpus, control = list(
  tolower = TRUE,
  removeNumbers = TRUE,
  stopwords = function(x) { removeWords(x, stopwords()) },
  removePunctuation = TRUE,
  stemming = TRUE
))

set.seed(242)
sms_dtm_train <- sms_dtm[1:4169, ]
sms_dtm_test  <- sms_dtm[4170:5559, ]
sms_train_labels <- sms_raw[1:4169, ]$type
sms_test_labels  <- sms_raw[4170:5559, ]$type


sms_freq_words <- findFreqTerms(sms_dtm_train, 5)
sms_dtm_freq_train <- sms_dtm_train[ , sms_freq_words]
sms_dtm_freq_test <- sms_dtm_test[ , sms_freq_words]

convert_counts <- function(x) {
  x <- ifelse(x > 0, "Yes", "No")
}

sms_train <- apply(sms_dtm_freq_train, MARGIN = 2, convert_counts)
sms_test  <- apply(sms_dtm_freq_test, MARGIN = 2, convert_counts)



start.time <- Sys.time()
sms_classifier <- naiveBayes(sms_train, sms_train_labels)
end.time <- Sys.time()

print(paste("Time taken when data is substantial for NB ::",end.time - start.time))

## [1] "Time taken when data is substantial for NB :: 0.799540996551514"

NBPredstart.time <- Sys.time()
sms_test_pred <- predict(sms_classifier, sms_test)
NBPredend.time <- Sys.time()
print(paste("Time taken to predict when data is substantial for NB ::",NBPredend.time - NBPredstart.time))

## [1] "Time taken to predict when data is substantial for NB :: 37.7103359699249"

# Reducing the size of data for NB


set.seed(2421)
sms_dtm_train <- sms_dtm[1:40, ]
sms_dtm_test  <- sms_dtm[41:50, ]

sms_train_labels <- sms_raw[1:40, ]$type
sms_test_labels  <- sms_raw[41:50, ]$type

sms_freq_words <- findFreqTerms(sms_dtm_train, 5)
sms_dtm_freq_train <- sms_dtm_train[ , sms_freq_words]
sms_dtm_freq_test <- sms_dtm_test[ , sms_freq_words]

convert_counts <- function(x) {
  x <- ifelse(x > 0, "Yes", "No")
}

sms_train <- apply(sms_dtm_freq_train, MARGIN = 2, convert_counts)
sms_test  <- apply(sms_dtm_freq_test, MARGIN = 2, convert_counts)



start.time <- Sys.time()
sms_classifier <- naiveBayes(sms_train, sms_train_labels)
end.time <- Sys.time()
print(paste("Time taken when data is reduced for NB ::",end.time - start.time))

## [1] "Time taken when data is reduced for NB :: 0.00398492813110352"

NBPredstart.time <- Sys.time()
sms_test_pred <- predict(sms_classifier, sms_test)
NBPredend.time <- Sys.time()
print(paste("Time taken to predict when data is reduced for NB ::",NBPredend.time - NBPredstart.time))

## [1] "Time taken to predict when data is reduced for NB :: 0.00699687004089355"

From the above we can clearly see when data is less it is NB is slighly better in training the data very slightly but while predicting it is remarkably faster.

But as the data is substantial , LR outperforms NB in both training and also predicting .

Q6 - KT02

Vishal Arora

10/05/2020