Lab2 - Naive Bayes Model

library(tidyverse)

## ── Attaching packages ────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──

## ✔ ggplot2 3.1.0     ✔ purrr   0.2.5
## ✔ tibble  1.4.2     ✔ dplyr   0.7.8
## ✔ tidyr   0.8.2     ✔ stringr 1.3.1
## ✔ readr   1.2.1     ✔ forcats 0.3.0

## ── Conflicts ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()

library(naivebayes)

## Warning: package 'naivebayes' was built under R version 3.5.2

library(caret)

## Loading required package: lattice

## 
## Attaching package: 'caret'

## The following object is masked from 'package:purrr':
## 
##     lift

library(randomForest)

## randomForest 4.6-14

## Type rfNews() to see new features/changes/bug fixes.

## 
## Attaching package: 'randomForest'

## The following object is masked from 'package:dplyr':
## 
##     combine

## The following object is masked from 'package:ggplot2':
## 
##     margin

Credit Rating Dataset

creditData <- read_csv("creditData.csv")

## Parsed with column specification:
## cols(
##   .default = col_double()
## )

## See spec(...) for full column specifications.

str(creditData)

## Classes 'tbl_df', 'tbl' and 'data.frame':    1000 obs. of  21 variables:
##  $ Creditability                    : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ Account Balance                  : num  1 1 2 1 1 1 1 1 4 2 ...
##  $ Duration of Credit (month)       : num  18 9 12 12 12 10 8 6 18 24 ...
##  $ Payment Status of Previous Credit: num  4 4 2 4 4 4 4 4 4 2 ...
##  $ Purpose                          : num  2 0 9 0 0 0 0 0 3 3 ...
##  $ Credit Amount                    : num  1049 2799 841 2122 2171 ...
##  $ Value Savings/Stocks             : num  1 1 2 1 1 1 1 1 1 3 ...
##  $ Length of current employment     : num  2 3 4 3 3 2 4 2 1 1 ...
##  $ Instalment per cent              : num  4 2 2 3 4 1 1 2 4 1 ...
##  $ Sex & Marital Status             : num  2 3 2 3 3 3 3 3 2 2 ...
##  $ Guarantors                       : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ Duration in Current address      : num  4 2 4 2 4 3 4 4 4 4 ...
##  $ Most valuable available asset    : num  2 1 1 1 2 1 1 1 3 4 ...
##  $ Age (years)                      : num  21 36 23 39 38 48 39 40 65 23 ...
##  $ Concurrent Credits               : num  3 3 3 3 1 3 3 3 3 3 ...
##  $ Type of apartment                : num  1 1 1 1 2 1 2 2 2 1 ...
##  $ No of Credits at this Bank       : num  1 2 1 2 2 2 2 1 2 1 ...
##  $ Occupation                       : num  3 3 2 2 2 2 2 2 1 1 ...
##  $ No of dependents                 : num  1 2 1 2 1 2 1 2 1 1 ...
##  $ Telephone                        : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ Foreign Worker                   : num  1 1 1 2 2 2 2 2 1 1 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   Creditability = col_double(),
##   ..   `Account Balance` = col_double(),
##   ..   `Duration of Credit (month)` = col_double(),
##   ..   `Payment Status of Previous Credit` = col_double(),
##   ..   Purpose = col_double(),
##   ..   `Credit Amount` = col_double(),
##   ..   `Value Savings/Stocks` = col_double(),
##   ..   `Length of current employment` = col_double(),
##   ..   `Instalment per cent` = col_double(),
##   ..   `Sex & Marital Status` = col_double(),
##   ..   Guarantors = col_double(),
##   ..   `Duration in Current address` = col_double(),
##   ..   `Most valuable available asset` = col_double(),
##   ..   `Age (years)` = col_double(),
##   ..   `Concurrent Credits` = col_double(),
##   ..   `Type of apartment` = col_double(),
##   ..   `No of Credits at this Bank` = col_double(),
##   ..   Occupation = col_double(),
##   ..   `No of dependents` = col_double(),
##   ..   Telephone = col_double(),
##   ..   `Foreign Worker` = col_double()
##   .. )

Data Preprocessing

creditData$Creditability <- as.factor(creditData$Creditability)
sum(is.na(creditData))

## [1] 0

No NA values.

# 75% means 750 for training and the rest for testing
set.seed(12345)
credit_rand <- creditData[order(runif(1000)), ]
credit_train <- credit_rand[1:750, ]
credit_test <- credit_rand[751:1000, ]

prop.table(table(credit_train$Creditability))

## 
##         0         1 
## 0.3146667 0.6853333

prop.table(table(credit_test$Creditability))

## 
##     0     1 
## 0.256 0.744

The datasets look well distributed.

Full Model

naive_model <- naive_bayes(Creditability ~ ., data= credit_train)
naive_model

## ================================ Naive Bayes ================================= 
## Call: 
## naive_bayes.formula(formula = Creditability ~ ., data = credit_train)
## 
## A priori probabilities: 
## 
##         0         1 
## 0.3146667 0.6853333 
## 
## Tables: 
##                
## Account Balance        0        1
##            mean 1.923729 2.793774
##            sd   1.036826 1.252008
## 
##                           
## Duration of Credit (month)        0        1
##                       mean 24.46610 19.20039
##                       sd   13.82208 11.13433
## 
##                                  
## Payment Status of Previous Credit        0        1
##                              mean 2.161017 2.665370
##                              sd   1.071649 1.045219
## 
##        
## Purpose        0        1
##    mean 2.927966 2.803502
##    sd   2.944722 2.633253
## 
##              
## Credit Amount        0        1
##          mean 3964.195 2984.177
##          sd   3597.093 2379.685
## 
## # ... and 15 more tables

Confusion Matrix

conf_nat <- table(predict(naive_model, credit_test), credit_test$Creditability)
conf_nat

##    
##       0   1
##   0  42  35
##   1  22 151

The false negative percentage is higher than the false positive.

Accuracy <- sum(diag(conf_nat))/sum(conf_nat)*100
Accuracy

## [1] 77.2

This is an okay accuracy.

Optimization

creditDataScaled <- scale(credit_rand[,2:ncol(credit_rand)], center=TRUE, scale = TRUE)
m <- cor(creditDataScaled)
highlycor <- findCorrelation(m, 0.30)
highlycor

## [1]  5 12 19 15  3

#check how the above variables are correlated with the dependent variable
check <- credit_rand%>%select(highlycor,1)
check$Creditability<-as.numeric(check$Creditability)
cor(check)

##                                 Purpose Duration in Current address
## Purpose                      1.00000000                -0.038221345
## Duration in Current address -0.03822134                 1.000000000
## No of dependents            -0.03257687                 0.042643426
## Concurrent Credits          -0.10023039                 0.022654074
## Duration of Credit (month)   0.14749187                 0.034067202
## Creditability               -0.01797887                -0.002967159
##                             No of dependents Concurrent Credits
## Purpose                         -0.032576874        -0.10023039
## Duration in Current address      0.042643426         0.02265407
## No of dependents                 1.000000000        -0.07689064
## Concurrent Credits              -0.076890642         1.00000000
## Duration of Credit (month)      -0.023834475        -0.06288379
## Creditability                    0.003014853         0.10984410
##                             Duration of Credit (month) Creditability
## Purpose                                     0.14749187  -0.017978870
## Duration in Current address                 0.03406720  -0.002967159
## No of dependents                           -0.02383448   0.003014853
## Concurrent Credits                         -0.06288379   0.109844099
## Duration of Credit (month)                  1.00000000  -0.214926665
## Creditability                              -0.21492667   1.000000000

filteredData <- credit_rand[, -(c(6,13,20,16))]
filteredTraining <- filteredData[1:750, ]
filteredTest <- filteredData[751:1000, ]

Optimized Model

nb_model <- naive_bayes(Creditability ~ ., data=filteredTraining)
nb_model

## ================================ Naive Bayes ================================= 
## Call: 
## naive_bayes.formula(formula = Creditability ~ ., data = filteredTraining)
## 
## A priori probabilities: 
## 
##         0         1 
## 0.3146667 0.6853333 
## 
## Tables: 
##                
## Account Balance        0        1
##            mean 1.923729 2.793774
##            sd   1.036826 1.252008
## 
##                           
## Duration of Credit (month)        0        1
##                       mean 24.46610 19.20039
##                       sd   13.82208 11.13433
## 
##                                  
## Payment Status of Previous Credit        0        1
##                              mean 2.161017 2.665370
##                              sd   1.071649 1.045219
## 
##        
## Purpose        0        1
##    mean 2.927966 2.803502
##    sd   2.944722 2.633253
## 
##                     
## Value Savings/Stocks        0        1
##                 mean 1.711864 2.334630
##                 sd   1.340700 1.674510
## 
## # ... and 11 more tables

filteredTestPred <- predict(nb_model, newdata = filteredTest)
table(filteredTestPred, filteredTest$Creditability)

##                 
## filteredTestPred   0   1
##                0  43  37
##                1  21 149

conf_nat <- table(filteredTestPred, filteredTest$Creditability)
conf_nat

##                 
## filteredTestPred   0   1
##                0  43  37
##                1  21 149

Accuracy <- sum(diag(conf_nat))/sum(conf_nat)*100
Accuracy

## [1] 76.8

News Popularity Dataset

newsShort <- read_csv("OnlineNewsPopularity.csv")%>%
  select("n_tokens_title", "n_tokens_content", "n_unique_tokens", "n_non_stop_words", "num_hrefs", "num_imgs", "num_videos", "average_token_length", "num_keywords", "kw_max_max", "global_sentiment_polarity", "avg_positive_polarity", "title_subjectivity", "title_sentiment_polarity", "abs_title_subjectivity", "abs_title_sentiment_polarity", "shares")

## Parsed with column specification:
## cols(
##   .default = col_double(),
##   url = col_character()
## )

## See spec(...) for full column specifications.

Data Pre-Processing

newsShort <- newsShort%>%
  mutate(popular=if_else((shares >= 1400),1,0))%>%
  select(-shares)
newsShort$popular <- as.factor(newsShort$popular)
glimpse(newsShort)

## Observations: 39,644
## Variables: 17
## $ n_tokens_title               <dbl> 12, 9, 9, 9, 13, 10, 8, 12, 11, 1...
## $ n_tokens_content             <dbl> 219, 255, 211, 531, 1072, 370, 96...
## $ n_unique_tokens              <dbl> 0.6635945, 0.6047431, 0.5751295, ...
## $ n_non_stop_words             <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
## $ num_hrefs                    <dbl> 4, 3, 3, 9, 19, 2, 21, 20, 2, 4, ...
## $ num_imgs                     <dbl> 1, 1, 1, 1, 20, 0, 20, 20, 0, 1, ...
## $ num_videos                   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ...
## $ average_token_length         <dbl> 4.680365, 4.913725, 4.393365, 4.4...
## $ num_keywords                 <dbl> 5, 4, 6, 7, 7, 9, 10, 9, 7, 5, 8,...
## $ kw_max_max                   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ global_sentiment_polarity    <dbl> 0.09256198, 0.14894781, 0.3233333...
## $ avg_positive_polarity        <dbl> 0.3786364, 0.2869146, 0.4958333, ...
## $ title_subjectivity           <dbl> 0.5000000, 0.0000000, 0.0000000, ...
## $ title_sentiment_polarity     <dbl> -0.1875000, 0.0000000, 0.0000000,...
## $ abs_title_subjectivity       <dbl> 0.00000000, 0.50000000, 0.5000000...
## $ abs_title_sentiment_polarity <dbl> 0.1875000, 0.0000000, 0.0000000, ...
## $ popular                      <fct> 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, ...

news_rand <- newsShort[order(runif(10000)), ]
set.seed(12345)

#Split the data into training and test datasets
news_train <- news_rand[1:9000, ]
news_test <- news_rand[9001:10000, ]

Full Model

nb_model <- naive_bayes(popular ~ ., data=news_train)
nb_model

## ================================ Naive Bayes ================================= 
## Call: 
## naive_bayes.formula(formula = popular ~ ., data = news_train)
## 
## A priori probabilities: 
## 
##         0         1 
## 0.4291111 0.5708889 
## 
## Tables: 
##               
## n_tokens_title        0        1
##           mean 9.820559 9.695991
##           sd   1.929249 1.987754
## 
##                 
## n_tokens_content        0        1
##             mean 452.2315 515.1051
##             sd   347.1779 450.0206
## 
##                
## n_unique_tokens         0         1
##            mean 0.5702437 0.5542023
##            sd   0.1127776 0.1232687
## 
##                 
## n_non_stop_words          0          1
##             mean 0.99404453 0.99124172
##             sd   0.07695147 0.09318398
## 
##          
## num_hrefs         0         1
##      mean  9.147851 10.570650
##      sd    8.644083 11.540711
## 
## # ... and 11 more tables

Create Prediction

news_Pred <- predict(nb_model, newdata = news_test)
conf_nat <- table(news_Pred, news_test$popular)
conf_nat

##          
## news_Pred   0   1
##         0 329 400
##         1 101 170

Accuracy <- sum(diag(conf_nat))/sum(conf_nat)*100
Accuracy

## [1] 49.9

Not great.

Optimization

To optimize the model, we will look how we can remove variables which are correlated with each other and remove the highly correlated ones withoput affecting the model.

newsDataScaled <- scale(news_rand[,0:(ncol(news_rand)-1)], center=TRUE, scale = TRUE)
m <- cor(newsDataScaled)
highlycor <- findCorrelation(m, 0.30)
highlycor

## [1]  3 16  2  4 12 13

These are the indices of the variables that are highly correlated with each other. Below, we run a correlation of these variables with the dependent variables

#check how the above variables are correlated with the dependent variable
check <- news_rand%>%select(3,16,2,4,12,13,17)
check$popular<-as.numeric(check$popular)
cor(check)

##                              n_unique_tokens abs_title_sentiment_polarity
## n_unique_tokens                  1.000000000                 -0.004251095
## abs_title_sentiment_polarity    -0.004251095                  1.000000000
## n_tokens_content                -0.626662198                  0.011187585
## n_non_stop_words                 0.417037438                 -0.023983298
## avg_positive_polarity            0.154500778                  0.140369050
## title_subjectivity               0.025234677                  0.725455997
## popular                         -0.062992819                  0.026636966
##                              n_tokens_content n_non_stop_words
## n_unique_tokens                  -0.626662198       0.41703744
## abs_title_sentiment_polarity      0.011187585      -0.02398330
## n_tokens_content                  1.000000000       0.10552159
## n_non_stop_words                  0.105521587       1.00000000
## avg_positive_polarity             0.078679331       0.34982582
## title_subjectivity               -0.009127765      -0.03605004
## popular                           0.067211377      -0.01946947
##                              avg_positive_polarity title_subjectivity
## n_unique_tokens                        0.154500778        0.025234677
## abs_title_sentiment_polarity           0.140369050        0.725455997
## n_tokens_content                       0.078679331       -0.009127765
## n_non_stop_words                       0.349825816       -0.036050041
## avg_positive_polarity                  1.000000000        0.081716910
## title_subjectivity                     0.081716910        1.000000000
## popular                                0.008526717        0.018061939
##                                   popular
## n_unique_tokens              -0.062992819
## abs_title_sentiment_polarity  0.026636966
## n_tokens_content              0.067211377
## n_non_stop_words             -0.019469471
## avg_positive_polarity         0.008526717
## title_subjectivity            0.018061939
## popular                       1.000000000

findCorrelation(m,0.6)

## [1]  3 16  4

Below, we create a filtered dataset by disselecting the varaibles that are highly likely to create high pairwise correlation, applied trial & error basis.

filteredData <- news_rand%>%select(-n_unique_tokens,-n_non_stop_words,-abs_title_sentiment_polarity,-num_keywords)
filteredTraining <- filteredData[1:750, ]
filteredTest <- filteredData[751:1000, ]

Optimized Model

nb_model <- naive_bayes(popular ~ ., data=filteredTraining)
nb_model

## ================================ Naive Bayes ================================= 
## Call: 
## naive_bayes.formula(formula = popular ~ ., data = filteredTraining)
## 
## A priori probabilities: 
## 
##         0         1 
## 0.4173333 0.5826667 
## 
## Tables: 
##               
## n_tokens_title        0        1
##           mean 9.616613 9.704805
##           sd   1.903096 2.013099
## 
##                 
## n_tokens_content        0        1
##             mean 475.8658 524.9130
##             sd   359.3243 447.9918
## 
##          
## num_hrefs         0         1
##      mean  9.092652 11.226545
##      sd    8.875260 11.738411
## 
##         
## num_imgs        0        1
##     mean 3.539936 3.951945
##     sd   6.977471 8.279242
## 
##           
## num_videos        0        1
##       mean 1.300319 1.212815
##       sd   5.538261 4.883789
## 
## # ... and 7 more tables

filteredTestPred <- predict(nb_model, newdata = filteredTest)
table(filteredTestPred, filteredTest$popular)

##                 
## filteredTestPred  0  1
##                0 86 83
##                1 26 55

tab <- table(filteredTestPred, filteredTest$popular)
caret::confusionMatrix(tab)

## Confusion Matrix and Statistics
## 
##                 
## filteredTestPred  0  1
##                0 86 83
##                1 26 55
##                                           
##                Accuracy : 0.564           
##                  95% CI : (0.5001, 0.6264)
##     No Information Rate : 0.552           
##     P-Value [Acc > NIR] : 0.3761          
##                                           
##                   Kappa : 0.1588          
##  Mcnemar's Test P-Value : 8.148e-08       
##                                           
##             Sensitivity : 0.7679          
##             Specificity : 0.3986          
##          Pos Pred Value : 0.5089          
##          Neg Pred Value : 0.6790          
##              Prevalence : 0.4480          
##          Detection Rate : 0.3440          
##    Detection Prevalence : 0.6760          
##       Balanced Accuracy : 0.5832          
##                                           
##        'Positive' Class : 0               
##

conf_nat <- table(filteredTestPred, filteredTest$popular)
conf_nat

##                 
## filteredTestPred  0  1
##                0 86 83
##                1 26 55

Accuracy <- sum(diag(conf_nat))/sum(conf_nat)*100
Accuracy

## [1] 56.4

Accuracy is better.