NaiveBayes

# The Naive Bayes Method
library(caret)

## Warning: package 'caret' was built under R version 4.3.3

## Loading required package: ggplot2

## Warning: package 'ggplot2' was built under R version 4.3.3

## Loading required package: lattice

# It is named after the Reverend Thomas Bayes (1702–1761). 
# Because of Exact Bayes method is not always practical, we use the Naive Bayes method,
# that is a good approximation of the Exact Bayes.

# Let's use the WestRoxbury dataset as an example.

d <- read.csv("WestRoxbury.csv", header = TRUE)    # load the data into memory
View(d)

# 1) Since the value of the house is numerical, we create a categorical
#    variable showing which houses are "expensive", meaning >= $600k

# 2) We see that there are many numerical but discrete variables. We
#    convert them to factor type so that Bayesian function can use them
#    as categories.

# 3) Year built variable converted to categories in a simple method,
#    yielding categories of 18, 19, 20.

# data cleaning and fixing missing values

summary(d$YR.BUILT)    # there is a typo, year built is zero!

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       0    1920    1935    1937    1955    2011

d = d[-1493, ]         # fix this!

# Create two categories using total value column. We will try to predict
# a house if expensive or normal priced.

d$TOTAL.VALUE.CAT = as.factor(ifelse(d$TOTAL.VALUE>= 600, "Expensive", "Normal"))

d$YR.BUILT = factor(round(d$YR.BUILT/100))   # categorical years

# Other categories created
d$FLOORS = factor(d$FLOORS)
d$ROOMS = factor(d$ROOMS)
d$FIREPLACE = factor(d$FIREPLACE)
d$REMODEL = factor(d$REMODEL)

# Let's use the categorical columns to create our model:
str(d)

## 'data.frame':    5801 obs. of  15 variables:
##  $ TOTAL.VALUE    : num  344 413 330 499 332 ...
##  $ TAX            : int  4330 5190 4152 6272 4170 4244 4521 4030 4195 5150 ...
##  $ LOT.SQFT       : int  9965 6590 7500 13773 5000 5142 5000 10000 6835 5093 ...
##  $ YR.BUILT       : Factor w/ 3 levels "18","19","20": 2 2 2 3 2 3 3 3 3 2 ...
##  $ GROSS.AREA     : int  2436 3108 2294 5032 2370 2124 3220 2208 2582 4818 ...
##  $ LIVING.AREA    : int  1352 1976 1371 2608 1438 1060 1916 1200 1092 2992 ...
##  $ FLOORS         : Factor w/ 5 levels "1","1.5","2",..: 3 3 3 1 3 1 3 1 1 3 ...
##  $ ROOMS          : Factor w/ 12 levels "3","4","5","6",..: 4 8 6 7 5 4 5 4 3 6 ...
##  $ BEDROOMS       : int  3 4 4 5 3 3 3 3 3 4 ...
##  $ FULL.BATH      : int  1 2 1 1 2 1 1 1 1 2 ...
##  $ HALF.BATH      : int  1 1 1 1 0 0 1 0 0 0 ...
##  $ KITCHEN        : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ FIREPLACE      : Factor w/ 5 levels "0","1","2","3",..: 1 1 1 2 1 2 1 1 2 1 ...
##  $ REMODEL        : Factor w/ 3 levels "None","Old","Recent": 1 3 1 1 1 2 1 1 3 1 ...
##  $ TOTAL.VALUE.CAT: Factor w/ 2 levels "Expensive","Normal": 2 2 2 2 2 2 2 2 2 2 ...

# YR.BUILT, FLOORS , ROOMS, FIREPLACE, REMODEL, TOTAL.VALUE.CAT
selected.var <- c(4,7,8,13,14,15)

dim(d)

## [1] 5801   15

# Partition 60% to 40%
set.seed(123)
train.index <- sample(c(1:dim(d)[1]), dim(d)[1]*0.6)  
train.df <- d[train.index, selected.var]
valid.df <- d[-train.index, selected.var]

# We will use the naiveBayes() from the e1071 library - install the package if you don't have it
library(e1071)

## Warning: package 'e1071' was built under R version 4.3.3

d.nb = naiveBayes(TOTAL.VALUE.CAT ~ . , data = train.df)

d.nb

## 
## Naive Bayes Classifier for Discrete Predictors
## 
## Call:
## naiveBayes.default(x = X, y = Y, laplace = laplace)
## 
## A-priori probabilities:
## Y
## Expensive    Normal 
## 0.0433908 0.9566092 
## 
## Conditional probabilities:
##            YR.BUILT
## Y                    18          19          20
##   Expensive 0.013245033 0.735099338 0.251655629
##   Normal    0.002703515 0.626614599 0.370681886
## 
##            FLOORS
## Y                    1        1.5          2        2.5          3
##   Expensive 0.02649007 0.02649007 0.78807947 0.15894040 0.00000000
##   Normal    0.26524482 0.14268549 0.57885251 0.01321718 0.00000000
## 
##            ROOMS
## Y                      3            4            5            6            7
##   Expensive 0.0000000000 0.0000000000 0.0000000000 0.0000000000 0.1258278146
##   Normal    0.0000000000 0.0126164013 0.1060378492 0.2997897266 0.3103033944
##            ROOMS
## Y                      8            9           10           11           12
##   Expensive 0.1456953642 0.2715231788 0.2119205298 0.1258278146 0.0993377483
##   Normal    0.1673175128 0.0615800541 0.0297386603 0.0078101532 0.0033042956
##            ROOMS
## Y                     13           14
##   Expensive 0.0132450331 0.0066225166
##   Normal    0.0009011715 0.0006007810
## 
##            FIREPLACE
## Y                     0           1           2           3           4
##   Expensive 0.119205298 0.549668874 0.291390728 0.026490066 0.013245033
##   Normal    0.343947131 0.622108741 0.031541003 0.001802343 0.000600781
## 
##            REMODEL
## Y                None       Old    Recent
##   Expensive 0.5364238 0.1125828 0.3509934
##   Normal    0.7557825 0.1030339 0.1411835

# Target's categories in the training (ie. A-priori probabilities) :
table(train.df$TOTAL.VALUE.CAT)/dim(train.df)[1]

## 
## Expensive    Normal 
## 0.0433908 0.9566092

# Now let's apply naive Bayes model for prediction

# predict probabilities on the validation set
# type is "raw" meaning probability values is requested.

str(d$TOTAL.VALUE.CAT)

##  Factor w/ 2 levels "Expensive","Normal": 2 2 2 2 2 2 2 2 2 2 ...

pred.prob = predict(d.nb, newdata = valid.df, type = "raw")    # calculate probability of "Expensive" and "Normal"

pred.class = predict(d.nb, newdata = valid.df, type = "class")  # Calculate predicted classes in the validation

results = data.frame(actual = valid.df$TOTAL.VALUE.CAT, predicted=pred.class, probability = pred.prob )

View(results)

# confusion matrix on the training partition
pred.class <- predict(d.nb, newdata = train.df,type = "class")
confusionMatrix(pred.class, train.df$TOTAL.VALUE.CAT, positive = "Expensive")

## Confusion Matrix and Statistics
## 
##            Reference
## Prediction  Expensive Normal
##   Expensive        65     59
##   Normal           86   3270
##                                           
##                Accuracy : 0.9583          
##                  95% CI : (0.9512, 0.9647)
##     No Information Rate : 0.9566          
##     P-Value [Acc > NIR] : 0.32724         
##                                           
##                   Kappa : 0.4513          
##                                           
##  Mcnemar's Test P-Value : 0.03084         
##                                           
##             Sensitivity : 0.43046         
##             Specificity : 0.98228         
##          Pos Pred Value : 0.52419         
##          Neg Pred Value : 0.97437         
##              Prevalence : 0.04339         
##          Detection Rate : 0.01868         
##    Detection Prevalence : 0.03563         
##       Balanced Accuracy : 0.70637         
##                                           
##        'Positive' Class : Expensive       
##

# Accuracy : 0.9595  
# Sensitivity : 0.44304         
# Specificity : 0.98405 

table(d$TOTAL.VALUE.CAT)/dim(d)[1]

## 
##  Expensive     Normal 
## 0.04188933 0.95811067

# confusion matrix on the validation partition
pred.class = predict(d.nb, newdata = valid.df, type = "class")  # Calculate predicted classes in the validation
confusionMatrix(pred.class, valid.df$TOTAL.VALUE.CAT, positive = "Expensive")

## Confusion Matrix and Statistics
## 
##            Reference
## Prediction  Expensive Normal
##   Expensive        48     50
##   Normal           44   2179
##                                           
##                Accuracy : 0.9595          
##                  95% CI : (0.9507, 0.9672)
##     No Information Rate : 0.9604          
##     P-Value [Acc > NIR] : 0.6107          
##                                           
##                   Kappa : 0.4842          
##                                           
##  Mcnemar's Test P-Value : 0.6061          
##                                           
##             Sensitivity : 0.52174         
##             Specificity : 0.97757         
##          Pos Pred Value : 0.48980         
##          Neg Pred Value : 0.98021         
##              Prevalence : 0.03964         
##          Detection Rate : 0.02068         
##    Detection Prevalence : 0.04222         
##       Balanced Accuracy : 0.74965         
##                                           
##        'Positive' Class : Expensive       
##

# Accuracy : 0.9599   
# Sensitivity : 0.49412 
# Specificity : 0.97764 

# Changing the cutoff from 0.5 to 0.75

# confusion matrix on the validation partition using cutoff=0.75 for positive category

pred.prob = predict(d.nb, newdata = valid.df, type = "raw")   # get propensities
pred.prob = data.frame(pred.prob)

pred.class75 = as.factor(ifelse(pred.prob$Expensive >= 0.75, "Expensive","Normal" ))
confusionMatrix(pred.class75, valid.df$TOTAL.VALUE.CAT, positive = "Expensive")

## Confusion Matrix and Statistics
## 
##            Reference
## Prediction  Expensive Normal
##   Expensive        31     27
##   Normal           61   2202
##                                           
##                Accuracy : 0.9621          
##                  95% CI : (0.9535, 0.9695)
##     No Information Rate : 0.9604          
##     P-Value [Acc > NIR] : 0.3600997       
##                                           
##                   Kappa : 0.3948          
##                                           
##  Mcnemar's Test P-Value : 0.0004351       
##                                           
##             Sensitivity : 0.33696         
##             Specificity : 0.98789         
##          Pos Pred Value : 0.53448         
##          Neg Pred Value : 0.97304         
##              Prevalence : 0.03964         
##          Detection Rate : 0.01336         
##    Detection Prevalence : 0.02499         
##       Balanced Accuracy : 0.66242         
##                                           
##        'Positive' Class : Expensive       
##

# Changing the cutoff from 0.5 to 0.25

pred.class = as.factor(ifelse(pred.prob$Expensive >= 0.25, "Expensive","Normal" ))
confusionMatrix(pred.class, valid.df$TOTAL.VALUE.CAT, positive = "Expensive")

## Confusion Matrix and Statistics
## 
##            Reference
## Prediction  Expensive Normal
##   Expensive        58     93
##   Normal           34   2136
##                                           
##                Accuracy : 0.9453          
##                  95% CI : (0.9352, 0.9542)
##     No Information Rate : 0.9604          
##     P-Value [Acc > NIR] : 0.9998          
##                                           
##                   Kappa : 0.4503          
##                                           
##  Mcnemar's Test P-Value : 2.652e-07       
##                                           
##             Sensitivity : 0.63043         
##             Specificity : 0.95828         
##          Pos Pred Value : 0.38411         
##          Neg Pred Value : 0.98433         
##              Prevalence : 0.03964         
##          Detection Rate : 0.02499         
##    Detection Prevalence : 0.06506         
##       Balanced Accuracy : 0.79436         
##                                           
##        'Positive' Class : Expensive       
##

# Gain/Lift Chart

# first option with 'caret' library:
library(caret)

pred.prob = predict(d.nb, newdata = valid.df, type = "raw")   # get propensities
pred.prob = data.frame(pred.prob)

mylift <- lift(relevel(valid.df$TOTAL.VALUE.CAT, ref="Expensive") ~ pred.prob$Expensive)
xyplot(mylift, plot = "gain")

# ROC
library(pROC)

## Warning: package 'pROC' was built under R version 4.3.3

## Type 'citation("pROC")' for a citation.

## 
## Attaching package: 'pROC'

## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var

results = data.frame(actual = valid.df$TOTAL.VALUE.CAT, predicted=pred.class, probability = pred.prob )

myroc <- roc(results$actual , results$probability.Expensive, levels = c("Expensive","Normal"))

## Setting direction: controls > cases

plot(myroc, main = "ROC curve for the model",
       col = "blue", lwd = 2, legacy.axes = TRUE)    # if add = TRUE, you can add more charts on top of each

auc(myroc)

## Area under the curve: 0.9396

NaiveBayes

Satish Kumar Reddy Kalli

2025-01-08