# The Naive Bayes Method
library(caret)
## Warning: package 'caret' was built under R version 4.3.3
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.3.3
## Loading required package: lattice
# It is named after the Reverend Thomas Bayes (1702–1761).
# Because of Exact Bayes method is not always practical, we use the Naive Bayes method,
# that is a good approximation of the Exact Bayes.
# Let's use the WestRoxbury dataset as an example.
d <- read.csv("WestRoxbury.csv", header = TRUE) # load the data into memory
View(d)
# 1) Since the value of the house is numerical, we create a categorical
# variable showing which houses are "expensive", meaning >= $600k
# 2) We see that there are many numerical but discrete variables. We
# convert them to factor type so that Bayesian function can use them
# as categories.
# 3) Year built variable converted to categories in a simple method,
# yielding categories of 18, 19, 20.
# data cleaning and fixing missing values
summary(d$YR.BUILT) # there is a typo, year built is zero!
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 1920 1935 1937 1955 2011
d = d[-1493, ] # fix this!
# Create two categories using total value column. We will try to predict
# a house if expensive or normal priced.
d$TOTAL.VALUE.CAT = as.factor(ifelse(d$TOTAL.VALUE>= 600, "Expensive", "Normal"))
d$YR.BUILT = factor(round(d$YR.BUILT/100)) # categorical years
# Other categories created
d$FLOORS = factor(d$FLOORS)
d$ROOMS = factor(d$ROOMS)
d$FIREPLACE = factor(d$FIREPLACE)
d$REMODEL = factor(d$REMODEL)
# Let's use the categorical columns to create our model:
str(d)
## 'data.frame': 5801 obs. of 15 variables:
## $ TOTAL.VALUE : num 344 413 330 499 332 ...
## $ TAX : int 4330 5190 4152 6272 4170 4244 4521 4030 4195 5150 ...
## $ LOT.SQFT : int 9965 6590 7500 13773 5000 5142 5000 10000 6835 5093 ...
## $ YR.BUILT : Factor w/ 3 levels "18","19","20": 2 2 2 3 2 3 3 3 3 2 ...
## $ GROSS.AREA : int 2436 3108 2294 5032 2370 2124 3220 2208 2582 4818 ...
## $ LIVING.AREA : int 1352 1976 1371 2608 1438 1060 1916 1200 1092 2992 ...
## $ FLOORS : Factor w/ 5 levels "1","1.5","2",..: 3 3 3 1 3 1 3 1 1 3 ...
## $ ROOMS : Factor w/ 12 levels "3","4","5","6",..: 4 8 6 7 5 4 5 4 3 6 ...
## $ BEDROOMS : int 3 4 4 5 3 3 3 3 3 4 ...
## $ FULL.BATH : int 1 2 1 1 2 1 1 1 1 2 ...
## $ HALF.BATH : int 1 1 1 1 0 0 1 0 0 0 ...
## $ KITCHEN : int 1 1 1 1 1 1 1 1 1 1 ...
## $ FIREPLACE : Factor w/ 5 levels "0","1","2","3",..: 1 1 1 2 1 2 1 1 2 1 ...
## $ REMODEL : Factor w/ 3 levels "None","Old","Recent": 1 3 1 1 1 2 1 1 3 1 ...
## $ TOTAL.VALUE.CAT: Factor w/ 2 levels "Expensive","Normal": 2 2 2 2 2 2 2 2 2 2 ...
# YR.BUILT, FLOORS , ROOMS, FIREPLACE, REMODEL, TOTAL.VALUE.CAT
selected.var <- c(4,7,8,13,14,15)
dim(d)
## [1] 5801 15
# Partition 60% to 40%
set.seed(123)
train.index <- sample(c(1:dim(d)[1]), dim(d)[1]*0.6)
train.df <- d[train.index, selected.var]
valid.df <- d[-train.index, selected.var]
# We will use the naiveBayes() from the e1071 library - install the package if you don't have it
library(e1071)
## Warning: package 'e1071' was built under R version 4.3.3
d.nb = naiveBayes(TOTAL.VALUE.CAT ~ . , data = train.df)
d.nb
##
## Naive Bayes Classifier for Discrete Predictors
##
## Call:
## naiveBayes.default(x = X, y = Y, laplace = laplace)
##
## A-priori probabilities:
## Y
## Expensive Normal
## 0.0433908 0.9566092
##
## Conditional probabilities:
## YR.BUILT
## Y 18 19 20
## Expensive 0.013245033 0.735099338 0.251655629
## Normal 0.002703515 0.626614599 0.370681886
##
## FLOORS
## Y 1 1.5 2 2.5 3
## Expensive 0.02649007 0.02649007 0.78807947 0.15894040 0.00000000
## Normal 0.26524482 0.14268549 0.57885251 0.01321718 0.00000000
##
## ROOMS
## Y 3 4 5 6 7
## Expensive 0.0000000000 0.0000000000 0.0000000000 0.0000000000 0.1258278146
## Normal 0.0000000000 0.0126164013 0.1060378492 0.2997897266 0.3103033944
## ROOMS
## Y 8 9 10 11 12
## Expensive 0.1456953642 0.2715231788 0.2119205298 0.1258278146 0.0993377483
## Normal 0.1673175128 0.0615800541 0.0297386603 0.0078101532 0.0033042956
## ROOMS
## Y 13 14
## Expensive 0.0132450331 0.0066225166
## Normal 0.0009011715 0.0006007810
##
## FIREPLACE
## Y 0 1 2 3 4
## Expensive 0.119205298 0.549668874 0.291390728 0.026490066 0.013245033
## Normal 0.343947131 0.622108741 0.031541003 0.001802343 0.000600781
##
## REMODEL
## Y None Old Recent
## Expensive 0.5364238 0.1125828 0.3509934
## Normal 0.7557825 0.1030339 0.1411835
# Target's categories in the training (ie. A-priori probabilities) :
table(train.df$TOTAL.VALUE.CAT)/dim(train.df)[1]
##
## Expensive Normal
## 0.0433908 0.9566092
# Now let's apply naive Bayes model for prediction
# predict probabilities on the validation set
# type is "raw" meaning probability values is requested.
str(d$TOTAL.VALUE.CAT)
## Factor w/ 2 levels "Expensive","Normal": 2 2 2 2 2 2 2 2 2 2 ...
pred.prob = predict(d.nb, newdata = valid.df, type = "raw") # calculate probability of "Expensive" and "Normal"
pred.class = predict(d.nb, newdata = valid.df, type = "class") # Calculate predicted classes in the validation
results = data.frame(actual = valid.df$TOTAL.VALUE.CAT, predicted=pred.class, probability = pred.prob )
View(results)
# confusion matrix on the training partition
pred.class <- predict(d.nb, newdata = train.df,type = "class")
confusionMatrix(pred.class, train.df$TOTAL.VALUE.CAT, positive = "Expensive")
## Confusion Matrix and Statistics
##
## Reference
## Prediction Expensive Normal
## Expensive 65 59
## Normal 86 3270
##
## Accuracy : 0.9583
## 95% CI : (0.9512, 0.9647)
## No Information Rate : 0.9566
## P-Value [Acc > NIR] : 0.32724
##
## Kappa : 0.4513
##
## Mcnemar's Test P-Value : 0.03084
##
## Sensitivity : 0.43046
## Specificity : 0.98228
## Pos Pred Value : 0.52419
## Neg Pred Value : 0.97437
## Prevalence : 0.04339
## Detection Rate : 0.01868
## Detection Prevalence : 0.03563
## Balanced Accuracy : 0.70637
##
## 'Positive' Class : Expensive
##
# Accuracy : 0.9595
# Sensitivity : 0.44304
# Specificity : 0.98405
table(d$TOTAL.VALUE.CAT)/dim(d)[1]
##
## Expensive Normal
## 0.04188933 0.95811067
# confusion matrix on the validation partition
pred.class = predict(d.nb, newdata = valid.df, type = "class") # Calculate predicted classes in the validation
confusionMatrix(pred.class, valid.df$TOTAL.VALUE.CAT, positive = "Expensive")
## Confusion Matrix and Statistics
##
## Reference
## Prediction Expensive Normal
## Expensive 48 50
## Normal 44 2179
##
## Accuracy : 0.9595
## 95% CI : (0.9507, 0.9672)
## No Information Rate : 0.9604
## P-Value [Acc > NIR] : 0.6107
##
## Kappa : 0.4842
##
## Mcnemar's Test P-Value : 0.6061
##
## Sensitivity : 0.52174
## Specificity : 0.97757
## Pos Pred Value : 0.48980
## Neg Pred Value : 0.98021
## Prevalence : 0.03964
## Detection Rate : 0.02068
## Detection Prevalence : 0.04222
## Balanced Accuracy : 0.74965
##
## 'Positive' Class : Expensive
##
# Accuracy : 0.9599
# Sensitivity : 0.49412
# Specificity : 0.97764
# Changing the cutoff from 0.5 to 0.75
# confusion matrix on the validation partition using cutoff=0.75 for positive category
pred.prob = predict(d.nb, newdata = valid.df, type = "raw") # get propensities
pred.prob = data.frame(pred.prob)
pred.class75 = as.factor(ifelse(pred.prob$Expensive >= 0.75, "Expensive","Normal" ))
confusionMatrix(pred.class75, valid.df$TOTAL.VALUE.CAT, positive = "Expensive")
## Confusion Matrix and Statistics
##
## Reference
## Prediction Expensive Normal
## Expensive 31 27
## Normal 61 2202
##
## Accuracy : 0.9621
## 95% CI : (0.9535, 0.9695)
## No Information Rate : 0.9604
## P-Value [Acc > NIR] : 0.3600997
##
## Kappa : 0.3948
##
## Mcnemar's Test P-Value : 0.0004351
##
## Sensitivity : 0.33696
## Specificity : 0.98789
## Pos Pred Value : 0.53448
## Neg Pred Value : 0.97304
## Prevalence : 0.03964
## Detection Rate : 0.01336
## Detection Prevalence : 0.02499
## Balanced Accuracy : 0.66242
##
## 'Positive' Class : Expensive
##
# Changing the cutoff from 0.5 to 0.25
pred.class = as.factor(ifelse(pred.prob$Expensive >= 0.25, "Expensive","Normal" ))
confusionMatrix(pred.class, valid.df$TOTAL.VALUE.CAT, positive = "Expensive")
## Confusion Matrix and Statistics
##
## Reference
## Prediction Expensive Normal
## Expensive 58 93
## Normal 34 2136
##
## Accuracy : 0.9453
## 95% CI : (0.9352, 0.9542)
## No Information Rate : 0.9604
## P-Value [Acc > NIR] : 0.9998
##
## Kappa : 0.4503
##
## Mcnemar's Test P-Value : 2.652e-07
##
## Sensitivity : 0.63043
## Specificity : 0.95828
## Pos Pred Value : 0.38411
## Neg Pred Value : 0.98433
## Prevalence : 0.03964
## Detection Rate : 0.02499
## Detection Prevalence : 0.06506
## Balanced Accuracy : 0.79436
##
## 'Positive' Class : Expensive
##
# Gain/Lift Chart
# first option with 'caret' library:
library(caret)
pred.prob = predict(d.nb, newdata = valid.df, type = "raw") # get propensities
pred.prob = data.frame(pred.prob)
mylift <- lift(relevel(valid.df$TOTAL.VALUE.CAT, ref="Expensive") ~ pred.prob$Expensive)
xyplot(mylift, plot = "gain")

# ROC
library(pROC)
## Warning: package 'pROC' was built under R version 4.3.3
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
results = data.frame(actual = valid.df$TOTAL.VALUE.CAT, predicted=pred.class, probability = pred.prob )
myroc <- roc(results$actual , results$probability.Expensive, levels = c("Expensive","Normal"))
## Setting direction: controls > cases
plot(myroc, main = "ROC curve for the model",
col = "blue", lwd = 2, legacy.axes = TRUE) # if add = TRUE, you can add more charts on top of each

auc(myroc)
## Area under the curve: 0.9396