# Load the prospective car-buyer data set from the working directory and
# inspect its structure (column names, types and the first few values).
# read.csv() already uses "," as its field separator, so it is not repeated.
data <- read.csv(file = "calonpembelimobil.csv")
str(data)
## 'data.frame': 1000 obs. of 7 variables:
## $ ID : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Usia : int 32 49 52 26 45 39 38 29 30 51 ...
## $ Status : int 1 2 1 2 3 2 2 1 2 1 ...
## $ Kelamin : int 0 1 0 1 0 0 1 1 0 1 ...
## $ Memiliki_Mobil: int 0 1 2 1 2 1 0 0 0 0 ...
## $ Penghasilan : int 240 100 250 130 237 280 150 143 200 174 ...
## $ Beli_Mobil : int 1 0 1 0 1 1 0 0 1 0 ...
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(caret)
## Warning: package 'caret' was built under R version 4.4.3
## Loading required package: ggplot2
## Loading required package: lattice
library(e1071)
library(ggplot2)
library(mice)
## Warning: package 'mice' was built under R version 4.4.3
##
## Attaching package: 'mice'
## The following object is masked from 'package:stats':
##
## filter
## The following objects are masked from 'package:base':
##
## cbind, rbind
library(caTools)
## Warning: package 'caTools' was built under R version 4.4.3
# Build the target label: "beli" when Beli_Mobil is 1, otherwise "tidak".
# factor() on a character vector sorts its levels, so the level order is
# c("beli", "tidak").
data <- mutate(
  data,
  beli = factor(ifelse(Beli_Mobil == 1, "beli", "tidak"))
)
# Per-column summary statistics.
# NOTE(review): the recorded output reports a maximum Usia of 164, which
# looks like a data-entry error -- confirm and clean before modelling.
summary(data)
## ID Usia Status Kelamin
## Min. : 1.0 Min. : 24.00 Min. :0.000 Min. :0.000
## 1st Qu.: 250.8 1st Qu.: 33.00 1st Qu.:1.000 1st Qu.:0.000
## Median : 500.5 Median : 43.00 Median :1.000 Median :0.000
## Mean : 500.5 Mean : 43.53 Mean :1.469 Mean :0.481
## 3rd Qu.: 750.2 3rd Qu.: 53.00 3rd Qu.:2.000 3rd Qu.:1.000
## Max. :1000.0 Max. :164.00 Max. :3.000 Max. :1.000
## Memiliki_Mobil Penghasilan Beli_Mobil beli
## Min. :0.000 Min. : 95.0 Min. :0.000 beli :633
## 1st Qu.:0.000 1st Qu.:187.0 1st Qu.:0.000 tidak:367
## Median :1.000 Median :258.5 Median :1.000
## Mean :0.952 Mean :270.1 Mean :0.633
## 3rd Qu.:2.000 3rd Qu.:352.2 3rd Qu.:1.000
## Max. :4.000 Max. :490.0 Max. :1.000
# Class balance of the target label.
table(data[["beli"]])
##
## beli tidak
## 633 367
# Train/test split: 80% of the rows go to training, the rest to testing.
# sample.split() is given the label vector, and the seed is fixed so the
# partition (and every result derived from it) is reproducible.
set.seed(123)
split <- sample.split(data$beli, SplitRatio = 0.8)
training_set <- subset(data, split)
test_set <- subset(data, !split)
dim(training_set)
## [1] 800 8
dim(test_set)
## [1] 200 8
# Drop the row identifier by name rather than by column position, so the
# selection does not silently change if the column order ever does.
# NOTE(review): this frame still contains Beli_Mobil and beli (the target);
# confirm that is intended before passing it to a model that uses all columns.
topredict_set <- test_set[setdiff(names(test_set), "ID")]
dim(topredict_set)
## [1] 200 7
# Fit a Naive Bayes classifier for the purchase label on the training rows.
# (A second summary(data) call used to sit here; `data` has not changed since
# the first summary, so the duplicate was removed.)
# NOTE(review): Status, Kelamin and Memiliki_Mobil are integer-coded, so
# naiveBayes() handles them as numeric predictors rather than categories --
# consider converting them to factors and re-evaluating the model.
model_nb <- naiveBayes(
  beli ~ Usia + Status + Kelamin + Memiliki_Mobil + Penghasilan,
  data = training_set
)
summary(model_nb)
## Length Class Mode
## apriori 2 table numeric
## tables 5 -none- list
## levels 2 -none- character
## isnumeric 5 -none- logical
## call 4 -none- call
# Predict a class label for every row of the held-out test set.
# type = "class" is the default for naiveBayes predictions; it is spelled out
# here so it is clear that labels, not probabilities, are returned.
prediksi <- predict(object = model_nb, newdata = test_set, type = "class")
head(prediksi)
## [1] beli tidak tidak tidak tidak tidak
## Levels: beli tidak
# Compare predictions against the true labels. "beli" is the first factor
# level and therefore already the positive class; naming it makes that explicit.
confusionMatrix(data = prediksi, reference = test_set$beli, positive = "beli")
## Confusion Matrix and Statistics
##
## Reference
## Prediction beli tidak
## beli 113 7
## tidak 14 66
##
## Accuracy : 0.895
## 95% CI : (0.844, 0.9338)
## No Information Rate : 0.635
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.778
##
## Mcnemar's Test P-Value : 0.1904
##
## Sensitivity : 0.8898
## Specificity : 0.9041
## Pos Pred Value : 0.9417
## Neg Pred Value : 0.8250
## Prevalence : 0.6350
## Detection Rate : 0.5650
## Detection Prevalence : 0.6000
## Balanced Accuracy : 0.8969
##
## 'Positive' Class : beli
##