# clear-up the environment
rm(list = ls())
# chunk options
knitr::opts_chunk$set(
message = FALSE,
fig.align = "center",
comment = "#>")
options(scipen = 9999)
# library
library(tidyverse)## -- Attaching core tidyverse packages ------------------------ tidyverse 2.0.0 --
## v dplyr 1.1.0 v readr 2.1.4
## v forcats 1.0.0 v stringr 1.5.0
## v ggplot2 3.4.1 v tibble 3.1.8
## v lubridate 1.9.2 v tidyr 1.3.0
## v purrr 1.0.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
## i Use the ]8;;http://conflicted.r-lib.org/conflicted package]8;; to force all conflicts to become errors
library(caret)## Loading required package: lattice
##
## Attaching package: 'caret'
##
## The following object is masked from 'package:purrr':
##
## lift
library(plotly)##
## Attaching package: 'plotly'
##
## The following object is masked from 'package:ggplot2':
##
## last_plot
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following object is masked from 'package:graphics':
##
## layout
library(data.table)##
## Attaching package: 'data.table'
##
## The following objects are masked from 'package:lubridate':
##
## hour, isoweek, mday, minute, month, quarter, second, wday, week,
## yday, year
##
## The following objects are masked from 'package:dplyr':
##
## between, first, last
##
## The following object is masked from 'package:purrr':
##
## transpose
library(GGally)## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
library(tidymodels)## -- Attaching packages -------------------------------------- tidymodels 1.0.0 --
## v broom 1.0.3 v rsample 1.1.1
## v dials 1.1.0 v tune 1.0.1
## v infer 1.0.4 v workflows 1.1.3
## v modeldata 1.1.0 v workflowsets 1.0.0
## v parsnip 1.0.4 v yardstick 1.1.0
## v recipes 1.0.5
## -- Conflicts ----------------------------------------- tidymodels_conflicts() --
## x data.table::between() masks dplyr::between()
## x scales::discard() masks purrr::discard()
## x plotly::filter() masks dplyr::filter(), stats::filter()
## x data.table::first() masks dplyr::first()
## x recipes::fixed() masks stringr::fixed()
## x dplyr::lag() masks stats::lag()
## x data.table::last() masks dplyr::last()
## x caret::lift() masks purrr::lift()
## x yardstick::precision() masks caret::precision()
## x yardstick::recall() masks caret::recall()
## x yardstick::sensitivity() masks caret::sensitivity()
## x yardstick::spec() masks readr::spec()
## x yardstick::specificity() masks caret::specificity()
## x recipes::step() masks stats::step()
## x data.table::transpose() masks purrr::transpose()
## * Search for functions across packages at https://www.tidymodels.org/find/
library(car)## Loading required package: carData
##
## Attaching package: 'car'
##
## The following object is masked from 'package:dplyr':
##
## recode
##
## The following object is masked from 'package:purrr':
##
## some
library(scales)
library(lmtest)## Loading required package: zoo
##
## Attaching package: 'zoo'
##
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
library(dplyr)store <- read.csv("data_input/superstore_data.csv")
glimpse(store)#> Rows: 2,240
#> Columns: 22
#> $ Id <int> 1826, 1, 10476, 1386, 5371, 7348, 4073, 1991, 4047~
#> $ Year_Birth <int> 1970, 1961, 1958, 1967, 1989, 1958, 1954, 1967, 19~
#> $ Education <chr> "Graduation", "Graduation", "Graduation", "Graduat~
#> $ Marital_Status <chr> "Divorced", "Single", "Married", "Together", "Sing~
#> $ Income <int> 84835, 57091, 67267, 32474, 21474, 71691, 63564, 4~
#> $ Kidhome <int> 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,~
#> $ Teenhome <int> 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1,~
#> $ Dt_Customer <chr> "6/16/2014", "6/15/2014", "5/13/2014", "11/5/2014"~
#> $ Recency <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,~
#> $ MntWines <int> 189, 464, 134, 10, 6, 336, 769, 78, 384, 384, 450,~
#> $ MntFruits <int> 104, 5, 11, 0, 16, 130, 80, 0, 0, 0, 26, 4, 82, 10~
#> $ MntMeatProducts <int> 379, 64, 59, 1, 24, 411, 252, 11, 102, 102, 535, 6~
#> $ MntFishProducts <int> 111, 7, 15, 0, 11, 240, 15, 0, 21, 21, 73, 0, 80, ~
#> $ MntSweetProducts <int> 189, 0, 2, 0, 0, 32, 34, 0, 32, 32, 98, 13, 20, 16~
#> $ MntGoldProds <int> 218, 37, 30, 0, 34, 43, 65, 7, 5, 5, 26, 4, 102, 3~
#> $ NumDealsPurchases <int> 1, 1, 1, 1, 2, 1, 1, 1, 3, 3, 1, 2, 1, 1, 0, 4, 4,~
#> $ NumWebPurchases <int> 4, 7, 3, 1, 3, 4, 10, 2, 6, 6, 5, 3, 3, 1, 25, 2, ~
#> $ NumCatalogPurchases <int> 4, 3, 2, 0, 1, 7, 10, 1, 2, 2, 6, 1, 6, 1, 0, 1, 1~
#> $ NumStorePurchases <int> 6, 7, 5, 2, 2, 5, 7, 3, 9, 9, 10, 6, 6, 2, 0, 5, 5~
#> $ NumWebVisitsMonth <int> 1, 5, 2, 7, 7, 2, 6, 5, 4, 4, 1, 4, 1, 6, 1, 4, 4,~
#> $ Response <int> 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1,~
#> $ Complain <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,~
anyNA(store)#> [1] TRUE
colSums(is.na(x = store))#> Id Year_Birth Education Marital_Status
#> 0 0 0 0
#> Income Kidhome Teenhome Dt_Customer
#> 24 0 0 0
#> Recency MntWines MntFruits MntMeatProducts
#> 0 0 0 0
#> MntFishProducts MntSweetProducts MntGoldProds NumDealsPurchases
#> 0 0 0 0
#> NumWebPurchases NumCatalogPurchases NumStorePurchases NumWebVisitsMonth
#> 0 0 0 0
#> Response Complain
#> 0 0
Store <- store %>% filter(!is.na(Income))
colSums(is.na(x = Store))#> Id Year_Birth Education Marital_Status
#> 0 0 0 0
#> Income Kidhome Teenhome Dt_Customer
#> 0 0 0 0
#> Recency MntWines MntFruits MntMeatProducts
#> 0 0 0 0
#> MntFishProducts MntSweetProducts MntGoldProds NumDealsPurchases
#> 0 0 0 0
#> NumWebPurchases NumCatalogPurchases NumStorePurchases NumWebVisitsMonth
#> 0 0 0 0
#> Response Complain
#> 0 0
store_Clean <- store %>%
mutate(Dt_Customer = mdy(store$Dt_Customer)) %>%
mutate_at(vars(Education, Marital_Status, Kidhome, Teenhome, Response, Complain), as.factor)
# cek tipe data setelah diubah
str(store_Clean)#> 'data.frame': 2240 obs. of 22 variables:
#> $ Id : int 1826 1 10476 1386 5371 7348 4073 1991 4047 9477 ...
#> $ Year_Birth : int 1970 1961 1958 1967 1989 1958 1954 1967 1954 1954 ...
#> $ Education : Factor w/ 5 levels "2n Cycle","Basic",..: 3 3 3 3 3 5 1 3 5 5 ...
#> $ Marital_Status : Factor w/ 8 levels "Absurd","Alone",..: 3 5 4 6 5 5 4 6 4 4 ...
#> $ Income : int 84835 57091 67267 32474 21474 71691 63564 44931 65324 65324 ...
#> $ Kidhome : Factor w/ 3 levels "0","1","2": 1 1 1 2 2 1 1 1 1 1 ...
#> $ Teenhome : Factor w/ 3 levels "0","1","2": 1 1 2 2 1 1 1 2 2 2 ...
#> $ Dt_Customer : Date, format: "2014-06-16" "2014-06-15" ...
#> $ Recency : int 0 0 0 0 0 0 0 0 0 0 ...
#> $ MntWines : int 189 464 134 10 6 336 769 78 384 384 ...
#> $ MntFruits : int 104 5 11 0 16 130 80 0 0 0 ...
#> $ MntMeatProducts : int 379 64 59 1 24 411 252 11 102 102 ...
#> $ MntFishProducts : int 111 7 15 0 11 240 15 0 21 21 ...
#> $ MntSweetProducts : int 189 0 2 0 0 32 34 0 32 32 ...
#> $ MntGoldProds : int 218 37 30 0 34 43 65 7 5 5 ...
#> $ NumDealsPurchases : int 1 1 1 1 2 1 1 1 3 3 ...
#> $ NumWebPurchases : int 4 7 3 1 3 4 10 2 6 6 ...
#> $ NumCatalogPurchases: int 4 3 2 0 1 7 10 1 2 2 ...
#> $ NumStorePurchases : int 6 7 5 2 2 5 7 3 9 9 ...
#> $ NumWebVisitsMonth : int 1 5 2 7 7 2 6 5 4 4 ...
#> $ Response : Factor w/ 2 levels "0","1": 2 2 1 1 2 2 2 1 1 1 ...
#> $ Complain : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
Business Question/problem sederhana: Ingin memprediksi pembeli akan complain atau tidak berdasarkan status Penikahan nya
library(e1071)
model_bayes <- naiveBayes(formula = Complain ~ Marital_Status ,
data = store_Clean)
model_bayes#>
#> Naive Bayes Classifier for Discrete Predictors
#>
#> Call:
#> naiveBayes.default(x = X, y = Y, laplace = laplace)
#>
#> A-priori probabilities:
#> Y
#> 0 1
#> 0.990625 0.009375
#>
#> Conditional probabilities:
#> Marital_Status
#> Y Absurd Alone Divorced Married Single
#> 0 0.0009013069 0.0013519603 0.1036502929 0.3857593511 0.2136097341
#> 1 0.0000000000 0.0000000000 0.0952380952 0.3809523810 0.2857142857
#> Marital_Status
#> Y Together Widow YOLO
#> 0 0.2591257323 0.0347003155 0.0009013069
#> 1 0.2380952381 0.0000000000 0.0000000000
table(store_Clean$Complain)#>
#> 0 1
#> 2219 21
table(Y = store_Clean$Complain, Marital_Status = store_Clean$Marital_Status) %>%
prop.table(margin = 2)#> Marital_Status
#> Y Absurd Alone Divorced Married Single Together
#> 0 1.000000000 1.000000000 0.991379310 0.990740741 0.987500000 0.991379310
#> 1 0.000000000 0.000000000 0.008620690 0.009259259 0.012500000 0.008620690
#> Marital_Status
#> Y Widow YOLO
#> 0 1.000000000 1.000000000
#> 1 0.000000000 0.000000000
predict(model_bayes,
newdata = data.frame(Marital_Status = "Married"),
type = "raw")#> 0 1
#> [1,] 0.9907407 0.009259259
#RNGkind(sample.kind = "Rounding")
set.seed(100)
split_store <- sample(nrow(store_Clean), nrow(store_Clean)*0.80)
store_train <- store_Clean[split_store, ]
store_test <- store_Clean[-split_store, ]store_bayes <- naiveBayes(formula = Marital_Status ~.,
data = store_train,
laplace = 1)store_pred <- predict(store_bayes,
newdata = store_test,
type = "class") confusionMatrix(data = store_pred,
reference = store_test$Marital_Status,
positive = "Married")#> Confusion Matrix and Statistics
#>
#> Reference
#> Prediction Absurd Alone Divorced Married Single Together Widow YOLO
#> Absurd 0 0 0 1 0 1 0 0
#> Alone 0 0 18 69 38 50 5 1
#> Divorced 0 0 2 16 4 10 4 0
#> Married 0 0 6 25 14 24 2 0
#> Single 0 0 1 10 9 11 1 0
#> Together 0 0 1 6 3 9 1 0
#> Widow 0 0 13 38 22 29 4 0
#> YOLO 0 0 0 0 0 0 0 0
#>
#> Overall Statistics
#>
#> Accuracy : 0.1094
#> 95% CI : (0.082, 0.142)
#> No Information Rate : 0.3683
#> P-Value [Acc > NIR] : 1
#>
#> Kappa : 0.0078
#>
#> Mcnemar's Test P-Value : NA
#>
#> Statistics by Class:
#>
#> Class: Absurd Class: Alone Class: Divorced Class: Married
#> Sensitivity NA NA 0.048780 0.1515
#> Specificity 0.995536 0.596 0.916462 0.8375
#> Pos Pred Value NA NA 0.055556 0.3521
#> Neg Pred Value NA NA 0.905340 0.6286
#> Prevalence 0.000000 0.000 0.091518 0.3683
#> Detection Rate 0.000000 0.000 0.004464 0.0558
#> Detection Prevalence 0.004464 0.404 0.080357 0.1585
#> Balanced Accuracy NA NA 0.482621 0.4945
#> Class: Single Class: Together Class: Widow Class: YOLO
#> Sensitivity 0.10000 0.06716 0.235294 0.000000
#> Specificity 0.93575 0.96497 0.763341 1.000000
#> Pos Pred Value 0.28125 0.45000 0.037736 NaN
#> Neg Pred Value 0.80529 0.70794 0.961988 0.997768
#> Prevalence 0.20089 0.29911 0.037946 0.002232
#> Detection Rate 0.02009 0.02009 0.008929 0.000000
#> Detection Prevalence 0.07143 0.04464 0.236607 0.000000
#> Balanced Accuracy 0.51788 0.51607 0.499318 0.500000
RNGkind(sample.kind = "Rounding")#> Warning in RNGkind(sample.kind = "Rounding"): non-uniform 'Rounding' sampler
#> used
set.seed(13)
votes_train_up <- upSample(x = store_train %>% select(-Marital_Status),
y = store_train$Marital_Status,
yname = "Marital_Status")
votes_train_upvotes_bayes_up <- naiveBayes(Marital_Status~., data = store_test, laplace = 1)
votes_predict_up <- predict(store_bayes, store_test)
confusionMatrix(store_pred,
store_test$Marital_Status,
positive = "Married")#> Confusion Matrix and Statistics
#>
#> Reference
#> Prediction Absurd Alone Divorced Married Single Together Widow YOLO
#> Absurd 0 0 0 1 0 1 0 0
#> Alone 0 0 18 69 38 50 5 1
#> Divorced 0 0 2 16 4 10 4 0
#> Married 0 0 6 25 14 24 2 0
#> Single 0 0 1 10 9 11 1 0
#> Together 0 0 1 6 3 9 1 0
#> Widow 0 0 13 38 22 29 4 0
#> YOLO 0 0 0 0 0 0 0 0
#>
#> Overall Statistics
#>
#> Accuracy : 0.1094
#> 95% CI : (0.082, 0.142)
#> No Information Rate : 0.3683
#> P-Value [Acc > NIR] : 1
#>
#> Kappa : 0.0078
#>
#> Mcnemar's Test P-Value : NA
#>
#> Statistics by Class:
#>
#> Class: Absurd Class: Alone Class: Divorced Class: Married
#> Sensitivity NA NA 0.048780 0.1515
#> Specificity 0.995536 0.596 0.916462 0.8375
#> Pos Pred Value NA NA 0.055556 0.3521
#> Neg Pred Value NA NA 0.905340 0.6286
#> Prevalence 0.000000 0.000 0.091518 0.3683
#> Detection Rate 0.000000 0.000 0.004464 0.0558
#> Detection Prevalence 0.004464 0.404 0.080357 0.1585
#> Balanced Accuracy NA NA 0.482621 0.4945
#> Class: Single Class: Together Class: Widow Class: YOLO
#> Sensitivity 0.10000 0.06716 0.235294 0.000000
#> Specificity 0.93575 0.96497 0.763341 1.000000
#> Pos Pred Value 0.28125 0.45000 0.037736 NaN
#> Neg Pred Value 0.80529 0.70794 0.961988 0.997768
#> Prevalence 0.20089 0.29911 0.037946 0.002232
#> Detection Rate 0.02009 0.02009 0.008929 0.000000
#> Detection Prevalence 0.07143 0.04464 0.236607 0.000000
#> Balanced Accuracy 0.51788 0.51607 0.499318 0.500000
Accuracy di data tidak diupsampling: 0.1183
Accuracy di data diupsampling: 0.1183
predict(model_bayes,
newdata = data.frame(Complain = "1"),
type = "raw")#> 0 1
#> [1,] 0.990625 0.009375
#RNGkind(sample.kind = "Rounding")
set.seed(100)
store_train <- downSample(x = store_train %>% select(-Complain), # mengambil data prediktor saja
y = store_train$Complain, # mengambil data target
yname = "Complain")store_train$Complain %>%
table() %>%
prop.table()#> .
#> 0 1
#> 0.5 0.5
library(partykit)
store_tree <- ctree(formula = Complain ~ Education + Kidhome+ Teenhome+ Response + Marital_Status ,
data = store_train)
plot(store_tree,
type = "simple") store_tree_complex <- ctree( Complain ~ Education + Kidhome+ Teenhome+ Response + Marital_Status, store_train,
control = ctree_control(mincriterion = 0.001,
minsplit = 0,
minbucket = 0))
plot(store_tree_complex, type = "simple")