# clear-up the environment
rm(list = ls())

# chunk options
knitr::opts_chunk$set(
  message = FALSE,
  fig.align = "center",
  comment = "#>")
options(scipen = 9999)

# library
library(tidyverse)
## -- Attaching core tidyverse packages ------------------------ tidyverse 2.0.0 --
## v dplyr     1.1.0     v readr     2.1.4
## v forcats   1.0.0     v stringr   1.5.0
## v ggplot2   3.4.1     v tibble    3.1.8
## v lubridate 1.9.2     v tidyr     1.3.0
## v purrr     1.0.1     
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
## i Use the ]8;;http://conflicted.r-lib.org/conflicted package]8;; to force all conflicts to become errors
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## 
## The following object is masked from 'package:purrr':
## 
##     lift
library(plotly)
## 
## Attaching package: 'plotly'
## 
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following object is masked from 'package:graphics':
## 
##     layout
library(data.table)
## 
## Attaching package: 'data.table'
## 
## The following objects are masked from 'package:lubridate':
## 
##     hour, isoweek, mday, minute, month, quarter, second, wday, week,
##     yday, year
## 
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last
## 
## The following object is masked from 'package:purrr':
## 
##     transpose
library(GGally)
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
library(tidymodels)
## -- Attaching packages -------------------------------------- tidymodels 1.0.0 --
## v broom        1.0.3     v rsample      1.1.1
## v dials        1.1.0     v tune         1.0.1
## v infer        1.0.4     v workflows    1.1.3
## v modeldata    1.1.0     v workflowsets 1.0.0
## v parsnip      1.0.4     v yardstick    1.1.0
## v recipes      1.0.5     
## -- Conflicts ----------------------------------------- tidymodels_conflicts() --
## x data.table::between()    masks dplyr::between()
## x scales::discard()        masks purrr::discard()
## x plotly::filter()         masks dplyr::filter(), stats::filter()
## x data.table::first()      masks dplyr::first()
## x recipes::fixed()         masks stringr::fixed()
## x dplyr::lag()             masks stats::lag()
## x data.table::last()       masks dplyr::last()
## x caret::lift()            masks purrr::lift()
## x yardstick::precision()   masks caret::precision()
## x yardstick::recall()      masks caret::recall()
## x yardstick::sensitivity() masks caret::sensitivity()
## x yardstick::spec()        masks readr::spec()
## x yardstick::specificity() masks caret::specificity()
## x recipes::step()          masks stats::step()
## x data.table::transpose()  masks purrr::transpose()
## * Search for functions across packages at https://www.tidymodels.org/find/
library(car)
## Loading required package: carData
## 
## Attaching package: 'car'
## 
## The following object is masked from 'package:dplyr':
## 
##     recode
## 
## The following object is masked from 'package:purrr':
## 
##     some
library(scales)
library(lmtest)
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## 
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
library(dplyr)

Baca Data

store <- read.csv("data_input/superstore_data.csv")
glimpse(store)
#> Rows: 2,240
#> Columns: 22
#> $ Id                  <int> 1826, 1, 10476, 1386, 5371, 7348, 4073, 1991, 4047~
#> $ Year_Birth          <int> 1970, 1961, 1958, 1967, 1989, 1958, 1954, 1967, 19~
#> $ Education           <chr> "Graduation", "Graduation", "Graduation", "Graduat~
#> $ Marital_Status      <chr> "Divorced", "Single", "Married", "Together", "Sing~
#> $ Income              <int> 84835, 57091, 67267, 32474, 21474, 71691, 63564, 4~
#> $ Kidhome             <int> 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,~
#> $ Teenhome            <int> 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1,~
#> $ Dt_Customer         <chr> "6/16/2014", "6/15/2014", "5/13/2014", "11/5/2014"~
#> $ Recency             <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,~
#> $ MntWines            <int> 189, 464, 134, 10, 6, 336, 769, 78, 384, 384, 450,~
#> $ MntFruits           <int> 104, 5, 11, 0, 16, 130, 80, 0, 0, 0, 26, 4, 82, 10~
#> $ MntMeatProducts     <int> 379, 64, 59, 1, 24, 411, 252, 11, 102, 102, 535, 6~
#> $ MntFishProducts     <int> 111, 7, 15, 0, 11, 240, 15, 0, 21, 21, 73, 0, 80, ~
#> $ MntSweetProducts    <int> 189, 0, 2, 0, 0, 32, 34, 0, 32, 32, 98, 13, 20, 16~
#> $ MntGoldProds        <int> 218, 37, 30, 0, 34, 43, 65, 7, 5, 5, 26, 4, 102, 3~
#> $ NumDealsPurchases   <int> 1, 1, 1, 1, 2, 1, 1, 1, 3, 3, 1, 2, 1, 1, 0, 4, 4,~
#> $ NumWebPurchases     <int> 4, 7, 3, 1, 3, 4, 10, 2, 6, 6, 5, 3, 3, 1, 25, 2, ~
#> $ NumCatalogPurchases <int> 4, 3, 2, 0, 1, 7, 10, 1, 2, 2, 6, 1, 6, 1, 0, 1, 1~
#> $ NumStorePurchases   <int> 6, 7, 5, 2, 2, 5, 7, 3, 9, 9, 10, 6, 6, 2, 0, 5, 5~
#> $ NumWebVisitsMonth   <int> 1, 5, 2, 7, 7, 2, 6, 5, 4, 4, 1, 4, 1, 6, 1, 4, 4,~
#> $ Response            <int> 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1,~
#> $ Complain            <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,~

Check Missing Value

anyNA(store)
#> [1] TRUE
colSums(is.na(x = store))
#>                  Id          Year_Birth           Education      Marital_Status 
#>                   0                   0                   0                   0 
#>              Income             Kidhome            Teenhome         Dt_Customer 
#>                  24                   0                   0                   0 
#>             Recency            MntWines           MntFruits     MntMeatProducts 
#>                   0                   0                   0                   0 
#>     MntFishProducts    MntSweetProducts        MntGoldProds   NumDealsPurchases 
#>                   0                   0                   0                   0 
#>     NumWebPurchases NumCatalogPurchases   NumStorePurchases   NumWebVisitsMonth 
#>                   0                   0                   0                   0 
#>            Response            Complain 
#>                   0                   0
Store <- store %>% filter(!is.na(Income))
colSums(is.na(x = Store))
#>                  Id          Year_Birth           Education      Marital_Status 
#>                   0                   0                   0                   0 
#>              Income             Kidhome            Teenhome         Dt_Customer 
#>                   0                   0                   0                   0 
#>             Recency            MntWines           MntFruits     MntMeatProducts 
#>                   0                   0                   0                   0 
#>     MntFishProducts    MntSweetProducts        MntGoldProds   NumDealsPurchases 
#>                   0                   0                   0                   0 
#>     NumWebPurchases NumCatalogPurchases   NumStorePurchases   NumWebVisitsMonth 
#>                   0                   0                   0                   0 
#>            Response            Complain 
#>                   0                   0

Data Wrangling

store_Clean <- store %>% 
  mutate(Dt_Customer = mdy(store$Dt_Customer)) %>% 
  mutate_at(vars(Education, Marital_Status, Kidhome, Teenhome, Response, Complain), as.factor)
# cek tipe data setelah diubah
str(store_Clean)
#> 'data.frame':    2240 obs. of  22 variables:
#>  $ Id                 : int  1826 1 10476 1386 5371 7348 4073 1991 4047 9477 ...
#>  $ Year_Birth         : int  1970 1961 1958 1967 1989 1958 1954 1967 1954 1954 ...
#>  $ Education          : Factor w/ 5 levels "2n Cycle","Basic",..: 3 3 3 3 3 5 1 3 5 5 ...
#>  $ Marital_Status     : Factor w/ 8 levels "Absurd","Alone",..: 3 5 4 6 5 5 4 6 4 4 ...
#>  $ Income             : int  84835 57091 67267 32474 21474 71691 63564 44931 65324 65324 ...
#>  $ Kidhome            : Factor w/ 3 levels "0","1","2": 1 1 1 2 2 1 1 1 1 1 ...
#>  $ Teenhome           : Factor w/ 3 levels "0","1","2": 1 1 2 2 1 1 1 2 2 2 ...
#>  $ Dt_Customer        : Date, format: "2014-06-16" "2014-06-15" ...
#>  $ Recency            : int  0 0 0 0 0 0 0 0 0 0 ...
#>  $ MntWines           : int  189 464 134 10 6 336 769 78 384 384 ...
#>  $ MntFruits          : int  104 5 11 0 16 130 80 0 0 0 ...
#>  $ MntMeatProducts    : int  379 64 59 1 24 411 252 11 102 102 ...
#>  $ MntFishProducts    : int  111 7 15 0 11 240 15 0 21 21 ...
#>  $ MntSweetProducts   : int  189 0 2 0 0 32 34 0 32 32 ...
#>  $ MntGoldProds       : int  218 37 30 0 34 43 65 7 5 5 ...
#>  $ NumDealsPurchases  : int  1 1 1 1 2 1 1 1 3 3 ...
#>  $ NumWebPurchases    : int  4 7 3 1 3 4 10 2 6 6 ...
#>  $ NumCatalogPurchases: int  4 3 2 0 1 7 10 1 2 2 ...
#>  $ NumStorePurchases  : int  6 7 5 2 2 5 7 3 9 9 ...
#>  $ NumWebVisitsMonth  : int  1 5 2 7 7 2 6 5 4 4 ...
#>  $ Response           : Factor w/ 2 levels "0","1": 2 2 1 1 2 2 2 1 1 1 ...
#>  $ Complain           : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...

Prediktor

Business Question/problem sederhana: Ingin memprediksi pembeli akan complain atau tidak berdasarkan status Penikahan nya

library(e1071)
model_bayes <- naiveBayes(formula = Complain  ~ Marital_Status ,
                          data = store_Clean)
model_bayes
#> 
#> Naive Bayes Classifier for Discrete Predictors
#> 
#> Call:
#> naiveBayes.default(x = X, y = Y, laplace = laplace)
#> 
#> A-priori probabilities:
#> Y
#>        0        1 
#> 0.990625 0.009375 
#> 
#> Conditional probabilities:
#>    Marital_Status
#> Y         Absurd        Alone     Divorced      Married       Single
#>   0 0.0009013069 0.0013519603 0.1036502929 0.3857593511 0.2136097341
#>   1 0.0000000000 0.0000000000 0.0952380952 0.3809523810 0.2857142857
#>    Marital_Status
#> Y       Together        Widow         YOLO
#>   0 0.2591257323 0.0347003155 0.0009013069
#>   1 0.2380952381 0.0000000000 0.0000000000
table(store_Clean$Complain)
#> 
#>    0    1 
#> 2219   21

Conditional probabilities

table(Y = store_Clean$Complain, Marital_Status = store_Clean$Marital_Status) %>% 
   prop.table(margin = 2)
#>    Marital_Status
#> Y        Absurd       Alone    Divorced     Married      Single    Together
#>   0 1.000000000 1.000000000 0.991379310 0.990740741 0.987500000 0.991379310
#>   1 0.000000000 0.000000000 0.008620690 0.009259259 0.012500000 0.008620690
#>    Marital_Status
#> Y         Widow        YOLO
#>   0 1.000000000 1.000000000
#>   1 0.000000000 0.000000000

Predict

predict(model_bayes, 
        newdata = data.frame(Marital_Status = "Married"), 
        type = "raw")
#>              0           1
#> [1,] 0.9907407 0.009259259

Crosvalidation

#RNGkind(sample.kind = "Rounding")  
set.seed(100) 
split_store <- sample(nrow(store_Clean), nrow(store_Clean)*0.80)
store_train <- store_Clean[split_store, ] 
store_test <- store_Clean[-split_store, ]

laplace

store_bayes <- naiveBayes(formula = Marital_Status ~.,
                          data = store_train,
                          laplace = 1)

predict data test

store_pred <- predict(store_bayes, 
                      newdata = store_test, 
                      type = "class") 

Evaluasi confusion matrix

confusionMatrix(data = store_pred,
                reference = store_test$Marital_Status,
                positive = "Married")
#> Confusion Matrix and Statistics
#> 
#>           Reference
#> Prediction Absurd Alone Divorced Married Single Together Widow YOLO
#>   Absurd        0     0        0       1      0        1     0    0
#>   Alone         0     0       18      69     38       50     5    1
#>   Divorced      0     0        2      16      4       10     4    0
#>   Married       0     0        6      25     14       24     2    0
#>   Single        0     0        1      10      9       11     1    0
#>   Together      0     0        1       6      3        9     1    0
#>   Widow         0     0       13      38     22       29     4    0
#>   YOLO          0     0        0       0      0        0     0    0
#> 
#> Overall Statistics
#>                                         
#>                Accuracy : 0.1094        
#>                  95% CI : (0.082, 0.142)
#>     No Information Rate : 0.3683        
#>     P-Value [Acc > NIR] : 1             
#>                                         
#>                   Kappa : 0.0078        
#>                                         
#>  Mcnemar's Test P-Value : NA            
#> 
#> Statistics by Class:
#> 
#>                      Class: Absurd Class: Alone Class: Divorced Class: Married
#> Sensitivity                     NA           NA        0.048780         0.1515
#> Specificity               0.995536        0.596        0.916462         0.8375
#> Pos Pred Value                  NA           NA        0.055556         0.3521
#> Neg Pred Value                  NA           NA        0.905340         0.6286
#> Prevalence                0.000000        0.000        0.091518         0.3683
#> Detection Rate            0.000000        0.000        0.004464         0.0558
#> Detection Prevalence      0.004464        0.404        0.080357         0.1585
#> Balanced Accuracy               NA           NA        0.482621         0.4945
#>                      Class: Single Class: Together Class: Widow Class: YOLO
#> Sensitivity                0.10000         0.06716     0.235294    0.000000
#> Specificity                0.93575         0.96497     0.763341    1.000000
#> Pos Pred Value             0.28125         0.45000     0.037736         NaN
#> Neg Pred Value             0.80529         0.70794     0.961988    0.997768
#> Prevalence                 0.20089         0.29911     0.037946    0.002232
#> Detection Rate             0.02009         0.02009     0.008929    0.000000
#> Detection Prevalence       0.07143         0.04464     0.236607    0.000000
#> Balanced Accuracy          0.51788         0.51607     0.499318    0.500000

Upsampling dan Downsampling

RNGkind(sample.kind = "Rounding")
#> Warning in RNGkind(sample.kind = "Rounding"): non-uniform 'Rounding' sampler
#> used
set.seed(13)

votes_train_up <- upSample(x = store_train %>% select(-Marital_Status), 
                           y = store_train$Marital_Status, 
                           yname = "Marital_Status")
votes_train_up

Upsamling buat ulang model

votes_bayes_up <- naiveBayes(Marital_Status~., data = store_test, laplace = 1)

votes_predict_up <- predict(store_bayes, store_test)

confusionMatrix(store_pred, 
                store_test$Marital_Status, 
                positive = "Married")
#> Confusion Matrix and Statistics
#> 
#>           Reference
#> Prediction Absurd Alone Divorced Married Single Together Widow YOLO
#>   Absurd        0     0        0       1      0        1     0    0
#>   Alone         0     0       18      69     38       50     5    1
#>   Divorced      0     0        2      16      4       10     4    0
#>   Married       0     0        6      25     14       24     2    0
#>   Single        0     0        1      10      9       11     1    0
#>   Together      0     0        1       6      3        9     1    0
#>   Widow         0     0       13      38     22       29     4    0
#>   YOLO          0     0        0       0      0        0     0    0
#> 
#> Overall Statistics
#>                                         
#>                Accuracy : 0.1094        
#>                  95% CI : (0.082, 0.142)
#>     No Information Rate : 0.3683        
#>     P-Value [Acc > NIR] : 1             
#>                                         
#>                   Kappa : 0.0078        
#>                                         
#>  Mcnemar's Test P-Value : NA            
#> 
#> Statistics by Class:
#> 
#>                      Class: Absurd Class: Alone Class: Divorced Class: Married
#> Sensitivity                     NA           NA        0.048780         0.1515
#> Specificity               0.995536        0.596        0.916462         0.8375
#> Pos Pred Value                  NA           NA        0.055556         0.3521
#> Neg Pred Value                  NA           NA        0.905340         0.6286
#> Prevalence                0.000000        0.000        0.091518         0.3683
#> Detection Rate            0.000000        0.000        0.004464         0.0558
#> Detection Prevalence      0.004464        0.404        0.080357         0.1585
#> Balanced Accuracy               NA           NA        0.482621         0.4945
#>                      Class: Single Class: Together Class: Widow Class: YOLO
#> Sensitivity                0.10000         0.06716     0.235294    0.000000
#> Specificity                0.93575         0.96497     0.763341    1.000000
#> Pos Pred Value             0.28125         0.45000     0.037736         NaN
#> Neg Pred Value             0.80529         0.70794     0.961988    0.997768
#> Prevalence                 0.20089         0.29911     0.037946    0.002232
#> Detection Rate             0.02009         0.02009     0.008929    0.000000
#> Detection Prevalence       0.07143         0.04464     0.236607    0.000000
#> Balanced Accuracy          0.51788         0.51607     0.499318    0.500000

Accuracy di data tidak diupsampling: 0.1183
Accuracy di data diupsampling: 0.1183

Pembuatan Model

Model Bayes

predict(model_bayes, 
        newdata = data.frame(Complain = "1"), 
        type = "raw")
#>             0        1
#> [1,] 0.990625 0.009375
#RNGkind(sample.kind = "Rounding")
set.seed(100)

store_train <- downSample(x = store_train %>% select(-Complain), # mengambil data prediktor saja
                         y = store_train$Complain, # mengambil data target
                         yname = "Complain")
store_train$Complain %>% 
   table() %>% 
   prop.table()
#> .
#>   0   1 
#> 0.5 0.5

Decision Tree

library(partykit)
store_tree <- ctree(formula = Complain ~ Education + Kidhome+ Teenhome+ Response + Marital_Status ,
                   data = store_train)
plot(store_tree, 
     type = "simple") 

Random Forest

store_tree_complex <- ctree( Complain ~ Education + Kidhome+ Teenhome+ Response + Marital_Status, store_train,
                               control = ctree_control(mincriterion = 0.001, 
                                            minsplit = 0,
                                            minbucket = 0))

plot(store_tree_complex, type = "simple")