# clear-up the environment
rm(list = ls())

# chunk options
knitr::opts_chunk$set(
  message = FALSE,
  fig.align = "center",
  comment = "#>")
options(scipen = 9999)

# library
library(tidyverse)

## -- Attaching core tidyverse packages ------------------------ tidyverse 2.0.0 --
## v dplyr     1.1.0     v readr     2.1.4
## v forcats   1.0.0     v stringr   1.5.0
## v ggplot2   3.4.1     v tibble    3.1.8
## v lubridate 1.9.2     v tidyr     1.3.0
## v purrr     1.0.1     
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
## i Use the ]8;;http://conflicted.r-lib.org/conflicted package]8;; to force all conflicts to become errors

library(caret)

## Loading required package: lattice
## 
## Attaching package: 'caret'
## 
## The following object is masked from 'package:purrr':
## 
##     lift

library(plotly)

## 
## Attaching package: 'plotly'
## 
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following object is masked from 'package:graphics':
## 
##     layout

library(data.table)

## 
## Attaching package: 'data.table'
## 
## The following objects are masked from 'package:lubridate':
## 
##     hour, isoweek, mday, minute, month, quarter, second, wday, week,
##     yday, year
## 
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last
## 
## The following object is masked from 'package:purrr':
## 
##     transpose

library(GGally)

## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2

library(tidymodels)

## -- Attaching packages -------------------------------------- tidymodels 1.0.0 --
## v broom        1.0.3     v rsample      1.1.1
## v dials        1.1.0     v tune         1.0.1
## v infer        1.0.4     v workflows    1.1.3
## v modeldata    1.1.0     v workflowsets 1.0.0
## v parsnip      1.0.4     v yardstick    1.1.0
## v recipes      1.0.5     
## -- Conflicts ----------------------------------------- tidymodels_conflicts() --
## x data.table::between()    masks dplyr::between()
## x scales::discard()        masks purrr::discard()
## x plotly::filter()         masks dplyr::filter(), stats::filter()
## x data.table::first()      masks dplyr::first()
## x recipes::fixed()         masks stringr::fixed()
## x dplyr::lag()             masks stats::lag()
## x data.table::last()       masks dplyr::last()
## x caret::lift()            masks purrr::lift()
## x yardstick::precision()   masks caret::precision()
## x yardstick::recall()      masks caret::recall()
## x yardstick::sensitivity() masks caret::sensitivity()
## x yardstick::spec()        masks readr::spec()
## x yardstick::specificity() masks caret::specificity()
## x recipes::step()          masks stats::step()
## x data.table::transpose()  masks purrr::transpose()
## * Use tidymodels_prefer() to resolve common conflicts.

library(car)

## Loading required package: carData
## 
## Attaching package: 'car'
## 
## The following object is masked from 'package:dplyr':
## 
##     recode
## 
## The following object is masked from 'package:purrr':
## 
##     some

library(scales)
library(lmtest)

## Loading required package: zoo
## 
## Attaching package: 'zoo'
## 
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric

library(dplyr)

Baca Data

store <- read.csv("data_input/superstore_data.csv")
glimpse(store)

#> Rows: 2,240
#> Columns: 22
#> $ Id                  <int> 1826, 1, 10476, 1386, 5371, 7348, 4073, 1991, 4047~
#> $ Year_Birth          <int> 1970, 1961, 1958, 1967, 1989, 1958, 1954, 1967, 19~
#> $ Education           <chr> "Graduation", "Graduation", "Graduation", "Graduat~
#> $ Marital_Status      <chr> "Divorced", "Single", "Married", "Together", "Sing~
#> $ Income              <int> 84835, 57091, 67267, 32474, 21474, 71691, 63564, 4~
#> $ Kidhome             <int> 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,~
#> $ Teenhome            <int> 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1,~
#> $ Dt_Customer         <chr> "6/16/2014", "6/15/2014", "5/13/2014", "11/5/2014"~
#> $ Recency             <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,~
#> $ MntWines            <int> 189, 464, 134, 10, 6, 336, 769, 78, 384, 384, 450,~
#> $ MntFruits           <int> 104, 5, 11, 0, 16, 130, 80, 0, 0, 0, 26, 4, 82, 10~
#> $ MntMeatProducts     <int> 379, 64, 59, 1, 24, 411, 252, 11, 102, 102, 535, 6~
#> $ MntFishProducts     <int> 111, 7, 15, 0, 11, 240, 15, 0, 21, 21, 73, 0, 80, ~
#> $ MntSweetProducts    <int> 189, 0, 2, 0, 0, 32, 34, 0, 32, 32, 98, 13, 20, 16~
#> $ MntGoldProds        <int> 218, 37, 30, 0, 34, 43, 65, 7, 5, 5, 26, 4, 102, 3~
#> $ NumDealsPurchases   <int> 1, 1, 1, 1, 2, 1, 1, 1, 3, 3, 1, 2, 1, 1, 0, 4, 4,~
#> $ NumWebPurchases     <int> 4, 7, 3, 1, 3, 4, 10, 2, 6, 6, 5, 3, 3, 1, 25, 2, ~
#> $ NumCatalogPurchases <int> 4, 3, 2, 0, 1, 7, 10, 1, 2, 2, 6, 1, 6, 1, 0, 1, 1~
#> $ NumStorePurchases   <int> 6, 7, 5, 2, 2, 5, 7, 3, 9, 9, 10, 6, 6, 2, 0, 5, 5~
#> $ NumWebVisitsMonth   <int> 1, 5, 2, 7, 7, 2, 6, 5, 4, 4, 1, 4, 1, 6, 1, 4, 4,~
#> $ Response            <int> 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1,~
#> $ Complain            <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,~

unique(store$Complain)

#> [1] 0 1

Check Missing Value

anyNA(store)

#> [1] TRUE

colSums(is.na(x = store))

#>                  Id          Year_Birth           Education      Marital_Status 
#>                   0                   0                   0                   0 
#>              Income             Kidhome            Teenhome         Dt_Customer 
#>                  24                   0                   0                   0 
#>             Recency            MntWines           MntFruits     MntMeatProducts 
#>                   0                   0                   0                   0 
#>     MntFishProducts    MntSweetProducts        MntGoldProds   NumDealsPurchases 
#>                   0                   0                   0                   0 
#>     NumWebPurchases NumCatalogPurchases   NumStorePurchases   NumWebVisitsMonth 
#>                   0                   0                   0                   0 
#>            Response            Complain 
#>                   0                   0

Store <- store %>% filter(!is.na(Income))
colSums(is.na(x = Store))

#>                  Id          Year_Birth           Education      Marital_Status 
#>                   0                   0                   0                   0 
#>              Income             Kidhome            Teenhome         Dt_Customer 
#>                   0                   0                   0                   0 
#>             Recency            MntWines           MntFruits     MntMeatProducts 
#>                   0                   0                   0                   0 
#>     MntFishProducts    MntSweetProducts        MntGoldProds   NumDealsPurchases 
#>                   0                   0                   0                   0 
#>     NumWebPurchases NumCatalogPurchases   NumStorePurchases   NumWebVisitsMonth 
#>                   0                   0                   0                   0 
#>            Response            Complain 
#>                   0                   0

Data Wrangling

store_Clean <- store %>% 
  mutate(Dt_Customer = mdy(store$Dt_Customer)) %>% 
  mutate_at(vars(Education, Marital_Status, Kidhome, Teenhome, Response, Complain), as.factor)
# cek tipe data setelah diubah
str(store_Clean)

#> 'data.frame':    2240 obs. of  22 variables:
#>  $ Id                 : int  1826 1 10476 1386 5371 7348 4073 1991 4047 9477 ...
#>  $ Year_Birth         : int  1970 1961 1958 1967 1989 1958 1954 1967 1954 1954 ...
#>  $ Education          : Factor w/ 5 levels "2n Cycle","Basic",..: 3 3 3 3 3 5 1 3 5 5 ...
#>  $ Marital_Status     : Factor w/ 8 levels "Absurd","Alone",..: 3 5 4 6 5 5 4 6 4 4 ...
#>  $ Income             : int  84835 57091 67267 32474 21474 71691 63564 44931 65324 65324 ...
#>  $ Kidhome            : Factor w/ 3 levels "0","1","2": 1 1 1 2 2 1 1 1 1 1 ...
#>  $ Teenhome           : Factor w/ 3 levels "0","1","2": 1 1 2 2 1 1 1 2 2 2 ...
#>  $ Dt_Customer        : Date, format: "2014-06-16" "2014-06-15" ...
#>  $ Recency            : int  0 0 0 0 0 0 0 0 0 0 ...
#>  $ MntWines           : int  189 464 134 10 6 336 769 78 384 384 ...
#>  $ MntFruits          : int  104 5 11 0 16 130 80 0 0 0 ...
#>  $ MntMeatProducts    : int  379 64 59 1 24 411 252 11 102 102 ...
#>  $ MntFishProducts    : int  111 7 15 0 11 240 15 0 21 21 ...
#>  $ MntSweetProducts   : int  189 0 2 0 0 32 34 0 32 32 ...
#>  $ MntGoldProds       : int  218 37 30 0 34 43 65 7 5 5 ...
#>  $ NumDealsPurchases  : int  1 1 1 1 2 1 1 1 3 3 ...
#>  $ NumWebPurchases    : int  4 7 3 1 3 4 10 2 6 6 ...
#>  $ NumCatalogPurchases: int  4 3 2 0 1 7 10 1 2 2 ...
#>  $ NumStorePurchases  : int  6 7 5 2 2 5 7 3 9 9 ...
#>  $ NumWebVisitsMonth  : int  1 5 2 7 7 2 6 5 4 4 ...
#>  $ Response           : Factor w/ 2 levels "0","1": 2 2 1 1 2 2 2 1 1 1 ...
#>  $ Complain           : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...

Build Model

Tanpa Prediktor

model_null <- glm(
  formula = Kidhome ~1,
  data = store_Clean,
  family = "binomial"
)
summary(model_null)

#> 
#> Call:
#> glm(formula = Kidhome ~ 1, family = "binomial", data = store_Clean)
#> 
#> Deviance Residuals: 
#>    Min      1Q  Median      3Q     Max  
#> -1.048  -1.048  -1.048   1.312   1.312  
#> 
#> Coefficients:
#>             Estimate Std. Error z value          Pr(>|z|)    
#> (Intercept) -0.31142    0.04277  -7.281 0.000000000000331 ***
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#> (Dispersion parameter for binomial family taken to be 1)
#> 
#>     Null deviance: 3051.6  on 2239  degrees of freedom
#> Residual deviance: 3051.6  on 2239  degrees of freedom
#> AIC: 3053.6
#> 
#> Number of Fisher Scoring iterations: 4

# probability
table(store_Clean$Kidhome)

#> 
#>    0    1    2 
#> 1293  899   48

ggcorr(store_Clean, label = T)

#> Warning in ggcorr(store_Clean, label = T): data in column(s) 'Education',
#> 'Marital_Status', 'Kidhome', 'Teenhome', 'Dt_Customer', 'Response', 'Complain'
#> are not numeric and were ignored

Cross Validation

#RNGkind(sample.kind = "Rounding")  
set.seed(147) 

index <- sample(x = nrow(store_Clean), size = nrow(store_Clean)*0.8)

store_train <- store_Clean[index,]
store_test <- store_Clean[-index,]

Modeling

model semua Prediktor

model_all <- glm(
  formula = Kidhome ~.,
  data = store_train,
  family = "binomial",
  control = list(trace=FALSE) # cek perubahan deviasi setiap iterasi
)

summary(model_all)

#> 
#> Call:
#> glm(formula = Kidhome ~ ., family = "binomial", data = store_train, 
#>     control = list(trace = FALSE))
#> 
#> Deviance Residuals: 
#>     Min       1Q   Median       3Q      Max  
#> -5.7269  -0.3463  -0.0577   0.5089   3.2178  
#> 
#> Coefficients:
#>                              Estimate     Std. Error z value
#> (Intercept)            -126.090853198  934.176003698  -0.135
#> Id                       -0.000006528    0.000022899  -0.285
#> Year_Birth                0.051598645    0.008098314   6.372
#> EducationBasic           -0.539361619    0.448277227  -1.203
#> EducationGraduation       0.426501181    0.275932928   1.546
#> EducationMaster           0.410366550    0.318140954   1.290
#> EducationPhD              0.295361858    0.314886074   0.938
#> Marital_StatusAlone      26.369082507 1167.302836920   0.023
#> Marital_StatusDivorced   11.887591719  934.021779576   0.013
#> Marital_StatusMarried    11.798742654  934.021757717   0.013
#> Marital_StatusSingle     11.643692285  934.021755511   0.012
#> Marital_StatusTogether   12.003778521  934.021753355   0.013
#> Marital_StatusWidow      11.573151463  934.021837338   0.012
#> Marital_StatusYOLO       -4.718215098 1729.329001586  -0.003
#> Income                    0.000036463    0.000008683   4.199
#> Teenhome1                -1.361870571    0.213149369  -6.389
#> Teenhome2                -1.115873563    0.513540170  -2.173
#> Dt_Customer               0.000726071    0.000369960   1.963
#> Recency                   0.000230072    0.002673722   0.086
#> MntWines                 -0.000910599    0.000529870  -1.719
#> MntFruits                 0.004701890    0.004329529   1.086
#> MntMeatProducts          -0.002381762    0.001169429  -2.037
#> MntFishProducts          -0.001141437    0.003452625  -0.331
#> MntSweetProducts         -0.006117263    0.004756681  -1.286
#> MntGoldProds             -0.009497921    0.002708254  -3.507
#> NumDealsPurchases         0.823886625    0.066846047  12.325
#> NumWebPurchases          -0.264167679    0.051977942  -5.082
#> NumCatalogPurchases      -0.432025030    0.077042374  -5.608
#> NumStorePurchases        -0.226591710    0.052426427  -4.322
#> NumWebVisitsMonth         0.216755329    0.052886110   4.099
#> Response1                 0.208398222    0.279832098   0.745
#> Complain1                 0.940319776    0.909165920   1.034
#>                                    Pr(>|z|)    
#> (Intercept)                        0.892631    
#> Id                                 0.775592    
#> Year_Birth                   0.000000000187 ***
#> EducationBasic                     0.228904    
#> EducationGraduation                0.122184    
#> EducationMaster                    0.197089    
#> EducationPhD                       0.348246    
#> Marital_StatusAlone                0.981978    
#> Marital_StatusDivorced             0.989845    
#> Marital_StatusMarried              0.989921    
#> Marital_StatusSingle               0.990054    
#> Marital_StatusTogether             0.989746    
#> Marital_StatusWidow                0.990114    
#> Marital_StatusYOLO                 0.997823    
#> Income                       0.000026770272 ***
#> Teenhome1                    0.000000000167 ***
#> Teenhome2                          0.029788 *  
#> Dt_Customer                        0.049697 *  
#> Recency                            0.931427    
#> MntWines                           0.085700 .  
#> MntFruits                          0.277477    
#> MntMeatProducts                    0.041681 *  
#> MntFishProducts                    0.740947    
#> MntSweetProducts                   0.198431    
#> MntGoldProds                       0.000453 ***
#> NumDealsPurchases      < 0.0000000000000002 ***
#> NumWebPurchases              0.000000372885 ***
#> NumCatalogPurchases          0.000000020512 ***
#> NumStorePurchases            0.000015455846 ***
#> NumWebVisitsMonth            0.000041578161 ***
#> Response1                          0.456437    
#> Complain1                          0.301012    
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#> (Dispersion parameter for binomial family taken to be 1)
#> 
#>     Null deviance: 2412.4  on 1769  degrees of freedom
#> Residual deviance: 1136.7  on 1738  degrees of freedom
#>   (22 observations deleted due to missingness)
#> AIC: 1200.7
#> 
#> Number of Fisher Scoring iterations: 14

model tanpa prdiktor

model_nopred <- glm(
  formula = Kidhome~1,
  data = store_train,
  family = "binomial",
  control = list(trace=FALSE) # cek perubahan deviasi setiap iterasi
)

summary(model_nopred)

#> 
#> Call:
#> glm(formula = Kidhome ~ 1, family = "binomial", data = store_train, 
#>     control = list(trace = FALSE))
#> 
#> Deviance Residuals: 
#>    Min      1Q  Median      3Q     Max  
#> -1.054  -1.054  -1.054   1.306   1.306  
#> 
#> Coefficients:
#>             Estimate Std. Error z value       Pr(>|z|)    
#> (Intercept) -0.29680    0.04777  -6.214 0.000000000518 ***
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#> (Dispersion parameter for binomial family taken to be 1)
#> 
#>     Null deviance: 2445.2  on 1791  degrees of freedom
#> Residual deviance: 2445.2  on 1791  degrees of freedom
#> AIC: 2447.2
#> 
#> Number of Fisher Scoring iterations: 4

Model numerik

model_num <- glm(
  formula = Kidhome~Income +Recency+ MntWines+ MntFruits+ MntMeatProducts+ MntFishProducts+ MntSweetProducts+ MntGoldProds+ NumDealsPurchases+ NumWebPurchases +NumCatalogPurchases+ NumStorePurchases+  NumWebVisitsMonth ,
  data = store_train,
  family = "binomial",
  control = list(trace=FALSE) # cek perubahan deviasi setiap iterasi
)

summary(model_num)

#> 
#> Call:
#> glm(formula = Kidhome ~ Income + Recency + MntWines + MntFruits + 
#>     MntMeatProducts + MntFishProducts + MntSweetProducts + MntGoldProds + 
#>     NumDealsPurchases + NumWebPurchases + NumCatalogPurchases + 
#>     NumStorePurchases + NumWebVisitsMonth, family = "binomial", 
#>     data = store_train, control = list(trace = FALSE))
#> 
#> Deviance Residuals: 
#>     Min       1Q   Median       3Q      Max  
#> -5.0795  -0.3980  -0.0841   0.6562   3.4899  
#> 
#> Coefficients:
#>                         Estimate   Std. Error z value             Pr(>|z|)    
#> (Intercept)         -0.234982120  0.482454607  -0.487             0.626219    
#> Income               0.000010444  0.000005923   1.763             0.077868 .  
#> Recency             -0.000184275  0.002395032  -0.077             0.938671    
#> MntWines            -0.001209705  0.000515461  -2.347             0.018933 *  
#> MntFruits            0.005859438  0.004043037   1.449             0.147263    
#> MntMeatProducts      0.000545122  0.000946741   0.576             0.564758    
#> MntFishProducts     -0.001250346  0.003225025  -0.388             0.698237    
#> MntSweetProducts    -0.004566148  0.004269923  -1.069             0.284901    
#> MntGoldProds        -0.008306630  0.002356900  -3.524             0.000424 ***
#> NumDealsPurchases    0.612076786  0.054019529  11.331 < 0.0000000000000002 ***
#> NumWebPurchases     -0.203500232  0.046858302  -4.343        0.00001406236 ***
#> NumCatalogPurchases -0.403959419  0.067247997  -6.007        0.00000000189 ***
#> NumStorePurchases   -0.229215580  0.048822726  -4.695        0.00000266797 ***
#> NumWebVisitsMonth    0.197799656  0.049553681   3.992        0.00006562239 ***
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#> (Dispersion parameter for binomial family taken to be 1)
#> 
#>     Null deviance: 2412.4  on 1769  degrees of freedom
#> Residual deviance: 1309.7  on 1756  degrees of freedom
#>   (22 observations deleted due to missingness)
#> AIC: 1337.7
#> 
#> Number of Fisher Scoring iterations: 6

##Model kategorik

model_cat <- glm(
  formula = Kidhome~ Education+ Marital_Status+ Teenhome+ Response+ Complain,
  data = store_train,
  family = "binomial",
  control = list(trace=FALSE) # cek perubahan deviasi setiap iterasi
)

summary(model_cat)

#> 
#> Call:
#> glm(formula = Kidhome ~ Education + Marital_Status + Teenhome + 
#>     Response + Complain, family = "binomial", data = store_train, 
#>     control = list(trace = FALSE))
#> 
#> Deviance Residuals: 
#>     Min       1Q   Median       3Q      Max  
#> -1.4564  -1.0526  -0.9363   1.2403   1.9018  
#> 
#> Coefficients:
#>                          Estimate Std. Error z value Pr(>|z|)   
#> (Intercept)             -14.27361  617.82764  -0.023  0.98157   
#> EducationBasic            0.59864    0.35725   1.676  0.09379 . 
#> EducationGraduation      -0.07214    0.17531  -0.411  0.68072   
#> EducationMaster          -0.07140    0.20071  -0.356  0.72203   
#> EducationPhD             -0.26552    0.19554  -1.358  0.17451   
#> Marital_StatusAlone      29.30937  791.75515   0.037  0.97047   
#> Marital_StatusDivorced   14.23812  617.82764   0.023  0.98161   
#> Marital_StatusMarried    14.29450  617.82762   0.023  0.98154   
#> Marital_StatusSingle     14.31051  617.82763   0.023  0.98152   
#> Marital_StatusTogether   14.19903  617.82763   0.023  0.98166   
#> Marital_StatusWidow      13.41766  617.82770   0.022  0.98267   
#> Marital_StatusYOLO        0.22341 1077.47244   0.000  0.99983   
#> Teenhome1                -0.25034    0.10135  -2.470  0.01351 * 
#> Teenhome2                 0.01391    0.32177   0.043  0.96553   
#> Response1                -0.45093    0.14442  -3.122  0.00179 **
#> Complain1                 0.67053    0.53273   1.259  0.20815   
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#> (Dispersion parameter for binomial family taken to be 1)
#> 
#>     Null deviance: 2445.2  on 1791  degrees of freedom
#> Residual deviance: 2397.4  on 1776  degrees of freedom
#> AIC: 2429.4
#> 
#> Number of Fisher Scoring iterations: 13

Prediction

Tanpa Prediktor

pred_prob <- predict(object = model_nopred, 
                     newdata = store_test, 
                     type = "response")

head(pred_prob)

#>         2         7        14        19        27        32 
#> 0.4263393 0.4263393 0.4263393 0.4263393 0.4263393 0.4263393

pred_label <- ifelse(pred_prob > 0.5, yes = 1, no = 0)

pred_label <- as.factor(pred_label)

head(pred_label)

#>  2  7 14 19 27 32 
#>  0  0  0  0  0  0 
#> Levels: 0

Pada enam data teratas hasil prediksi menunjukkan tidak terdapat pembeli dengan kemungkinan mepunyai anak.

semua Prediktor

pred_prob_all <- predict(object = model_all, 
                      newdata = store_test, 
                      type = "response")

head(pred_prob_all)

#>            2            7           14           19           27           32 
#> 0.0764422806 0.0005694837 0.8204826660 0.6512183530 0.9889631160 0.0008067461

pred_label_all <- ifelse(pred_prob_all > 0.5, yes = 1, no = 0)

pred_label_all <- as.factor(pred_label_all)

head(pred_label_all)

#>  2  7 14 19 27 32 
#>  0  0  1  1  1  0 
#> Levels: 0 1

Dengan model ini pada 2 data teratas ada 6 kemungkinan pembeli yang punya anak.

Kita perlu melakukan evaluasi model menggunakan fungsi confusionMatrix() dari library caret. Kita akan membandingkan hasil prediksi dengan referensi data yang kita punya.

confusionMatrix(data = pred_label_all, 
                reference = store_test$Kidhome, 
                positive = "1")

#> Warning in levels(reference) != levels(data): longer object length is not a
#> multiple of shorter object length

#> Warning in confusionMatrix.default(data = pred_label_all, reference =
#> store_test$Kidhome, : Levels are not in the same order for reference and data.
#> Refactoring data to match.

#> Confusion Matrix and Statistics
#> 
#>           Reference
#> Prediction   0   1   2
#>          0 229  24   1
#>          1  34 151   7
#>          2   0   0   0
#> 
#> Overall Statistics
#>                                               
#>                Accuracy : 0.852               
#>                  95% CI : (0.8156, 0.8837)    
#>     No Information Rate : 0.5897              
#>     P-Value [Acc > NIR] : < 0.0000000000000002
#>                                               
#>                   Kappa : 0.7012              
#>                                               
#>  Mcnemar's Test P-Value : 0.02106             
#> 
#> Statistics by Class:
#> 
#>                      Class: 0 Class: 1 Class: 2
#> Sensitivity            0.8707   0.8629  0.00000
#> Specificity            0.8634   0.8487  1.00000
#> Pos Pred Value         0.9016   0.7865      NaN
#> Neg Pred Value         0.8229   0.9055  0.98206
#> Prevalence             0.5897   0.3924  0.01794
#> Detection Rate         0.5135   0.3386  0.00000
#> Detection Prevalence   0.5695   0.4305  0.00000
#> Balanced Accuracy      0.8671   0.8558  0.50000

kebaikan dari sebuah model berdasarkan nilai Accuracynya. Model yang kita punya memiliki nilai Accuracy 0,8.

Perdiksi Numerik

pred_prob_num <- predict(object = model_num, 
                      newdata = store_test, 
                      type = "response")

head(pred_prob_num)

#>            2            7           14           19           27           32 
#> 0.0434478656 0.0015170866 0.6213654876 0.8618457433 0.9814631845 0.0008988466

pred_label_num <- ifelse(pred_prob_num > 0.5, yes = 1, no = 0)

pred_label_num <- as.factor(pred_label_num)

head(pred_label_num)

#>  2  7 14 19 27 32 
#>  0  0  1  1  1  0 
#> Levels: 0 1

6 data teratas terdapat 1 pembeli yang kemungkinan mempunyai anak.

Prediksi kategorik

pred_prob_cat <- predict(object = model_cat, 
                      newdata = store_test, 
                      type = "response")

head(pred_prob_cat)

#>         2         7        14        19        27        32 
#> 0.3807955 0.3941141 0.4871891 0.4468402 0.4020166 0.4871891

pred_label_cat <- ifelse(pred_prob_cat > 0.5, yes = 1, no = 0)

pred_label_cat <- as.factor(pred_label_cat)

head(pred_label_cat)

#>  2  7 14 19 27 32 
#>  0  0  0  0  0  0 
#> Levels: 0 1

6 data teratas Tidak terdapat pembeli yang kemungkinan mempunyai anak

KNN

Mengecek data Train

pred_step_train <- predict(object = model_all, 
                           newdata = store_train, 
                           type = "response")

predLabel_train <- ifelse(test = pred_step_train > 0.5, yes = 1, no = 0)

cm_step_train <- confusionMatrix(data = as.factor(predLabel_train),
                                 reference = store_train$Kidhome,
                                 positive = "1")

#> Warning in levels(reference) != levels(data): longer object length is not a
#> multiple of shorter object length

#> Warning in confusionMatrix.default(data = as.factor(predLabel_train), reference
#> = store_train$Kidhome, : Levels are not in the same order for reference and
#> data. Refactoring data to match.

cm_step_train$overall[1] # accuracy

#>  Accuracy 
#> 0.8548023

cm_step_train$byClass[1] # recall/sensitivity

#> [1] 0.8627451

Menecek data Test

pred_step_test <- predict(object = model_all, 
                           newdata = store_test, 
                           type = "response")

predLabel_test <- ifelse(test = pred_step_test > 0.5, yes = 1, no = 0)

cm_step_test <- confusionMatrix(data = as.factor(predLabel_test),
                                 reference = store_test$Kidhome,
                                 positive = "1")

#> Warning in levels(reference) != levels(data): longer object length is not a
#> multiple of shorter object length

#> Warning in confusionMatrix.default(data = as.factor(predLabel_test), reference
#> = store_test$Kidhome, : Levels are not in the same order for reference and
#> data. Refactoring data to match.

cm_step_test$overall[1] # accuracy

#>  Accuracy 
#> 0.8520179

cm_step_test$byClass[1] # recall/sensitivity

#> [1] 0.8707224

Kesimpulan

Dari perbandingan nilai metrics antara data train dengan data test (pada kasus kita yaitu akurasi dan recall), kita bisa mengetahui apakah model kita overfit atau tidak. Biasanya sebuah model dikatakan overfit jika perbedaannya mencapai lebih dari 10% (atau 0.1).

Akurasi data train 0.8622247 Akurasi data test 0.8449438 -> Tidak Overfit

LBB Machine Learning

Budiman

Maret 23, 2023

Baca Data

Check Missing Value

Data Wrangling

Build Model

Tanpa Prediktor

Cross Validation

Modeling

model semua Prediktor

model tanpa prdiktor

Model numerik

Prediction

Tanpa Prediktor

semua Prediktor

Perdiksi Numerik

Prediksi kategorik

KNN

Mengecek data Train

Menecek data Test

Kesimpulan