# clear-up the environment
rm(list = ls())
# chunk options
knitr::opts_chunk$set(
message = FALSE,
fig.align = "center",
comment = "#>")
options(scipen = 9999)
# library
library(tidyverse)## -- Attaching core tidyverse packages ------------------------ tidyverse 2.0.0 --
## v dplyr 1.1.0 v readr 2.1.4
## v forcats 1.0.0 v stringr 1.5.0
## v ggplot2 3.4.1 v tibble 3.1.8
## v lubridate 1.9.2 v tidyr 1.3.0
## v purrr 1.0.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
## i Use the ]8;;http://conflicted.r-lib.org/conflicted package]8;; to force all conflicts to become errors
library(caret)## Loading required package: lattice
##
## Attaching package: 'caret'
##
## The following object is masked from 'package:purrr':
##
## lift
library(plotly)##
## Attaching package: 'plotly'
##
## The following object is masked from 'package:ggplot2':
##
## last_plot
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following object is masked from 'package:graphics':
##
## layout
library(data.table)##
## Attaching package: 'data.table'
##
## The following objects are masked from 'package:lubridate':
##
## hour, isoweek, mday, minute, month, quarter, second, wday, week,
## yday, year
##
## The following objects are masked from 'package:dplyr':
##
## between, first, last
##
## The following object is masked from 'package:purrr':
##
## transpose
library(GGally)## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
library(tidymodels)## -- Attaching packages -------------------------------------- tidymodels 1.0.0 --
## v broom 1.0.3 v rsample 1.1.1
## v dials 1.1.0 v tune 1.0.1
## v infer 1.0.4 v workflows 1.1.3
## v modeldata 1.1.0 v workflowsets 1.0.0
## v parsnip 1.0.4 v yardstick 1.1.0
## v recipes 1.0.5
## -- Conflicts ----------------------------------------- tidymodels_conflicts() --
## x data.table::between() masks dplyr::between()
## x scales::discard() masks purrr::discard()
## x plotly::filter() masks dplyr::filter(), stats::filter()
## x data.table::first() masks dplyr::first()
## x recipes::fixed() masks stringr::fixed()
## x dplyr::lag() masks stats::lag()
## x data.table::last() masks dplyr::last()
## x caret::lift() masks purrr::lift()
## x yardstick::precision() masks caret::precision()
## x yardstick::recall() masks caret::recall()
## x yardstick::sensitivity() masks caret::sensitivity()
## x yardstick::spec() masks readr::spec()
## x yardstick::specificity() masks caret::specificity()
## x recipes::step() masks stats::step()
## x data.table::transpose() masks purrr::transpose()
## * Use tidymodels_prefer() to resolve common conflicts.
library(car)## Loading required package: carData
##
## Attaching package: 'car'
##
## The following object is masked from 'package:dplyr':
##
## recode
##
## The following object is masked from 'package:purrr':
##
## some
library(scales)
library(lmtest)## Loading required package: zoo
##
## Attaching package: 'zoo'
##
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
library(dplyr)store <- read.csv("data_input/superstore_data.csv")
glimpse(store)#> Rows: 2,240
#> Columns: 22
#> $ Id <int> 1826, 1, 10476, 1386, 5371, 7348, 4073, 1991, 4047~
#> $ Year_Birth <int> 1970, 1961, 1958, 1967, 1989, 1958, 1954, 1967, 19~
#> $ Education <chr> "Graduation", "Graduation", "Graduation", "Graduat~
#> $ Marital_Status <chr> "Divorced", "Single", "Married", "Together", "Sing~
#> $ Income <int> 84835, 57091, 67267, 32474, 21474, 71691, 63564, 4~
#> $ Kidhome <int> 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,~
#> $ Teenhome <int> 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1,~
#> $ Dt_Customer <chr> "6/16/2014", "6/15/2014", "5/13/2014", "11/5/2014"~
#> $ Recency <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,~
#> $ MntWines <int> 189, 464, 134, 10, 6, 336, 769, 78, 384, 384, 450,~
#> $ MntFruits <int> 104, 5, 11, 0, 16, 130, 80, 0, 0, 0, 26, 4, 82, 10~
#> $ MntMeatProducts <int> 379, 64, 59, 1, 24, 411, 252, 11, 102, 102, 535, 6~
#> $ MntFishProducts <int> 111, 7, 15, 0, 11, 240, 15, 0, 21, 21, 73, 0, 80, ~
#> $ MntSweetProducts <int> 189, 0, 2, 0, 0, 32, 34, 0, 32, 32, 98, 13, 20, 16~
#> $ MntGoldProds <int> 218, 37, 30, 0, 34, 43, 65, 7, 5, 5, 26, 4, 102, 3~
#> $ NumDealsPurchases <int> 1, 1, 1, 1, 2, 1, 1, 1, 3, 3, 1, 2, 1, 1, 0, 4, 4,~
#> $ NumWebPurchases <int> 4, 7, 3, 1, 3, 4, 10, 2, 6, 6, 5, 3, 3, 1, 25, 2, ~
#> $ NumCatalogPurchases <int> 4, 3, 2, 0, 1, 7, 10, 1, 2, 2, 6, 1, 6, 1, 0, 1, 1~
#> $ NumStorePurchases <int> 6, 7, 5, 2, 2, 5, 7, 3, 9, 9, 10, 6, 6, 2, 0, 5, 5~
#> $ NumWebVisitsMonth <int> 1, 5, 2, 7, 7, 2, 6, 5, 4, 4, 1, 4, 1, 6, 1, 4, 4,~
#> $ Response <int> 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1,~
#> $ Complain <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,~
unique(store$Complain)#> [1] 0 1
anyNA(store)#> [1] TRUE
colSums(is.na(x = store))#> Id Year_Birth Education Marital_Status
#> 0 0 0 0
#> Income Kidhome Teenhome Dt_Customer
#> 24 0 0 0
#> Recency MntWines MntFruits MntMeatProducts
#> 0 0 0 0
#> MntFishProducts MntSweetProducts MntGoldProds NumDealsPurchases
#> 0 0 0 0
#> NumWebPurchases NumCatalogPurchases NumStorePurchases NumWebVisitsMonth
#> 0 0 0 0
#> Response Complain
#> 0 0
Store <- store %>% filter(!is.na(Income))
colSums(is.na(x = Store))#> Id Year_Birth Education Marital_Status
#> 0 0 0 0
#> Income Kidhome Teenhome Dt_Customer
#> 0 0 0 0
#> Recency MntWines MntFruits MntMeatProducts
#> 0 0 0 0
#> MntFishProducts MntSweetProducts MntGoldProds NumDealsPurchases
#> 0 0 0 0
#> NumWebPurchases NumCatalogPurchases NumStorePurchases NumWebVisitsMonth
#> 0 0 0 0
#> Response Complain
#> 0 0
store_Clean <- store %>%
mutate(Dt_Customer = mdy(store$Dt_Customer)) %>%
mutate_at(vars(Education, Marital_Status, Kidhome, Teenhome, Response, Complain), as.factor)
# cek tipe data setelah diubah
str(store_Clean)#> 'data.frame': 2240 obs. of 22 variables:
#> $ Id : int 1826 1 10476 1386 5371 7348 4073 1991 4047 9477 ...
#> $ Year_Birth : int 1970 1961 1958 1967 1989 1958 1954 1967 1954 1954 ...
#> $ Education : Factor w/ 5 levels "2n Cycle","Basic",..: 3 3 3 3 3 5 1 3 5 5 ...
#> $ Marital_Status : Factor w/ 8 levels "Absurd","Alone",..: 3 5 4 6 5 5 4 6 4 4 ...
#> $ Income : int 84835 57091 67267 32474 21474 71691 63564 44931 65324 65324 ...
#> $ Kidhome : Factor w/ 3 levels "0","1","2": 1 1 1 2 2 1 1 1 1 1 ...
#> $ Teenhome : Factor w/ 3 levels "0","1","2": 1 1 2 2 1 1 1 2 2 2 ...
#> $ Dt_Customer : Date, format: "2014-06-16" "2014-06-15" ...
#> $ Recency : int 0 0 0 0 0 0 0 0 0 0 ...
#> $ MntWines : int 189 464 134 10 6 336 769 78 384 384 ...
#> $ MntFruits : int 104 5 11 0 16 130 80 0 0 0 ...
#> $ MntMeatProducts : int 379 64 59 1 24 411 252 11 102 102 ...
#> $ MntFishProducts : int 111 7 15 0 11 240 15 0 21 21 ...
#> $ MntSweetProducts : int 189 0 2 0 0 32 34 0 32 32 ...
#> $ MntGoldProds : int 218 37 30 0 34 43 65 7 5 5 ...
#> $ NumDealsPurchases : int 1 1 1 1 2 1 1 1 3 3 ...
#> $ NumWebPurchases : int 4 7 3 1 3 4 10 2 6 6 ...
#> $ NumCatalogPurchases: int 4 3 2 0 1 7 10 1 2 2 ...
#> $ NumStorePurchases : int 6 7 5 2 2 5 7 3 9 9 ...
#> $ NumWebVisitsMonth : int 1 5 2 7 7 2 6 5 4 4 ...
#> $ Response : Factor w/ 2 levels "0","1": 2 2 1 1 2 2 2 1 1 1 ...
#> $ Complain : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
model_null <- glm(
formula = Kidhome ~1,
data = store_Clean,
family = "binomial"
)
summary(model_null)#>
#> Call:
#> glm(formula = Kidhome ~ 1, family = "binomial", data = store_Clean)
#>
#> Deviance Residuals:
#> Min 1Q Median 3Q Max
#> -1.048 -1.048 -1.048 1.312 1.312
#>
#> Coefficients:
#> Estimate Std. Error z value Pr(>|z|)
#> (Intercept) -0.31142 0.04277 -7.281 0.000000000000331 ***
#> ---
#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#>
#> (Dispersion parameter for binomial family taken to be 1)
#>
#> Null deviance: 3051.6 on 2239 degrees of freedom
#> Residual deviance: 3051.6 on 2239 degrees of freedom
#> AIC: 3053.6
#>
#> Number of Fisher Scoring iterations: 4
# probability
table(store_Clean$Kidhome)#>
#> 0 1 2
#> 1293 899 48
ggcorr(store_Clean, label = T)#> Warning in ggcorr(store_Clean, label = T): data in column(s) 'Education',
#> 'Marital_Status', 'Kidhome', 'Teenhome', 'Dt_Customer', 'Response', 'Complain'
#> are not numeric and were ignored
#RNGkind(sample.kind = "Rounding")
set.seed(147)
index <- sample(x = nrow(store_Clean), size = nrow(store_Clean)*0.8)
store_train <- store_Clean[index,]
store_test <- store_Clean[-index,]model_all <- glm(
formula = Kidhome ~.,
data = store_train,
family = "binomial",
control = list(trace=FALSE) # cek perubahan deviasi setiap iterasi
)
summary(model_all)#>
#> Call:
#> glm(formula = Kidhome ~ ., family = "binomial", data = store_train,
#> control = list(trace = FALSE))
#>
#> Deviance Residuals:
#> Min 1Q Median 3Q Max
#> -5.7269 -0.3463 -0.0577 0.5089 3.2178
#>
#> Coefficients:
#> Estimate Std. Error z value
#> (Intercept) -126.090853198 934.176003698 -0.135
#> Id -0.000006528 0.000022899 -0.285
#> Year_Birth 0.051598645 0.008098314 6.372
#> EducationBasic -0.539361619 0.448277227 -1.203
#> EducationGraduation 0.426501181 0.275932928 1.546
#> EducationMaster 0.410366550 0.318140954 1.290
#> EducationPhD 0.295361858 0.314886074 0.938
#> Marital_StatusAlone 26.369082507 1167.302836920 0.023
#> Marital_StatusDivorced 11.887591719 934.021779576 0.013
#> Marital_StatusMarried 11.798742654 934.021757717 0.013
#> Marital_StatusSingle 11.643692285 934.021755511 0.012
#> Marital_StatusTogether 12.003778521 934.021753355 0.013
#> Marital_StatusWidow 11.573151463 934.021837338 0.012
#> Marital_StatusYOLO -4.718215098 1729.329001586 -0.003
#> Income 0.000036463 0.000008683 4.199
#> Teenhome1 -1.361870571 0.213149369 -6.389
#> Teenhome2 -1.115873563 0.513540170 -2.173
#> Dt_Customer 0.000726071 0.000369960 1.963
#> Recency 0.000230072 0.002673722 0.086
#> MntWines -0.000910599 0.000529870 -1.719
#> MntFruits 0.004701890 0.004329529 1.086
#> MntMeatProducts -0.002381762 0.001169429 -2.037
#> MntFishProducts -0.001141437 0.003452625 -0.331
#> MntSweetProducts -0.006117263 0.004756681 -1.286
#> MntGoldProds -0.009497921 0.002708254 -3.507
#> NumDealsPurchases 0.823886625 0.066846047 12.325
#> NumWebPurchases -0.264167679 0.051977942 -5.082
#> NumCatalogPurchases -0.432025030 0.077042374 -5.608
#> NumStorePurchases -0.226591710 0.052426427 -4.322
#> NumWebVisitsMonth 0.216755329 0.052886110 4.099
#> Response1 0.208398222 0.279832098 0.745
#> Complain1 0.940319776 0.909165920 1.034
#> Pr(>|z|)
#> (Intercept) 0.892631
#> Id 0.775592
#> Year_Birth 0.000000000187 ***
#> EducationBasic 0.228904
#> EducationGraduation 0.122184
#> EducationMaster 0.197089
#> EducationPhD 0.348246
#> Marital_StatusAlone 0.981978
#> Marital_StatusDivorced 0.989845
#> Marital_StatusMarried 0.989921
#> Marital_StatusSingle 0.990054
#> Marital_StatusTogether 0.989746
#> Marital_StatusWidow 0.990114
#> Marital_StatusYOLO 0.997823
#> Income 0.000026770272 ***
#> Teenhome1 0.000000000167 ***
#> Teenhome2 0.029788 *
#> Dt_Customer 0.049697 *
#> Recency 0.931427
#> MntWines 0.085700 .
#> MntFruits 0.277477
#> MntMeatProducts 0.041681 *
#> MntFishProducts 0.740947
#> MntSweetProducts 0.198431
#> MntGoldProds 0.000453 ***
#> NumDealsPurchases < 0.0000000000000002 ***
#> NumWebPurchases 0.000000372885 ***
#> NumCatalogPurchases 0.000000020512 ***
#> NumStorePurchases 0.000015455846 ***
#> NumWebVisitsMonth 0.000041578161 ***
#> Response1 0.456437
#> Complain1 0.301012
#> ---
#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#>
#> (Dispersion parameter for binomial family taken to be 1)
#>
#> Null deviance: 2412.4 on 1769 degrees of freedom
#> Residual deviance: 1136.7 on 1738 degrees of freedom
#> (22 observations deleted due to missingness)
#> AIC: 1200.7
#>
#> Number of Fisher Scoring iterations: 14
model_nopred <- glm(
formula = Kidhome~1,
data = store_train,
family = "binomial",
control = list(trace=FALSE) # cek perubahan deviasi setiap iterasi
)
summary(model_nopred)#>
#> Call:
#> glm(formula = Kidhome ~ 1, family = "binomial", data = store_train,
#> control = list(trace = FALSE))
#>
#> Deviance Residuals:
#> Min 1Q Median 3Q Max
#> -1.054 -1.054 -1.054 1.306 1.306
#>
#> Coefficients:
#> Estimate Std. Error z value Pr(>|z|)
#> (Intercept) -0.29680 0.04777 -6.214 0.000000000518 ***
#> ---
#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#>
#> (Dispersion parameter for binomial family taken to be 1)
#>
#> Null deviance: 2445.2 on 1791 degrees of freedom
#> Residual deviance: 2445.2 on 1791 degrees of freedom
#> AIC: 2447.2
#>
#> Number of Fisher Scoring iterations: 4
model_num <- glm(
formula = Kidhome~Income +Recency+ MntWines+ MntFruits+ MntMeatProducts+ MntFishProducts+ MntSweetProducts+ MntGoldProds+ NumDealsPurchases+ NumWebPurchases +NumCatalogPurchases+ NumStorePurchases+ NumWebVisitsMonth ,
data = store_train,
family = "binomial",
control = list(trace=FALSE) # cek perubahan deviasi setiap iterasi
)
summary(model_num)#>
#> Call:
#> glm(formula = Kidhome ~ Income + Recency + MntWines + MntFruits +
#> MntMeatProducts + MntFishProducts + MntSweetProducts + MntGoldProds +
#> NumDealsPurchases + NumWebPurchases + NumCatalogPurchases +
#> NumStorePurchases + NumWebVisitsMonth, family = "binomial",
#> data = store_train, control = list(trace = FALSE))
#>
#> Deviance Residuals:
#> Min 1Q Median 3Q Max
#> -5.0795 -0.3980 -0.0841 0.6562 3.4899
#>
#> Coefficients:
#> Estimate Std. Error z value Pr(>|z|)
#> (Intercept) -0.234982120 0.482454607 -0.487 0.626219
#> Income 0.000010444 0.000005923 1.763 0.077868 .
#> Recency -0.000184275 0.002395032 -0.077 0.938671
#> MntWines -0.001209705 0.000515461 -2.347 0.018933 *
#> MntFruits 0.005859438 0.004043037 1.449 0.147263
#> MntMeatProducts 0.000545122 0.000946741 0.576 0.564758
#> MntFishProducts -0.001250346 0.003225025 -0.388 0.698237
#> MntSweetProducts -0.004566148 0.004269923 -1.069 0.284901
#> MntGoldProds -0.008306630 0.002356900 -3.524 0.000424 ***
#> NumDealsPurchases 0.612076786 0.054019529 11.331 < 0.0000000000000002 ***
#> NumWebPurchases -0.203500232 0.046858302 -4.343 0.00001406236 ***
#> NumCatalogPurchases -0.403959419 0.067247997 -6.007 0.00000000189 ***
#> NumStorePurchases -0.229215580 0.048822726 -4.695 0.00000266797 ***
#> NumWebVisitsMonth 0.197799656 0.049553681 3.992 0.00006562239 ***
#> ---
#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#>
#> (Dispersion parameter for binomial family taken to be 1)
#>
#> Null deviance: 2412.4 on 1769 degrees of freedom
#> Residual deviance: 1309.7 on 1756 degrees of freedom
#> (22 observations deleted due to missingness)
#> AIC: 1337.7
#>
#> Number of Fisher Scoring iterations: 6
##Model kategorik
model_cat <- glm(
formula = Kidhome~ Education+ Marital_Status+ Teenhome+ Response+ Complain,
data = store_train,
family = "binomial",
control = list(trace=FALSE) # cek perubahan deviasi setiap iterasi
)
summary(model_cat)#>
#> Call:
#> glm(formula = Kidhome ~ Education + Marital_Status + Teenhome +
#> Response + Complain, family = "binomial", data = store_train,
#> control = list(trace = FALSE))
#>
#> Deviance Residuals:
#> Min 1Q Median 3Q Max
#> -1.4564 -1.0526 -0.9363 1.2403 1.9018
#>
#> Coefficients:
#> Estimate Std. Error z value Pr(>|z|)
#> (Intercept) -14.27361 617.82764 -0.023 0.98157
#> EducationBasic 0.59864 0.35725 1.676 0.09379 .
#> EducationGraduation -0.07214 0.17531 -0.411 0.68072
#> EducationMaster -0.07140 0.20071 -0.356 0.72203
#> EducationPhD -0.26552 0.19554 -1.358 0.17451
#> Marital_StatusAlone 29.30937 791.75515 0.037 0.97047
#> Marital_StatusDivorced 14.23812 617.82764 0.023 0.98161
#> Marital_StatusMarried 14.29450 617.82762 0.023 0.98154
#> Marital_StatusSingle 14.31051 617.82763 0.023 0.98152
#> Marital_StatusTogether 14.19903 617.82763 0.023 0.98166
#> Marital_StatusWidow 13.41766 617.82770 0.022 0.98267
#> Marital_StatusYOLO 0.22341 1077.47244 0.000 0.99983
#> Teenhome1 -0.25034 0.10135 -2.470 0.01351 *
#> Teenhome2 0.01391 0.32177 0.043 0.96553
#> Response1 -0.45093 0.14442 -3.122 0.00179 **
#> Complain1 0.67053 0.53273 1.259 0.20815
#> ---
#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#>
#> (Dispersion parameter for binomial family taken to be 1)
#>
#> Null deviance: 2445.2 on 1791 degrees of freedom
#> Residual deviance: 2397.4 on 1776 degrees of freedom
#> AIC: 2429.4
#>
#> Number of Fisher Scoring iterations: 13
pred_prob <- predict(object = model_nopred,
newdata = store_test,
type = "response")
head(pred_prob)#> 2 7 14 19 27 32
#> 0.4263393 0.4263393 0.4263393 0.4263393 0.4263393 0.4263393
pred_label <- ifelse(pred_prob > 0.5, yes = 1, no = 0)
pred_label <- as.factor(pred_label)
head(pred_label)#> 2 7 14 19 27 32
#> 0 0 0 0 0 0
#> Levels: 0
Pada enam data teratas hasil prediksi menunjukkan tidak terdapat pembeli dengan kemungkinan mepunyai anak.
pred_prob_all <- predict(object = model_all,
newdata = store_test,
type = "response")
head(pred_prob_all)#> 2 7 14 19 27 32
#> 0.0764422806 0.0005694837 0.8204826660 0.6512183530 0.9889631160 0.0008067461
pred_label_all <- ifelse(pred_prob_all > 0.5, yes = 1, no = 0)
pred_label_all <- as.factor(pred_label_all)
head(pred_label_all)#> 2 7 14 19 27 32
#> 0 0 1 1 1 0
#> Levels: 0 1
Dengan model ini pada 2 data teratas ada 6 kemungkinan pembeli yang punya anak.
Kita perlu melakukan evaluasi model menggunakan fungsi confusionMatrix() dari library caret. Kita akan membandingkan hasil prediksi dengan referensi data yang kita punya.
confusionMatrix(data = pred_label_all,
reference = store_test$Kidhome,
positive = "1")#> Warning in levels(reference) != levels(data): longer object length is not a
#> multiple of shorter object length
#> Warning in confusionMatrix.default(data = pred_label_all, reference =
#> store_test$Kidhome, : Levels are not in the same order for reference and data.
#> Refactoring data to match.
#> Confusion Matrix and Statistics
#>
#> Reference
#> Prediction 0 1 2
#> 0 229 24 1
#> 1 34 151 7
#> 2 0 0 0
#>
#> Overall Statistics
#>
#> Accuracy : 0.852
#> 95% CI : (0.8156, 0.8837)
#> No Information Rate : 0.5897
#> P-Value [Acc > NIR] : < 0.0000000000000002
#>
#> Kappa : 0.7012
#>
#> Mcnemar's Test P-Value : 0.02106
#>
#> Statistics by Class:
#>
#> Class: 0 Class: 1 Class: 2
#> Sensitivity 0.8707 0.8629 0.00000
#> Specificity 0.8634 0.8487 1.00000
#> Pos Pred Value 0.9016 0.7865 NaN
#> Neg Pred Value 0.8229 0.9055 0.98206
#> Prevalence 0.5897 0.3924 0.01794
#> Detection Rate 0.5135 0.3386 0.00000
#> Detection Prevalence 0.5695 0.4305 0.00000
#> Balanced Accuracy 0.8671 0.8558 0.50000
kebaikan dari sebuah model berdasarkan nilai Accuracynya. Model yang kita punya memiliki nilai Accuracy 0,8.
pred_prob_num <- predict(object = model_num,
newdata = store_test,
type = "response")
head(pred_prob_num)#> 2 7 14 19 27 32
#> 0.0434478656 0.0015170866 0.6213654876 0.8618457433 0.9814631845 0.0008988466
pred_label_num <- ifelse(pred_prob_num > 0.5, yes = 1, no = 0)
pred_label_num <- as.factor(pred_label_num)
head(pred_label_num)#> 2 7 14 19 27 32
#> 0 0 1 1 1 0
#> Levels: 0 1
6 data teratas terdapat 1 pembeli yang kemungkinan mempunyai anak.
pred_prob_cat <- predict(object = model_cat,
newdata = store_test,
type = "response")
head(pred_prob_cat)#> 2 7 14 19 27 32
#> 0.3807955 0.3941141 0.4871891 0.4468402 0.4020166 0.4871891
pred_label_cat <- ifelse(pred_prob_cat > 0.5, yes = 1, no = 0)
pred_label_cat <- as.factor(pred_label_cat)
head(pred_label_cat)#> 2 7 14 19 27 32
#> 0 0 0 0 0 0
#> Levels: 0 1
6 data teratas Tidak terdapat pembeli yang kemungkinan mempunyai anak
pred_step_train <- predict(object = model_all,
newdata = store_train,
type = "response")
predLabel_train <- ifelse(test = pred_step_train > 0.5, yes = 1, no = 0)
cm_step_train <- confusionMatrix(data = as.factor(predLabel_train),
reference = store_train$Kidhome,
positive = "1")#> Warning in levels(reference) != levels(data): longer object length is not a
#> multiple of shorter object length
#> Warning in confusionMatrix.default(data = as.factor(predLabel_train), reference
#> = store_train$Kidhome, : Levels are not in the same order for reference and
#> data. Refactoring data to match.
cm_step_train$overall[1] # accuracy#> Accuracy
#> 0.8548023
cm_step_train$byClass[1] # recall/sensitivity#> [1] 0.8627451
pred_step_test <- predict(object = model_all,
newdata = store_test,
type = "response")
predLabel_test <- ifelse(test = pred_step_test > 0.5, yes = 1, no = 0)
cm_step_test <- confusionMatrix(data = as.factor(predLabel_test),
reference = store_test$Kidhome,
positive = "1")#> Warning in levels(reference) != levels(data): longer object length is not a
#> multiple of shorter object length
#> Warning in confusionMatrix.default(data = as.factor(predLabel_test), reference
#> = store_test$Kidhome, : Levels are not in the same order for reference and
#> data. Refactoring data to match.
cm_step_test$overall[1] # accuracy#> Accuracy
#> 0.8520179
cm_step_test$byClass[1] # recall/sensitivity#> [1] 0.8707224
Dari perbandingan nilai metrics antara data train dengan data test (pada kasus kita yaitu akurasi dan recall), kita bisa mengetahui apakah model kita overfit atau tidak. Biasanya sebuah model dikatakan overfit jika perbedaannya mencapai lebih dari 10% (atau 0.1).
Akurasi data train 0.8622247 Akurasi data test 0.8449438 -> Tidak Overfit