# Remove every object from the current workspace so the analysis starts
# from a clean environment.
rm(list = ls())

# Default knitr chunk options applied to the chunks of this document:
# hide messages and warnings, centre figures, and prefix printed output
# lines with "#>".
knitr::opts_chunk$set(
  message = FALSE,
  warning = FALSE,
  fig.align = "center",
  comment = "#>"
)
# Large scipen penalty: print numbers in fixed rather than scientific notation.
options(scipen = 99)
# Embed the image file assets/store.png in the rendered document.
knitr::include_graphics("assets/store.png")

Introduction

library(tidyverse)
library(caret)
library(plotly)
library(data.table)
library(GGally)
library(tidymodels)
library(car)
library(scales)
library(lmtest)
library(dplyr)

Persiapan Data

Baca Data

# Load the raw superstore customer data and preview its first six rows.
store <- read.csv(file = "data_input/superstore_data.csv")
head(store, n = 6L)

Deskripsi Tabel

- Id - ID unik dari setiap pelanggan
- Year_Birth - tahun lahir pelanggan
- Complain - 1 jika pelanggan melakukan komplain dalam 2 tahun terakhir
- Dt_Customer - tanggal pendaftaran pelanggan dengan perusahaan
- Education - tingkat pendidikan pelanggan
- Marital_Status - status perkawinan pelanggan
- Kidhome - jumlah anak kecil di rumah pelanggan
- Teenhome - jumlah remaja dalam rumah tangga pelanggan
- Income - pendapatan rumah tangga tahunan pelanggan
- MntFishProducts - jumlah yang dibelanjakan untuk produk ikan dalam 2 tahun terakhir
- MntMeatProducts - jumlah yang dibelanjakan untuk produk daging dalam 2 tahun terakhir
- MntFruits - jumlah yang dibelanjakan untuk produk buah-buahan dalam 2 tahun terakhir
- MntSweetProducts - jumlah yang dibelanjakan untuk produk manis dalam 2 tahun terakhir
- MntWines - jumlah yang dibelanjakan untuk produk anggur dalam 2 tahun terakhir
- MntGoldProds - jumlah yang dibelanjakan untuk produk emas dalam 2 tahun terakhir
- NumDealsPurchases - jumlah pembelian yang dilakukan dengan diskon
- NumCatalogPurchases - jumlah pembelian yang dilakukan menggunakan katalog (membeli barang untuk dikirim melalui pos)
- NumStorePurchases - jumlah pembelian yang dilakukan langsung di toko
- NumWebPurchases - jumlah pembelian yang dilakukan melalui situs web perusahaan
- NumWebVisitsMonth - jumlah kunjungan ke situs web perusahaan dalam sebulan terakhir
- Recency - jumlah hari sejak pembelian terakhir

Melihat Tipe Data

# Show every column of the raw data with its type and first values.
glimpse(x = store)
#> Rows: 2,240
#> Columns: 22
#> $ Id                  <int> 1826, 1, 10476, 1386, 5371, 7348, 4073, 1991, 4047~
#> $ Year_Birth          <int> 1970, 1961, 1958, 1967, 1989, 1958, 1954, 1967, 19~
#> $ Education           <chr> "Graduation", "Graduation", "Graduation", "Graduat~
#> $ Marital_Status      <chr> "Divorced", "Single", "Married", "Together", "Sing~
#> $ Income              <int> 84835, 57091, 67267, 32474, 21474, 71691, 63564, 4~
#> $ Kidhome             <int> 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,~
#> $ Teenhome            <int> 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1,~
#> $ Dt_Customer         <chr> "6/16/2014", "6/15/2014", "5/13/2014", "11/5/2014"~
#> $ Recency             <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,~
#> $ MntWines            <int> 189, 464, 134, 10, 6, 336, 769, 78, 384, 384, 450,~
#> $ MntFruits           <int> 104, 5, 11, 0, 16, 130, 80, 0, 0, 0, 26, 4, 82, 10~
#> $ MntMeatProducts     <int> 379, 64, 59, 1, 24, 411, 252, 11, 102, 102, 535, 6~
#> $ MntFishProducts     <int> 111, 7, 15, 0, 11, 240, 15, 0, 21, 21, 73, 0, 80, ~
#> $ MntSweetProducts    <int> 189, 0, 2, 0, 0, 32, 34, 0, 32, 32, 98, 13, 20, 16~
#> $ MntGoldProds        <int> 218, 37, 30, 0, 34, 43, 65, 7, 5, 5, 26, 4, 102, 3~
#> $ NumDealsPurchases   <int> 1, 1, 1, 1, 2, 1, 1, 1, 3, 3, 1, 2, 1, 1, 0, 4, 4,~
#> $ NumWebPurchases     <int> 4, 7, 3, 1, 3, 4, 10, 2, 6, 6, 5, 3, 3, 1, 25, 2, ~
#> $ NumCatalogPurchases <int> 4, 3, 2, 0, 1, 7, 10, 1, 2, 2, 6, 1, 6, 1, 0, 1, 1~
#> $ NumStorePurchases   <int> 6, 7, 5, 2, 2, 5, 7, 3, 9, 9, 10, 6, 6, 2, 0, 5, 5~
#> $ NumWebVisitsMonth   <int> 1, 5, 2, 7, 7, 2, 6, 5, 4, 4, 1, 4, 1, 6, 1, 4, 4,~
#> $ Response            <int> 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1,~
#> $ Complain            <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,~

Merubah tipe data

Kolom yang diubah menjadi factor: Education, Marital_Status, Kidhome, Teenhome, Response, Complain

# Build the cleaned table: parse the registration date and encode the
# categorical columns as factors.
store_Clean <- store %>%
  mutate(
    # Dates arrive as month/day/year strings such as "6/16/2014". The column
    # is referenced directly instead of through `store$`, so the expression
    # always works on the data flowing through the pipe.
    Dt_Customer = mdy(Dt_Customer),
    # across() replaces the superseded mutate_at(vars(...), ...).
    across(
      c(Education, Marital_Status, Kidhome, Teenhome, Response, Complain),
      as.factor
    )
  )

Cek Missing Value

# Is there any missing value anywhere in the cleaned table?
anyNA(store_Clean)
#> [1] TRUE
# Count the missing values in each column.
store_Clean %>%
  is.na() %>%
  colSums()
#>                  Id          Year_Birth           Education      Marital_Status 
#>                   0                   0                   0                   0 
#>              Income             Kidhome            Teenhome         Dt_Customer 
#>                  24                   0                   0                   0 
#>             Recency            MntWines           MntFruits     MntMeatProducts 
#>                   0                   0                   0                   0 
#>     MntFishProducts    MntSweetProducts        MntGoldProds   NumDealsPurchases 
#>                   0                   0                   0                   0 
#>     NumWebPurchases NumCatalogPurchases   NumStorePurchases   NumWebVisitsMonth 
#>                   0                   0                   0                   0 
#>            Response            Complain 
#>                   0                   0
# Keep only the customers whose income is recorded, then confirm that no
# missing values remain in any column.
Store_Clean1 <- filter(store_Clean, !is.na(Income))
Store_Clean1 %>%
  is.na() %>%
  colSums()
#>                  Id          Year_Birth           Education      Marital_Status 
#>                   0                   0                   0                   0 
#>              Income             Kidhome            Teenhome         Dt_Customer 
#>                   0                   0                   0                   0 
#>             Recency            MntWines           MntFruits     MntMeatProducts 
#>                   0                   0                   0                   0 
#>     MntFishProducts    MntSweetProducts        MntGoldProds   NumDealsPurchases 
#>                   0                   0                   0                   0 
#>     NumWebPurchases NumCatalogPurchases   NumStorePurchases   NumWebVisitsMonth 
#>                   0                   0                   0                   0 
#>            Response            Complain 
#>                   0                   0

Analisis Data Eksplorasi (EDA)

Variabel cek persebaran data :

Jumlah yang dihabiskan untuk produk daging dalam 2 tahun terakhir (MntMeatProducts)

# Horizontal boxplot of meat spending to inspect its spread and outliers.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
boxplot(Store_Clean1$MntMeatProducts, horizontal = TRUE)

Persebaran Data Jumlah pembelian yang dilakukan melalui situs web perusahaan (NumWebPurchases)

# Horizontal boxplot of web purchases. Drawn from Store_Clean1 (rows with
# missing Income removed) so it describes the same rows that the correlation
# and the regression models are computed on.
boxplot(Store_Clean1$NumWebPurchases, horizontal = TRUE)

# Correlation between meat spending and the number of web purchases.
cor(Store_Clean1$MntMeatProducts, Store_Clean1$NumWebPurchases)
#> [1] 0.3070904
# Scatter plot of the same pair of variables.
plot(Store_Clean1$MntMeatProducts, Store_Clean1$NumWebPurchases)

Re-Modeling

# Correlation matrix plot with the coefficient printed in each cell.
ggcorr(
  data = store_Clean,
  label = TRUE,
  label_size = 2.9,
  hjust = 1,
  layout.exp = 2
)

Pada grafik korelasi, terlihat bahwa tidak semua variabel memiliki pengaruh positif terhadap NumWebPurchases dimana faktor MntMeatProducts memiliki korelasi positif yang paling tinggi dibandingkan faktor-faktor lain.

Pembuatan Model simple Regresi Linear

# Simple linear regression: web purchases explained by meat spending.
model_clean1 <- lm(NumWebPurchases ~ MntMeatProducts, data = Store_Clean1)
summary(object = model_clean1)
#> 
#> Call:
#> lm(formula = NumWebPurchases ~ MntMeatProducts, data = Store_Clean1)
#> 
#> Residuals:
#>     Min      1Q  Median      3Q     Max 
#> -9.9324 -1.7991 -0.5788  1.4415 23.5302 
#> 
#> Coefficients:
#>                  Estimate Std. Error t value            Pr(>|t|)    
#> (Intercept)     3.4585646  0.0691068   50.05 <0.0000000000000002 ***
#> MntMeatProducts 0.0037529  0.0002472   15.18 <0.0000000000000002 ***
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#> Residual standard error: 2.609 on 2214 degrees of freedom
#> Multiple R-squared:  0.0943, Adjusted R-squared:  0.0939 
#> F-statistic: 230.5 on 1 and 2214 DF,  p-value: < 0.00000000000000022
# Scatter plot of the observations with the fitted regression line in red.
plot(x = Store_Clean1$MntMeatProducts, y = Store_Clean1$NumWebPurchases)
abline(reg = model_clean1, col = "red")

Model Interpretation

# Coefficient table and fit statistics of the simple model.
summary(object = model_clean1)
#> 
#> Call:
#> lm(formula = NumWebPurchases ~ MntMeatProducts, data = Store_Clean1)
#> 
#> Residuals:
#>     Min      1Q  Median      3Q     Max 
#> -9.9324 -1.7991 -0.5788  1.4415 23.5302 
#> 
#> Coefficients:
#>                  Estimate Std. Error t value            Pr(>|t|)    
#> (Intercept)     3.4585646  0.0691068   50.05 <0.0000000000000002 ***
#> MntMeatProducts 0.0037529  0.0002472   15.18 <0.0000000000000002 ***
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#> Residual standard error: 2.609 on 2214 degrees of freedom
#> Multiple R-squared:  0.0943, Adjusted R-squared:  0.0939 
#> F-statistic: 230.5 on 1 and 2214 DF,  p-value: < 0.00000000000000022
# Keep only the rows whose meat spending is below 1000, then check the
# resulting range of that column.
# NOTE(review): this filters the raw `store`, whereas model_clean1 is fitted
# on Store_Clean1 (rows with missing Income removed), so the two models that
# are compared afterwards are not built from the same base rows - confirm
# whether Store_Clean1 was intended here.
store_no_outlier <- store [store$MntMeatProducts < 1000,]
range(store_no_outlier$MntMeatProducts)
#> [1]   0 984

Pembuatan Model tanpa Outlier

# Refit the same simple regression on the data without the extreme
# meat-spending rows.
model_no_outlier <- lm(
  formula = NumWebPurchases ~ MntMeatProducts,
  data = store_no_outlier
)
summary(object = model_no_outlier)
#> 
#> Call:
#> lm(formula = NumWebPurchases ~ MntMeatProducts, data = store_no_outlier)
#> 
#> Residuals:
#>     Min      1Q  Median      3Q     Max 
#> -4.4800 -1.9942 -0.5666  1.4443 23.5968 
#> 
#> Coefficients:
#>                  Estimate Std. Error t value            Pr(>|t|)    
#> (Intercept)     3.3903011  0.0696175   48.70 <0.0000000000000002 ***
#> MntMeatProducts 0.0043004  0.0002579   16.67 <0.0000000000000002 ***
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#> Residual standard error: 2.618 on 2233 degrees of freedom
#> Multiple R-squared:  0.1107, Adjusted R-squared:  0.1103 
#> F-statistic:   278 on 1 and 2233 DF,  p-value: < 0.00000000000000022

Bandingkan model_clean1 dan model_no_outlier: apakah data outlier membuat garis regresi jauh berbeda?

# Overlay both fitted lines on the raw data:
# red = model_clean1, blue = model_no_outlier.
plot(x = store$MntMeatProducts, y = store$NumWebPurchases)
abline(reg = model_clean1, col = "red")
abline(reg = model_no_outlier, col = "blue")

cek nilai r-squared

# Compare the R-squared of the two simple models.
summary(model_clean1)[["r.squared"]] # model fitted with the outliers
#> [1] 0.09430449
summary(model_no_outlier)[["r.squared"]] # model fitted without the outliers
#> [1] 0.110715

Kesimpulan: model tanpa outlier memiliki r-squared yang lebih tinggi (0.110715) dibandingkan model dengan outlier (0.09430449).

Buat model dengan keseluruhan prediksi

# Multiple regression using every other column as a predictor.
# Fitted on Store_Clean1 (complete cases) instead of store_Clean: lm() was
# already discarding the 24 rows with missing Income, so the estimates are
# unchanged, but the model now shares its data with model_store_none and the
# prediction step, and step() can no longer stop with "number of rows in use
# has changed" if a predictor containing missing values is dropped.
model_all_store <- lm(NumWebPurchases ~ ., data = Store_Clean1)
summary(model_all_store)
#> 
#> Call:
#> lm(formula = NumWebPurchases ~ ., data = store_Clean)
#> 
#> Residuals:
#>     Min      1Q  Median      3Q     Max 
#> -9.3529 -0.9637 -0.1509  0.9010 23.7637 
#> 
#> Coefficients:
#>                             Estimate    Std. Error t value             Pr(>|t|)
#> (Intercept)            -2.5853882648  8.6944565182  -0.297              0.76622
#> Id                     -0.0000002729  0.0000128821  -0.021              0.98310
#> Year_Birth             -0.0035361383  0.0039797050  -0.889              0.37435
#> EducationBasic         -0.2938676511  0.3070924998  -0.957              0.33871
#> EducationGraduation     0.0550318599  0.1520761007   0.362              0.71748
#> EducationMaster         0.0433357009  0.1771672327   0.245              0.80679
#> EducationPhD            0.1993237966  0.1738623532   1.146              0.25174
#> Marital_StatusAlone     3.1861892497  1.8035787665   1.767              0.07744
#> Marital_StatusDivorced  1.6690236143  1.4087613993   1.185              0.23625
#> Marital_StatusMarried   1.6915117493  1.4045489052   1.204              0.22860
#> Marital_StatusSingle    1.6096473537  1.4051248829   1.146              0.25211
#> Marital_StatusTogether  1.7208509604  1.4051525403   1.225              0.22083
#> Marital_StatusWidow     1.5729772167  1.4205336628   1.107              0.26828
#> Marital_StatusYOLO      2.6866932688  1.9785289767   1.358              0.17463
#> Income                  0.0000141856  0.0000024465   5.798 0.000000007667477707
#> Kidhome1               -0.8773656766  0.1202304716  -7.297 0.000000000000408809
#> Kidhome2               -0.8323827823  0.3064047138  -2.717              0.00665
#> Teenhome1               0.3085581387  0.1087095386   2.838              0.00458
#> Teenhome2               0.4276249649  0.2921233143   1.464              0.14338
#> Dt_Customer             0.0004265469  0.0002017842   2.114              0.03464
#> Recency                 0.0002166133  0.0014891217   0.145              0.88436
#> MntWines                0.0020002709  0.0002027967   9.863 < 0.0000000000000002
#> MntFruits               0.0022930035  0.0014593869   1.571              0.11628
#> MntMeatProducts        -0.0004735203  0.0003251580  -1.456              0.14546
#> MntFishProducts         0.0017930781  0.0011110932   1.614              0.10672
#> MntSweetProducts        0.0075534688  0.0014005875   5.393 0.000000076743683474
#> MntGoldProds            0.0078796751  0.0009801324   8.039 0.000000000000001464
#> NumDealsPurchases       0.2288500501  0.0278598811   8.214 0.000000000000000361
#> NumCatalogPurchases    -0.0051399910  0.0248487795  -0.207              0.83615
#> NumStorePurchases       0.1710622355  0.0196050252   8.725 < 0.0000000000000002
#> NumWebVisitsMonth       0.3390451102  0.0266330644  12.730 < 0.0000000000000002
#> Response1               0.2964918904  0.1322867408   2.241              0.02511
#> Complain1               0.2585588708  0.4324802329   0.598              0.55000
#>                           
#> (Intercept)               
#> Id                        
#> Year_Birth                
#> EducationBasic            
#> EducationGraduation       
#> EducationMaster           
#> EducationPhD              
#> Marital_StatusAlone    .  
#> Marital_StatusDivorced    
#> Marital_StatusMarried     
#> Marital_StatusSingle      
#> Marital_StatusTogether    
#> Marital_StatusWidow       
#> Marital_StatusYOLO        
#> Income                 ***
#> Kidhome1               ***
#> Kidhome2               ** 
#> Teenhome1              ** 
#> Teenhome2                 
#> Dt_Customer            *  
#> Recency                   
#> MntWines               ***
#> MntFruits                 
#> MntMeatProducts           
#> MntFishProducts           
#> MntSweetProducts       ***
#> MntGoldProds           ***
#> NumDealsPurchases      ***
#> NumCatalogPurchases       
#> NumStorePurchases      ***
#> NumWebVisitsMonth      ***
#> Response1              *  
#> Complain1                 
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#> Residual standard error: 1.958 on 2183 degrees of freedom
#>   (24 observations deleted due to missingness)
#> Multiple R-squared:  0.497,  Adjusted R-squared:  0.4897 
#> F-statistic: 67.41 on 32 and 2183 DF,  p-value: < 0.00000000000000022

Buat model dengan data Korelasi

# Regression restricted to the six spending ("Mnt*") columns.
model_cor <- lm(
  NumWebPurchases ~ MntWines + MntFruits + MntMeatProducts +
    MntFishProducts + MntSweetProducts + MntGoldProds,
  data = Store_Clean1
)
model_cor
#> 
#> Call:
#> lm(formula = NumWebPurchases ~ MntWines + MntFruits + MntMeatProducts + 
#>     MntFishProducts + MntSweetProducts + MntGoldProds, data = Store_Clean1)
#> 
#> Coefficients:
#>      (Intercept)          MntWines         MntFruits   MntMeatProducts  
#>        2.3844172         0.0039358         0.0022276        -0.0015250  
#>  MntFishProducts  MntSweetProducts      MntGoldProds  
#>        0.0001332         0.0078853         0.0108705

Membuat Prediksi

# Store each model's predictions as new columns on a copy of the data
# (store_pred), leaving Store_Clean1 itself unchanged.
store_pred <- Store_Clean1
store_pred$pred_clean1 <- predict(
  object = model_clean1,
  newdata = Store_Clean1
)
store_pred$pred_all_store <- predict(
  object = model_all_store,
  newdata = Store_Clean1
)
store_pred$pred_model_cor <- predict(
  object = model_cor,
  newdata = Store_Clean1
)
head(store_pred)

Model Comparison

Tujuan: mendapatkan model terbaik untuk prediksi variabel target.

# Goodness of fit: R-squared for the single-predictor model, adjusted
# R-squared for the two multi-predictor models.
summary(model_clean1)[["r.squared"]]
#> [1] 0.09430449
summary(model_all_store)[["adj.r.squared"]]
#> [1] 0.4896637
summary(model_cor)[["adj.r.squared"]]
#> [1] 0.3618629

Model Evaluation

metrik error yang digunakan pada regresi:

Mean Absolute Error (MAE)

library(MLmetrics)
# Mean absolute error of the full model's predictions.
MAE(y_pred = store_pred$pred_all_store, y_true = Store_Clean1$NumWebPurchases)
#> [1] 1.312026
# Put the error in context with the range of the target it is measured on
# (NumWebPurchases), not the range of the MntMeatProducts predictor.
range(Store_Clean1$NumWebPurchases)
#> [1]  0 27

Mean Absolute Percentage Error (MAPE)

MAPE menunjukkan seberapa besar penyimpangannya dalam bentuk persentase.

# Mean absolute percentage error of the full model, scaled to percent.
# NOTE(review): the recorded result is Inf. MAPE divides every error by the
# actual value, and NumWebPurchases contains zeros (its range is 0 to 27), so
# the metric is not informative for this target as computed here.
MAPE(y_pred = store_pred$pred_all_store, y_true = Store_Clean1$NumWebPurchases)*100
#> [1] Inf

Mean Squared Error (MSE)

selisih kuadrat dari hasil prediksi dan nilai aktual kemudian dirata-rata

# Mean of the squared differences between actual and predicted web purchases.
MSE(y_true = Store_Clean1$NumWebPurchases, y_pred = store_pred$pred_all_store)
#> [1] 3.776966

Root Mean Squared Error (RMSE)

# Root mean squared error of the full model's predictions.
RMSE(
  y_true = Store_Clean1$NumWebPurchases,
  y_pred = store_pred$pred_all_store
)
#> [1] 1.943442
# Range of the target, for judging the size of the RMSE.
range(Store_Clean1$NumWebPurchases)
#> [1]  0 27

Step-wise

Step-wise regression membantu kita memilih prediktor yang baik, dengan cara mencari kombinasi prediktor yang menghasilkan model terbaik berdasarkan nilai AIC

Model Backward

# Backward elimination: start from the full model and search for a smaller
# model by AIC, without printing the intermediate steps.
model_backward <- stats::step(
  model_all_store,
  direction = "backward",
  trace = FALSE
)
# Coefficients and fit statistics of the selected model.
summary(object = model_backward)
#> 
#> Call:
#> lm(formula = NumWebPurchases ~ Income + Kidhome + Teenhome + 
#>     Dt_Customer + MntWines + MntFruits + MntMeatProducts + MntFishProducts + 
#>     MntSweetProducts + MntGoldProds + NumDealsPurchases + NumStorePurchases + 
#>     NumWebVisitsMonth + Response, data = store_Clean)
#> 
#> Residuals:
#>     Min      1Q  Median      3Q     Max 
#> -9.3212 -0.9458 -0.1334  0.9153 23.9251 
#> 
#> Coefficients:
#>                       Estimate   Std. Error t value             Pr(>|t|)    
#> (Intercept)       -8.313975021  3.239692525  -2.566             0.010345 *  
#> Income             0.000014762  0.000002415   6.112  0.00000000116259593 ***
#> Kidhome1          -0.881187444  0.116349348  -7.574  0.00000000000005309 ***
#> Kidhome2          -0.809106905  0.304560467  -2.657             0.007949 ** 
#> Teenhome1          0.365722380  0.102975140   3.552             0.000391 ***
#> Teenhome2          0.514444923  0.287667672   1.788             0.073860 .  
#> Dt_Customer        0.000455682  0.000199354   2.286             0.022361 *  
#> MntWines           0.002062428  0.000191867  10.749 < 0.0000000000000002 ***
#> MntFruits          0.002156572  0.001453187   1.484             0.137945    
#> MntMeatProducts   -0.000482000  0.000300620  -1.603             0.109001    
#> MntFishProducts    0.001564030  0.001094575   1.429             0.153177    
#> MntSweetProducts   0.007408438  0.001385002   5.349  0.00000009758678639 ***
#> MntGoldProds       0.007550023  0.000955849   7.899  0.00000000000000441 ***
#> NumDealsPurchases  0.227970099  0.027201507   8.381 < 0.0000000000000002 ***
#> NumStorePurchases  0.171894790  0.019414933   8.854 < 0.0000000000000002 ***
#> NumWebVisitsMonth  0.339253666  0.025945182  13.076 < 0.0000000000000002 ***
#> Response1          0.301035924  0.126114476   2.387             0.017070 *  
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#> Residual standard error: 1.956 on 2199 degrees of freedom
#>   (24 observations deleted due to missingness)
#> Multiple R-squared:  0.4947, Adjusted R-squared:  0.491 
#> F-statistic: 134.5 on 16 and 2199 DF,  p-value: < 0.00000000000000022

Model forward

# Forward selection: start from the intercept-only model and add predictors
# one at a time, with the full model as the upper bound of the search.
model_store_none <- lm(formula = NumWebPurchases ~ 1, data = Store_Clean1)
model_forward <- stats::step(
  model_store_none,
  scope = list(lower = model_store_none, upper = model_all_store),
  direction = "forward"
)
#> Start:  AIC=4469.81
#> NumWebPurchases ~ 1
#> 
#>                       Df Sum of Sq   RSS    AIC
#> + MntWines             1    5103.4 11538 3660.2
#> + NumStorePurchases    1    4434.9 12206 3785.0
#> + MntGoldProds         1    2757.4 13883 4070.3
#> + Income               1    2503.6 14137 4110.5
#> + NumCatalogPurchases  1    2490.6 14150 4112.5
#> + Kidhome              2    2447.3 14194 4121.3
#> + MntSweetProducts     1    1855.7 14785 4209.8
#> + MntMeatProducts      1    1569.3 15072 4252.3
#> + MntFruits            1    1518.1 15123 4259.8
#> + MntFishProducts      1    1494.6 15146 4263.3
#> + NumDealsPurchases    1     970.1 15671 4338.7
#> + Dt_Customer          1     494.2 16147 4405.0
#> + Teenhome             2     450.7 16190 4413.0
#> + Year_Birth           1     389.8 16251 4419.3
#> + Response             1     381.6 16259 4420.4
#> + Education            4     340.9 16300 4431.9
#> + NumWebVisitsMonth    1      43.7 16597 4466.0
#> <none>                             16641 4469.8
#> + Id                   1       5.7 16635 4471.1
#> + Complain             1       4.6 16636 4471.2
#> + Recency              1       0.5 16640 4471.7
#> + Marital_Status       7      79.5 16561 4473.2
#> 
#> Step:  AIC=3660.17
#> NumWebPurchases ~ MntWines
#> 
#>                       Df Sum of Sq   RSS    AIC
#> + NumDealsPurchases    1    930.99 10606 3475.7
#> + NumStorePurchases    1    738.00 10800 3515.7
#> + MntGoldProds         1    707.13 10830 3522.0
#> + Teenhome             2    454.46 11083 3575.1
#> + NumWebVisitsMonth    1    299.82 11238 3603.8
#> + MntSweetProducts     1    272.34 11265 3609.2
#> + Kidhome              2    225.81 11312 3620.4
#> + MntFruits            1    150.57 11387 3633.1
#> + Dt_Customer          1    137.77 11400 3635.6
#> + MntFishProducts      1    124.73 11413 3638.1
#> + Income               1    113.75 11424 3640.2
#> + Year_Birth           1     71.59 11466 3648.4
#> + Education            4     77.50 11460 3653.2
#> + NumCatalogPurchases  1     34.83 11503 3655.5
#> <none>                             11538 3660.2
#> + Response             1      4.00 11534 3661.4
#> + Recency              1      3.43 11534 3661.5
#> + MntMeatProducts      1      1.55 11536 3661.9
#> + Id                   1      0.77 11537 3662.0
#> + Complain             1      0.45 11537 3662.1
#> + Marital_Status       7     47.43 11490 3665.0
#> 
#> Step:  AIC=3475.73
#> NumWebPurchases ~ MntWines + NumDealsPurchases
#> 
#>                       Df Sum of Sq     RSS    AIC
#> + MntGoldProds         1    626.01  9980.5 3342.9
#> + NumStorePurchases    1    617.21  9989.3 3344.9
#> + Kidhome              2    587.67 10018.8 3353.4
#> + MntSweetProducts     1    434.09 10172.4 3385.1
#> + MntFruits            1    289.94 10316.5 3416.3
#> + MntFishProducts      1    264.36 10342.1 3421.8
#> + Income               1    197.38 10409.1 3436.1
#> + Teenhome             2    102.48 10504.0 3458.2
#> + Year_Birth           1     44.93 10561.6 3468.3
#> + NumCatalogPurchases  1     43.62 10562.9 3468.6
#> + Education            4     71.32 10535.2 3468.8
#> + NumWebVisitsMonth    1     42.64 10563.8 3468.8
#> + Dt_Customer          1     32.53 10574.0 3470.9
#> + MntMeatProducts      1     12.14 10594.3 3475.2
#> <none>                             10606.5 3475.7
#> + Response             1      3.85 10602.6 3476.9
#> + Recency              1      3.65 10602.8 3477.0
#> + Complain             1      0.42 10606.1 3477.6
#> + Id                   1      0.13 10606.4 3477.7
#> + Marital_Status       7     23.87 10582.6 3484.7
#> 
#> Step:  AIC=3342.92
#> NumWebPurchases ~ MntWines + NumDealsPurchases + MntGoldProds
#> 
#>                       Df Sum of Sq    RSS    AIC
#> + NumStorePurchases    1    417.17 9563.3 3250.3
#> + Kidhome              2    361.40 9619.1 3265.2
#> + MntSweetProducts     1    226.13 9754.3 3294.1
#> + NumWebVisitsMonth    1    120.28 9860.2 3318.0
#> + Teenhome             2    128.79 9851.7 3318.1
#> + Income               1    114.34 9866.1 3319.4
#> + MntFruits            1     99.63 9880.8 3322.7
#> + MntFishProducts      1     70.17 9910.3 3329.3
#> + Year_Birth           1     45.37 9935.1 3334.8
#> + Dt_Customer          1     12.63 9967.8 3342.1
#> + Education            4     36.51 9944.0 3342.8
#> <none>                             9980.5 3342.9
#> + Recency              1      4.94 9975.5 3343.8
#> + MntMeatProducts      1      1.70 9978.8 3344.5
#> + Complain             1      1.16 9979.3 3344.7
#> + Response             1      0.55 9979.9 3344.8
#> + Id                   1      0.15 9980.3 3344.9
#> + NumCatalogPurchases  1      0.06 9980.4 3344.9
#> + Marital_Status       7     34.01 9946.5 3349.4
#> 
#> Step:  AIC=3250.3
#> NumWebPurchases ~ MntWines + NumDealsPurchases + MntGoldProds + 
#>     NumStorePurchases
#> 
#>                       Df Sum of Sq    RSS    AIC
#> + NumWebVisitsMonth    1    369.04 9194.3 3165.1
#> + Kidhome              2    190.36 9372.9 3209.7
#> + Teenhome             2    107.62 9455.7 3229.2
#> + MntSweetProducts     1     97.76 9465.6 3229.5
#> + Year_Birth           1     37.41 9525.9 3243.6
#> + Income               1     34.44 9528.9 3244.3
#> + MntMeatProducts      1     25.23 9538.1 3246.4
#> + MntFruits            1     20.85 9542.5 3247.5
#> + Response             1     19.47 9543.8 3247.8
#> + Dt_Customer          1     15.76 9547.6 3248.6
#> + NumCatalogPurchases  1     10.67 9552.6 3249.8
#> + MntFishProducts      1      9.94 9553.4 3250.0
#> <none>                             9563.3 3250.3
#> + Recency              1      3.55 9559.8 3251.5
#> + Complain             1      0.61 9562.7 3252.2
#> + Id                   1      0.09 9563.2 3252.3
#> + Education            4     25.78 9537.5 3252.3
#> + Marital_Status       7     33.91 9529.4 3256.4
#> 
#> Step:  AIC=3165.09
#> NumWebPurchases ~ MntWines + NumDealsPurchases + MntGoldProds + 
#>     NumStorePurchases + NumWebVisitsMonth
#> 
#>                       Df Sum of Sq    RSS    AIC
#> + Kidhome              2    317.23 8877.0 3091.3
#> + Income               1    227.67 8966.6 3111.5
#> + MntSweetProducts     1    204.55 8989.7 3117.2
#> + Teenhome             2    103.53 9090.7 3144.0
#> + MntFruits            1     72.22 9122.0 3149.6
#> + Year_Birth           1     66.71 9127.6 3151.0
#> + MntFishProducts      1     60.81 9133.5 3152.4
#> + NumCatalogPurchases  1     26.30 9168.0 3160.7
#> + Response             1     13.02 9181.2 3164.0
#> + Education            4     33.59 9160.7 3165.0
#> <none>                             9194.3 3165.1
#> + MntMeatProducts      1      6.69 9187.6 3165.5
#> + Dt_Customer          1      4.09 9190.2 3166.1
#> + Recency              1      2.18 9192.1 3166.6
#> + Complain             1      0.34 9193.9 3167.0
#> + Id                   1      0.08 9194.2 3167.1
#> + Marital_Status       7     27.83 9166.4 3172.4
#> 
#> Step:  AIC=3091.28
#> NumWebPurchases ~ MntWines + NumDealsPurchases + MntGoldProds + 
#>     NumStorePurchases + NumWebVisitsMonth + Kidhome
#> 
#>                       Df Sum of Sq    RSS    AIC
#> + Income               1   213.803 8663.2 3039.3
#> + MntSweetProducts     1   179.055 8698.0 3048.1
#> + MntFruits            1    61.293 8815.7 3077.9
#> + MntFishProducts      1    50.212 8826.8 3080.7
#> + Teenhome             2    41.697 8835.3 3084.8
#> + Year_Birth           1    17.678 8859.4 3088.9
#> + Response             1    16.634 8860.4 3089.1
#> + Education            4    35.243 8841.8 3090.5
#> + Dt_Customer          1    10.492 8866.5 3090.7
#> <none>                             8877.0 3091.3
#> + NumCatalogPurchases  1     5.379 8871.7 3091.9
#> + MntMeatProducts      1     3.225 8873.8 3092.5
#> + Recency              1     1.229 8875.8 3093.0
#> + Complain             1     0.953 8876.1 3093.1
#> + Id                   1     0.039 8877.0 3093.3
#> + Marital_Status       7    23.664 8853.4 3099.4
#> 
#> Step:  AIC=3039.26
#> NumWebPurchases ~ MntWines + NumDealsPurchases + MntGoldProds + 
#>     NumStorePurchases + NumWebVisitsMonth + Kidhome + Income
#> 
#>                       Df Sum of Sq    RSS    AIC
#> + MntSweetProducts     1   136.056 8527.2 3006.2
#> + MntFruits            1    39.098 8624.1 3031.2
#> + MntFishProducts      1    32.010 8631.2 3033.1
#> + Teenhome             2    29.172 8634.1 3035.8
#> + Year_Birth           1    12.461 8650.8 3038.1
#> + Response             1    11.426 8651.8 3038.3
#> + Dt_Customer          1    10.945 8652.3 3038.5
#> <none>                             8663.2 3039.3
#> + MntMeatProducts      1     1.257 8662.0 3040.9
#> + Complain             1     1.017 8662.2 3041.0
#> + Recency              1     0.557 8662.7 3041.1
#> + Id                   1     0.057 8663.2 3041.2
#> + NumCatalogPurchases  1     0.003 8663.2 3041.3
#> + Education            4    17.279 8646.0 3042.8
#> + Marital_Status       7    21.205 8642.0 3047.8
#> 
#> Step:  AIC=3006.18
#> NumWebPurchases ~ MntWines + NumDealsPurchases + MntGoldProds + 
#>     NumStorePurchases + NumWebVisitsMonth + Kidhome + Income + 
#>     MntSweetProducts
#> 
#>                       Df Sum of Sq    RSS    AIC
#> + Teenhome             2    57.478 8469.7 2995.2
#> + Year_Birth           1    21.683 8505.5 3002.5
#> + Dt_Customer          1    20.715 8506.5 3002.8
#> + MntMeatProducts      1    19.558 8507.6 3003.1
#> <none>                             8527.2 3006.2
#> + Response             1     6.705 8520.5 3006.4
#> + NumCatalogPurchases  1     5.179 8522.0 3006.8
#> + MntFruits            1     4.680 8522.5 3007.0
#> + Education            4    27.290 8499.9 3007.1
#> + MntFishProducts      1     2.035 8525.1 3007.7
#> + Complain             1     1.117 8526.1 3007.9
#> + Recency              1     1.066 8526.1 3007.9
#> + Id                   1     0.022 8527.2 3008.2
#> + Marital_Status       7    20.623 8506.5 3014.8
#> 
#> Step:  AIC=2995.19
#> NumWebPurchases ~ MntWines + NumDealsPurchases + MntGoldProds + 
#>     NumStorePurchases + NumWebVisitsMonth + Kidhome + Income + 
#>     MntSweetProducts + Teenhome
#> 
#>                       Df Sum of Sq    RSS    AIC
#> + Dt_Customer          1   15.6894 8454.0 2993.1
#> + Response             1   15.1688 8454.5 2993.2
#> + MntFruits            1   10.2831 8459.4 2994.5
#> + MntFishProducts      1    7.7193 8462.0 2995.2
#> <none>                             8469.7 2995.2
#> + Year_Birth           1    5.3770 8464.3 2995.8
#> + MntMeatProducts      1    3.8232 8465.9 2996.2
#> + Recency              1    1.3610 8468.3 2996.8
#> + Complain             1    1.0788 8468.6 2996.9
#> + NumCatalogPurchases  1    0.6187 8469.1 2997.0
#> + Id                   1    0.0465 8469.6 2997.2
#> + Education            4   18.8141 8450.9 2998.3
#> + Marital_Status       7   16.7856 8452.9 3004.8
#> 
#> Step:  AIC=2993.08
#> NumWebPurchases ~ MntWines + NumDealsPurchases + MntGoldProds + 
#>     NumStorePurchases + NumWebVisitsMonth + Kidhome + Income + 
#>     MntSweetProducts + Teenhome + Dt_Customer
#> 
#>                       Df Sum of Sq    RSS    AIC
#> + Response             1   19.3974 8434.6 2990.0
#> + MntFruits            1   11.1183 8442.9 2992.2
#> + MntFishProducts      1    9.2602 8444.7 2992.7
#> <none>                             8454.0 2993.1
#> + Year_Birth           1    5.1240 8448.9 2993.7
#> + MntMeatProducts      1    2.5636 8451.4 2994.4
#> + Complain             1    1.5395 8452.5 2994.7
#> + Recency              1    1.0405 8453.0 2994.8
#> + NumCatalogPurchases  1    0.2148 8453.8 2995.0
#> + Id                   1    0.0351 8454.0 2995.1
#> + Education            4   16.4910 8437.5 2996.8
#> + Marital_Status       7   16.2500 8437.8 3002.8
#> 
#> Step:  AIC=2989.99
#> NumWebPurchases ~ MntWines + NumDealsPurchases + MntGoldProds + 
#>     NumStorePurchases + NumWebVisitsMonth + Kidhome + Income + 
#>     MntSweetProducts + Teenhome + Dt_Customer + Response
#> 
#>                       Df Sum of Sq    RSS    AIC
#> + MntFruits            1   10.2930 8424.3 2989.3
#> + MntFishProducts      1    9.5043 8425.1 2989.5
#> <none>                             8434.6 2990.0
#> + Year_Birth           1    4.9644 8429.6 2990.7
#> + MntMeatProducts      1    4.3963 8430.2 2990.8
#> + Complain             1    1.4794 8433.1 2991.6
#> + NumCatalogPurchases  1    0.6914 8433.9 2991.8
#> + Id                   1    0.0132 8434.6 2992.0
#> + Recency              1    0.0007 8434.6 2992.0
#> + Education            4   13.3887 8421.2 2994.5
#> + Marital_Status       7   17.5333 8417.1 2999.4
#> 
#> Step:  AIC=2989.29
#> NumWebPurchases ~ MntWines + NumDealsPurchases + MntGoldProds + 
#>     NumStorePurchases + NumWebVisitsMonth + Kidhome + Income + 
#>     MntSweetProducts + Teenhome + Dt_Customer + Response + MntFruits
#> 
#>                       Df Sum of Sq    RSS    AIC
#> <none>                             8424.3 2989.3
#> + MntMeatProducts      1    7.4429 8416.9 2989.3
#> + MntFishProducts      1    5.4199 8418.9 2989.9
#> + Year_Birth           1    5.2340 8419.1 2989.9
#> + Complain             1    1.3382 8423.0 2990.9
#> + NumCatalogPurchases  1    1.2357 8423.1 2991.0
#> + Id                   1    0.0248 8424.3 2991.3
#> + Recency              1    0.0011 8424.3 2991.3
#> + Education            4   14.4745 8409.8 2993.5
#> + Marital_Status       7   18.0344 8406.3 2998.5
# Summary of the intercept-only baseline model (NumWebPurchases ~ 1)
summary(model_store_none)
#> 
#> Call:
#> lm(formula = NumWebPurchases ~ 1, data = Store_Clean1)
#> 
#> Residuals:
#>     Min      1Q  Median      3Q     Max 
#> -4.0853 -2.0853 -0.0853  1.9147 22.9147 
#> 
#> Coefficients:
#>             Estimate Std. Error t value            Pr(>|t|)    
#> (Intercept)  4.08529    0.05823   70.16 <0.0000000000000002 ***
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#> Residual standard error: 2.741 on 2215 degrees of freedom
# Summary of model_forward (presumably the forward stepwise result; its
# formula matches the last step of the trace printed above)
summary(model_forward)
#> 
#> Call:
#> lm(formula = NumWebPurchases ~ MntWines + NumDealsPurchases + 
#>     MntGoldProds + NumStorePurchases + NumWebVisitsMonth + Kidhome + 
#>     Income + MntSweetProducts + Teenhome + Dt_Customer + Response + 
#>     MntFruits, data = Store_Clean1)
#> 
#> Residuals:
#>     Min      1Q  Median      3Q     Max 
#> -9.3201 -0.9560 -0.1447  0.9101 23.9763 
#> 
#> Coefficients:
#>                       Estimate   Std. Error t value             Pr(>|t|)    
#> (Intercept)       -8.460004215  3.224269858  -2.624              0.00875 ** 
#> MntWines           0.001990440  0.000186305  10.684 < 0.0000000000000002 ***
#> NumDealsPurchases  0.221399793  0.026909244   8.228 0.000000000000000323 ***
#> MntGoldProds       0.007781722  0.000942617   8.255 0.000000000000000258 ***
#> NumStorePurchases  0.175033884  0.019354607   9.044 < 0.0000000000000002 ***
#> NumWebVisitsMonth  0.346219548  0.024995291  13.851 < 0.0000000000000002 ***
#> Kidhome1          -0.869893458  0.115772225  -7.514 0.000000000000083007 ***
#> Kidhome2          -0.805239062  0.304634404  -2.643              0.00827 ** 
#> Income             0.000014175  0.000002368   5.987 0.000000002486904851 ***
#> MntSweetProducts   0.007596427  0.001333106   5.698 0.000000013724060582 ***
#> Teenhome1          0.396645348  0.097814690   4.055 0.000051863498942279 ***
#> Teenhome2          0.539861619  0.286069018   1.887              0.05927 .  
#> Dt_Customer        0.000462151  0.000198655   2.326              0.02009 *  
#> Response1          0.276195265  0.125384542   2.203              0.02771 *  
#> MntFruits          0.002279765  0.001390198   1.640              0.10117    
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#> Residual standard error: 1.956 on 2201 degrees of freedom
#> Multiple R-squared:  0.4938, Adjusted R-squared:  0.4905 
#> F-statistic: 153.3 on 14 and 2201 DF,  p-value: < 0.00000000000000022

Model Both

Kombinasi antara backward dan forward

# Stepwise selection in both directions: start from the intercept-only
# model and let terms be added or dropped, with model_all_store as the
# upper bound of the search scope.
model_both <- stats::step(
  object = model_store_none,
  scope = list(upper = model_all_store),
  direction = "both"
)
#> Start:  AIC=4469.81
#> NumWebPurchases ~ 1
#> 
#>                       Df Sum of Sq   RSS    AIC
#> + MntWines             1    5103.4 11538 3660.2
#> + NumStorePurchases    1    4434.9 12206 3785.0
#> + MntGoldProds         1    2757.4 13883 4070.3
#> + Income               1    2503.6 14137 4110.5
#> + NumCatalogPurchases  1    2490.6 14150 4112.5
#> + Kidhome              2    2447.3 14194 4121.3
#> + MntSweetProducts     1    1855.7 14785 4209.8
#> + MntMeatProducts      1    1569.3 15072 4252.3
#> + MntFruits            1    1518.1 15123 4259.8
#> + MntFishProducts      1    1494.6 15146 4263.3
#> + NumDealsPurchases    1     970.1 15671 4338.7
#> + Dt_Customer          1     494.2 16147 4405.0
#> + Teenhome             2     450.7 16190 4413.0
#> + Year_Birth           1     389.8 16251 4419.3
#> + Response             1     381.6 16259 4420.4
#> + Education            4     340.9 16300 4431.9
#> + NumWebVisitsMonth    1      43.7 16597 4466.0
#> <none>                             16641 4469.8
#> + Id                   1       5.7 16635 4471.1
#> + Complain             1       4.6 16636 4471.2
#> + Recency              1       0.5 16640 4471.7
#> + Marital_Status       7      79.5 16561 4473.2
#> 
#> Step:  AIC=3660.17
#> NumWebPurchases ~ MntWines
#> 
#>                       Df Sum of Sq   RSS    AIC
#> + NumDealsPurchases    1     931.0 10606 3475.7
#> + NumStorePurchases    1     738.0 10800 3515.7
#> + MntGoldProds         1     707.1 10830 3522.0
#> + Teenhome             2     454.5 11083 3575.1
#> + NumWebVisitsMonth    1     299.8 11238 3603.8
#> + MntSweetProducts     1     272.3 11265 3609.2
#> + Kidhome              2     225.8 11312 3620.4
#> + MntFruits            1     150.6 11387 3633.1
#> + Dt_Customer          1     137.8 11400 3635.6
#> + MntFishProducts      1     124.7 11413 3638.1
#> + Income               1     113.7 11424 3640.2
#> + Year_Birth           1      71.6 11466 3648.4
#> + Education            4      77.5 11460 3653.2
#> + NumCatalogPurchases  1      34.8 11503 3655.5
#> <none>                             11538 3660.2
#> + Response             1       4.0 11534 3661.4
#> + Recency              1       3.4 11534 3661.5
#> + MntMeatProducts      1       1.5 11536 3661.9
#> + Id                   1       0.8 11537 3662.0
#> + Complain             1       0.5 11537 3662.1
#> + Marital_Status       7      47.4 11490 3665.0
#> - MntWines             1    5103.4 16641 4469.8
#> 
#> Step:  AIC=3475.73
#> NumWebPurchases ~ MntWines + NumDealsPurchases
#> 
#>                       Df Sum of Sq     RSS    AIC
#> + MntGoldProds         1     626.0  9980.5 3342.9
#> + NumStorePurchases    1     617.2  9989.3 3344.9
#> + Kidhome              2     587.7 10018.8 3353.4
#> + MntSweetProducts     1     434.1 10172.4 3385.1
#> + MntFruits            1     289.9 10316.5 3416.3
#> + MntFishProducts      1     264.4 10342.1 3421.8
#> + Income               1     197.4 10409.1 3436.1
#> + Teenhome             2     102.5 10504.0 3458.2
#> + Year_Birth           1      44.9 10561.6 3468.3
#> + NumCatalogPurchases  1      43.6 10562.9 3468.6
#> + Education            4      71.3 10535.2 3468.8
#> + NumWebVisitsMonth    1      42.6 10563.8 3468.8
#> + Dt_Customer          1      32.5 10574.0 3470.9
#> + MntMeatProducts      1      12.1 10594.3 3475.2
#> <none>                             10606.5 3475.7
#> + Response             1       3.8 10602.6 3476.9
#> + Recency              1       3.7 10602.8 3477.0
#> + Complain             1       0.4 10606.1 3477.6
#> + Id                   1       0.1 10606.4 3477.7
#> + Marital_Status       7      23.9 10582.6 3484.7
#> - NumDealsPurchases    1     931.0 11537.5 3660.2
#> - MntWines             1    5064.3 15670.8 4338.7
#> 
#> Step:  AIC=3342.92
#> NumWebPurchases ~ MntWines + NumDealsPurchases + MntGoldProds
#> 
#>                       Df Sum of Sq     RSS    AIC
#> + NumStorePurchases    1    417.17  9563.3 3250.3
#> + Kidhome              2    361.40  9619.1 3265.2
#> + MntSweetProducts     1    226.13  9754.3 3294.1
#> + NumWebVisitsMonth    1    120.28  9860.2 3318.0
#> + Teenhome             2    128.79  9851.7 3318.1
#> + Income               1    114.34  9866.1 3319.4
#> + MntFruits            1     99.63  9880.8 3322.7
#> + MntFishProducts      1     70.17  9910.3 3329.3
#> + Year_Birth           1     45.37  9935.1 3334.8
#> + Dt_Customer          1     12.63  9967.8 3342.1
#> + Education            4     36.51  9944.0 3342.8
#> <none>                              9980.5 3342.9
#> + Recency              1      4.94  9975.5 3343.8
#> + MntMeatProducts      1      1.70  9978.8 3344.5
#> + Complain             1      1.16  9979.3 3344.7
#> + Response             1      0.55  9979.9 3344.8
#> + Id                   1      0.15  9980.3 3344.9
#> + NumCatalogPurchases  1      0.06  9980.4 3344.9
#> + Marital_Status       7     34.01  9946.5 3349.4
#> - MntGoldProds         1    626.01 10606.5 3475.7
#> - NumDealsPurchases    1    849.87 10830.3 3522.0
#> - MntWines             1   3093.09 13073.6 3939.2
#> 
#> Step:  AIC=3250.3
#> NumWebPurchases ~ MntWines + NumDealsPurchases + MntGoldProds + 
#>     NumStorePurchases
#> 
#>                       Df Sum of Sq     RSS    AIC
#> + NumWebVisitsMonth    1    369.04  9194.3 3165.1
#> + Kidhome              2    190.36  9372.9 3209.7
#> + Teenhome             2    107.62  9455.7 3229.2
#> + MntSweetProducts     1     97.76  9465.6 3229.5
#> + Year_Birth           1     37.41  9525.9 3243.6
#> + Income               1     34.44  9528.9 3244.3
#> + MntMeatProducts      1     25.23  9538.1 3246.4
#> + MntFruits            1     20.85  9542.5 3247.5
#> + Response             1     19.47  9543.8 3247.8
#> + Dt_Customer          1     15.76  9547.6 3248.6
#> + NumCatalogPurchases  1     10.67  9552.6 3249.8
#> + MntFishProducts      1      9.94  9553.4 3250.0
#> <none>                              9563.3 3250.3
#> + Recency              1      3.55  9559.8 3251.5
#> + Complain             1      0.61  9562.7 3252.2
#> + Id                   1      0.09  9563.2 3252.3
#> + Education            4     25.78  9537.5 3252.3
#> + Marital_Status       7     33.91  9529.4 3256.4
#> - NumStorePurchases    1    417.17  9980.5 3342.9
#> - MntGoldProds         1    425.97  9989.3 3344.9
#> - NumDealsPurchases    1    764.84 10328.1 3418.8
#> - MntWines             1   1133.74 10697.0 3496.6
#> 
#> Step:  AIC=3165.09
#> NumWebPurchases ~ MntWines + NumDealsPurchases + MntGoldProds + 
#>     NumStorePurchases + NumWebVisitsMonth
#> 
#>                       Df Sum of Sq     RSS    AIC
#> + Kidhome              2    317.23  8877.0 3091.3
#> + Income               1    227.67  8966.6 3111.5
#> + MntSweetProducts     1    204.55  8989.7 3117.2
#> + Teenhome             2    103.53  9090.7 3144.0
#> + MntFruits            1     72.22  9122.0 3149.6
#> + Year_Birth           1     66.71  9127.6 3151.0
#> + MntFishProducts      1     60.81  9133.5 3152.4
#> + NumCatalogPurchases  1     26.30  9168.0 3160.7
#> + Response             1     13.02  9181.2 3164.0
#> + Education            4     33.59  9160.7 3165.0
#> <none>                              9194.3 3165.1
#> + MntMeatProducts      1      6.69  9187.6 3165.5
#> + Dt_Customer          1      4.09  9190.2 3166.1
#> + Recency              1      2.18  9192.1 3166.6
#> + Complain             1      0.34  9193.9 3167.0
#> + Id                   1      0.08  9194.2 3167.1
#> + Marital_Status       7     27.83  9166.4 3172.4
#> - NumDealsPurchases    1    290.19  9484.5 3232.0
#> - NumWebVisitsMonth    1    369.04  9563.3 3250.3
#> - MntGoldProds         1    512.06  9706.3 3283.2
#> - NumStorePurchases    1    665.93  9860.2 3318.0
#> - MntWines             1   1170.00 10364.3 3428.5
#> 
#> Step:  AIC=3091.28
#> NumWebPurchases ~ MntWines + NumDealsPurchases + MntGoldProds + 
#>     NumStorePurchases + NumWebVisitsMonth + Kidhome
#> 
#>                       Df Sum of Sq    RSS    AIC
#> + Income               1    213.80 8663.2 3039.3
#> + MntSweetProducts     1    179.05 8698.0 3048.1
#> + MntFruits            1     61.29 8815.7 3077.9
#> + MntFishProducts      1     50.21 8826.8 3080.7
#> + Teenhome             2     41.70 8835.3 3084.9
#> + Year_Birth           1     17.68 8859.4 3088.9
#> + Response             1     16.63 8860.4 3089.1
#> + Education            4     35.24 8841.8 3090.5
#> + Dt_Customer          1     10.49 8866.5 3090.7
#> <none>                             8877.0 3091.3
#> + NumCatalogPurchases  1      5.38 8871.7 3091.9
#> + MntMeatProducts      1      3.23 8873.8 3092.5
#> + Recency              1      1.23 8875.8 3093.0
#> + Complain             1      0.95 8876.1 3093.0
#> + Id                   1      0.04 8877.0 3093.3
#> + Marital_Status       7     23.66 8853.4 3099.4
#> - Kidhome              2    317.23 9194.3 3165.1
#> - MntGoldProds         1    377.46 9254.5 3181.6
#> - NumDealsPurchases    1    423.08 9300.1 3192.5
#> - NumStorePurchases    1    461.96 9339.0 3201.7
#> - NumWebVisitsMonth    1    495.91 9372.9 3209.7
#> - MntWines             1    834.10 9711.1 3288.3
#> 
#> Step:  AIC=3039.26
#> NumWebPurchases ~ MntWines + NumDealsPurchases + MntGoldProds + 
#>     NumStorePurchases + NumWebVisitsMonth + Kidhome + Income
#> 
#>                       Df Sum of Sq    RSS    AIC
#> + MntSweetProducts     1    136.06 8527.2 3006.2
#> + MntFruits            1     39.10 8624.1 3031.2
#> + MntFishProducts      1     32.01 8631.2 3033.1
#> + Teenhome             2     29.17 8634.1 3035.8
#> + Year_Birth           1     12.46 8650.8 3038.1
#> + Response             1     11.43 8651.8 3038.3
#> + Dt_Customer          1     10.95 8652.3 3038.5
#> <none>                             8663.2 3039.3
#> + MntMeatProducts      1      1.26 8662.0 3040.9
#> + Complain             1      1.02 8662.2 3041.0
#> + Recency              1      0.56 8662.7 3041.1
#> + Id                   1      0.06 8663.2 3041.2
#> + NumCatalogPurchases  1      0.00 8663.2 3041.3
#> + Education            4     17.28 8646.0 3042.8
#> + Marital_Status       7     21.20 8642.0 3047.8
#> - Income               1    213.80 8877.0 3091.3
#> - Kidhome              2    303.36 8966.6 3111.5
#> - MntGoldProds         1    350.57 9013.8 3125.2
#> - NumDealsPurchases    1    388.72 9052.0 3134.5
#> - NumStorePurchases    1    399.36 9062.6 3137.1
#> - MntWines             1    482.53 9145.8 3157.4
#> - NumWebVisitsMonth    1    689.67 9352.9 3207.0
#> 
#> Step:  AIC=3006.18
#> NumWebPurchases ~ MntWines + NumDealsPurchases + MntGoldProds + 
#>     NumStorePurchases + NumWebVisitsMonth + Kidhome + Income + 
#>     MntSweetProducts
#> 
#>                       Df Sum of Sq    RSS    AIC
#> + Teenhome             2     57.48 8469.7 2995.2
#> + Year_Birth           1     21.68 8505.5 3002.5
#> + Dt_Customer          1     20.72 8506.5 3002.8
#> + MntMeatProducts      1     19.56 8507.6 3003.1
#> <none>                             8527.2 3006.2
#> + Response             1      6.70 8520.5 3006.4
#> + NumCatalogPurchases  1      5.18 8522.0 3006.8
#> + MntFruits            1      4.68 8522.5 3007.0
#> + Education            4     27.29 8499.9 3007.1
#> + MntFishProducts      1      2.03 8525.1 3007.7
#> + Complain             1      1.12 8526.1 3007.9
#> + Recency              1      1.07 8526.1 3007.9
#> + Id                   1      0.02 8527.2 3008.2
#> + Marital_Status       7     20.62 8506.5 3014.8
#> - MntSweetProducts     1    136.06 8663.2 3039.3
#> - Income               1    170.80 8698.0 3048.1
#> - MntGoldProds         1    270.35 8797.5 3073.3
#> - Kidhome              2    282.67 8809.8 3074.4
#> - NumStorePurchases    1    316.40 8843.6 3084.9
#> - NumDealsPurchases    1    419.36 8946.5 3110.6
#> - MntWines             1    466.62 8993.8 3122.2
#> - NumWebVisitsMonth    1    762.43 9289.6 3194.0
#> 
#> Step:  AIC=2995.19
#> NumWebPurchases ~ MntWines + NumDealsPurchases + MntGoldProds + 
#>     NumStorePurchases + NumWebVisitsMonth + Kidhome + Income + 
#>     MntSweetProducts + Teenhome
#> 
#>                       Df Sum of Sq    RSS    AIC
#> + Dt_Customer          1     15.69 8454.0 2993.1
#> + Response             1     15.17 8454.5 2993.2
#> + MntFruits            1     10.28 8459.4 2994.5
#> + MntFishProducts      1      7.72 8462.0 2995.2
#> <none>                             8469.7 2995.2
#> + Year_Birth           1      5.38 8464.3 2995.8
#> + MntMeatProducts      1      3.82 8465.9 2996.2
#> + Recency              1      1.36 8468.3 2996.8
#> + Complain             1      1.08 8468.6 2996.9
#> + NumCatalogPurchases  1      0.62 8469.1 2997.0
#> + Id                   1      0.05 8469.6 2997.2
#> + Education            4     18.81 8450.9 2998.3
#> + Marital_Status       7     16.79 8452.9 3004.8
#> - Teenhome             2     57.48 8527.2 3006.2
#> - Income               1    150.89 8620.6 3032.3
#> - MntSweetProducts     1    164.36 8634.1 3035.8
#> - Kidhome              2    214.88 8684.6 3046.7
#> - NumDealsPurchases    1    252.87 8722.6 3058.4
#> - MntGoldProds         1    285.36 8755.1 3066.6
#> - NumStorePurchases    1    309.50 8779.2 3072.7
#> - MntWines             1    490.64 8960.3 3118.0
#> - NumWebVisitsMonth    1    738.38 9208.1 3178.4
#> 
#> Step:  AIC=2993.08
#> NumWebPurchases ~ MntWines + NumDealsPurchases + MntGoldProds + 
#>     NumStorePurchases + NumWebVisitsMonth + Kidhome + Income + 
#>     MntSweetProducts + Teenhome + Dt_Customer
#> 
#>                       Df Sum of Sq    RSS    AIC
#> + Response             1     19.40 8434.6 2990.0
#> + MntFruits            1     11.12 8442.9 2992.2
#> + MntFishProducts      1      9.26 8444.7 2992.7
#> <none>                             8454.0 2993.1
#> + Year_Birth           1      5.12 8448.9 2993.7
#> + MntMeatProducts      1      2.56 8451.4 2994.4
#> + Complain             1      1.54 8452.5 2994.7
#> + Recency              1      1.04 8453.0 2994.8
#> + NumCatalogPurchases  1      0.21 8453.8 2995.0
#> + Id                   1      0.04 8454.0 2995.1
#> - Dt_Customer          1     15.69 8469.7 2995.2
#> + Education            4     16.49 8437.5 2996.8
#> - Teenhome             2     52.45 8506.5 3002.8
#> + Marital_Status       7     16.25 8437.8 3002.8
#> - Income               1    150.84 8604.8 3030.3
#> - MntSweetProducts     1    172.21 8626.2 3035.8
#> - Kidhome              2    222.97 8677.0 3046.8
#> - NumDealsPurchases    1    263.33 8717.3 3059.1
#> - MntGoldProds         1    295.20 8749.2 3067.1
#> - NumStorePurchases    1    316.51 8770.5 3072.5
#> - MntWines             1    501.07 8955.1 3118.7
#> - NumWebVisitsMonth    1    733.05 9187.1 3175.4
#> 
#> Step:  AIC=2989.99
#> NumWebPurchases ~ MntWines + NumDealsPurchases + MntGoldProds + 
#>     NumStorePurchases + NumWebVisitsMonth + Kidhome + Income + 
#>     MntSweetProducts + Teenhome + Dt_Customer + Response
#> 
#>                       Df Sum of Sq    RSS    AIC
#> + MntFruits            1     10.29 8424.3 2989.3
#> + MntFishProducts      1      9.50 8425.1 2989.5
#> <none>                             8434.6 2990.0
#> + Year_Birth           1      4.96 8429.6 2990.7
#> + MntMeatProducts      1      4.40 8430.2 2990.8
#> + Complain             1      1.48 8433.1 2991.6
#> + NumCatalogPurchases  1      0.69 8433.9 2991.8
#> + Id                   1      0.01 8434.6 2992.0
#> + Recency              1      0.00 8434.6 2992.0
#> - Response             1     19.40 8454.0 2993.1
#> - Dt_Customer          1     19.92 8454.5 2993.2
#> + Education            4     13.39 8421.2 2994.5
#> + Marital_Status       7     17.53 8417.1 2999.4
#> - Teenhome             2     61.50 8496.1 3002.1
#> - Income               1    144.35 8579.0 3025.6
#> - MntSweetProducts     1    168.83 8603.4 3031.9
#> - Kidhome              2    223.11 8657.7 3043.8
#> - NumDealsPurchases    1    256.77 8691.4 3054.4
#> - MntGoldProds         1    286.22 8720.8 3061.9
#> - NumStorePurchases    1    333.83 8768.4 3074.0
#> - MntWines             1    435.73 8870.3 3099.6
#> - NumWebVisitsMonth    1    724.90 9159.5 3170.7
#> 
#> Step:  AIC=2989.29
#> NumWebPurchases ~ MntWines + NumDealsPurchases + MntGoldProds + 
#>     NumStorePurchases + NumWebVisitsMonth + Kidhome + Income + 
#>     MntSweetProducts + Teenhome + Dt_Customer + Response + MntFruits
#> 
#>                       Df Sum of Sq    RSS    AIC
#> <none>                             8424.3 2989.3
#> + MntMeatProducts      1      7.44 8416.9 2989.3
#> + MntFishProducts      1      5.42 8418.9 2989.9
#> + Year_Birth           1      5.23 8419.1 2989.9
#> - MntFruits            1     10.29 8434.6 2990.0
#> + Complain             1      1.34 8423.0 2990.9
#> + NumCatalogPurchases  1      1.24 8423.1 2991.0
#> + Id                   1      0.02 8424.3 2991.3
#> + Recency              1      0.00 8424.3 2991.3
#> - Response             1     18.57 8442.9 2992.2
#> - Dt_Customer          1     20.71 8445.0 2992.7
#> + Education            4     14.47 8409.8 2993.5
#> + Marital_Status       7     18.03 8406.3 2998.5
#> - Teenhome             2     66.94 8491.3 3002.8
#> - MntSweetProducts     1    124.28 8548.6 3019.7
#> - Income               1    137.20 8561.5 3023.1
#> - Kidhome              2    218.26 8642.6 3042.0
#> - NumDealsPurchases    1    259.10 8683.4 3054.4
#> - MntGoldProds         1    260.85 8685.2 3054.9
#> - NumStorePurchases    1    313.03 8737.3 3068.1
#> - MntWines             1    436.88 8861.2 3099.3
#> - NumWebVisitsMonth    1    734.35 9158.7 3172.5
# Summary of the model selected by the bidirectional stepwise search
summary(model_both)
#> 
#> Call:
#> lm(formula = NumWebPurchases ~ MntWines + NumDealsPurchases + 
#>     MntGoldProds + NumStorePurchases + NumWebVisitsMonth + Kidhome + 
#>     Income + MntSweetProducts + Teenhome + Dt_Customer + Response + 
#>     MntFruits, data = Store_Clean1)
#> 
#> Residuals:
#>     Min      1Q  Median      3Q     Max 
#> -9.3201 -0.9560 -0.1447  0.9101 23.9763 
#> 
#> Coefficients:
#>                       Estimate   Std. Error t value             Pr(>|t|)    
#> (Intercept)       -8.460004215  3.224269858  -2.624              0.00875 ** 
#> MntWines           0.001990440  0.000186305  10.684 < 0.0000000000000002 ***
#> NumDealsPurchases  0.221399793  0.026909244   8.228 0.000000000000000323 ***
#> MntGoldProds       0.007781722  0.000942617   8.255 0.000000000000000258 ***
#> NumStorePurchases  0.175033884  0.019354607   9.044 < 0.0000000000000002 ***
#> NumWebVisitsMonth  0.346219548  0.024995291  13.851 < 0.0000000000000002 ***
#> Kidhome1          -0.869893458  0.115772225  -7.514 0.000000000000083007 ***
#> Kidhome2          -0.805239062  0.304634404  -2.643              0.00827 ** 
#> Income             0.000014175  0.000002368   5.987 0.000000002486904851 ***
#> MntSweetProducts   0.007596427  0.001333106   5.698 0.000000013724060582 ***
#> Teenhome1          0.396645348  0.097814690   4.055 0.000051863498942279 ***
#> Teenhome2          0.539861619  0.286069018   1.887              0.05927 .  
#> Dt_Customer        0.000462151  0.000198655   2.326              0.02009 *  
#> Response1          0.276195265  0.125384542   2.203              0.02771 *  
#> MntFruits          0.002279765  0.001390198   1.640              0.10117    
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#> Residual standard error: 1.956 on 2201 degrees of freedom
#> Multiple R-squared:  0.4938, Adjusted R-squared:  0.4905 
#> F-statistic: 153.3 on 14 and 2201 DF,  p-value: < 0.00000000000000022

Model Comparison

Bandingkan nilai Adjusted R-squared untuk ketiga model:

# Adjusted R-squared of each stepwise model: backward, forward, both
summary(model_backward)$adj.r.squared
#> [1] 0.4909976
summary(model_forward)$adj.r.squared
#> [1] 0.4905378
summary(model_both)$adj.r.squared
#> [1] 0.4905378

Bandingkan performa kelima model (model_store_none, model_all_store, dan ketiga model hasil stepwise regression) menggunakan fungsi compare_performance() dari package performance.

# Load the performance package and collect fit indices for all five
# models into one comparison table.
library(performance)
comparison <- compare_performance(
  model_store_none,
  model_all_store,
  model_backward,
  model_forward,
  model_both
)

as.data.frame(comparison)

Prediksi biasa

# Point predictions from model_backward for every row of Store_Clean1
pred_model_step <- predict(model_backward, newdata = Store_Clean1)
head(pred_model_step)
#>        1        2        3        4        5        6 
#> 5.887996 4.582988 2.730530 2.080234 2.327834 3.881064

Prediksi dengan batas atas-bawah (prediction interval)

# Predictions with lower and upper bounds: 95% prediction intervals
# from model_backward for every row of Store_Clean1
pred_model_step_interval <- predict(
  model_backward,
  newdata = Store_Clean1,
  level = 0.95,
  interval = "prediction"
)

head(pred_model_step_interval)
#>        fit         lwr      upr
#> 1 5.887996  2.01661236 9.759380
#> 2 4.582988  0.73401653 8.431960
#> 3 2.730530 -1.11145960 6.572519
#> 4 2.080234 -1.76427349 5.924742
#> 5 2.327834 -1.52011205 6.175780
#> 6 3.881064  0.01432816 7.747800

Ilustrasi prediction interval untuk NumWebPurchases ~ MntMeatProducts

# Prediction bounds (90% level) from model_clean1, bound column-wise onto
# the data so that fit/lwr/upr can be drawn against the observations.
# NOTE(review): no newdata is supplied, so this relies on model_clean1
# having been fitted on Store_Clean1 (cbind needs matching row counts) — confirm.
pred.int <- predict(model_clean1, level = 0.90, interval = "prediction")
mydata <- cbind(Store_Clean1, pred.int)

# Scatter of the observations with the fitted line (blue) and the
# interval bounds (red, dashed).
ggplot(mydata, aes(x = MntMeatProducts, y = NumWebPurchases)) +
  geom_point() +
  geom_line(aes(y = fit), color = "blue") +
  geom_line(aes(y = lwr), color = "red", linetype = "dashed") +
  geom_line(aes(y = upr), color = "red", linetype = "dashed") +
  labs(title = "Linear Regression jumlah pembelian melalui situs web perusahaan by Produk Daging") +
  theme_minimal()

# Asumsi Linear Regression

## Linearity

# Residuals-vs-fitted diagnostic plot with green reference lines at +10 and -10
plot(model_backward, which = 1)
abline(h = c(10, -10), col = "green")

Nilai residual “bounce randomly” di sekitar nilai 0. Kesimpulan: garis merah masih berada di dalam batas toleransi kita (garis hijau pada ±10), sehingga model_backward dapat dianggap linear.

Normality of Residuals Model linear regression diharapkan menghasilkan error yang berdistribusi normal. Dengan begitu, error lebih banyak berkumpul di sekitar angka nol.

Normality of Residuals

# Histogram of the residuals of model_backward
hist(model_backward$residuals)

Uji statistik dengan `shapiro.test()`

# Shapiro-Wilk normality test on the residuals of model_backward
# (null hypothesis: the residuals come from a normal distribution)
shapiro.test(model_backward$residuals)
#> 
#>  Shapiro-Wilk normality test
#> 
#> data:  model_backward$residuals
#> W = 0.87072, p-value < 0.00000000000000022

Homoscedasticity of Residuals

# Residuals against fitted values of model_backward, with a red
# horizontal reference line at 0
plot(x = model_backward$fitted.values, y = model_backward$residuals)
abline(h = 0, col = "red")

No Multicollinearity

# Variance inflation factors for the predictors of model_backward
library(car)
vif(model_backward)
#>                       GVIF Df GVIF^(1/(2*Df))
#> Income            2.141281  1        1.463311
#> Kidhome           1.896052  2        1.173445
#> Teenhome          1.518718  2        1.110118
#> Dt_Customer       1.244029  1        1.115360
#> MntWines          2.426365  1        1.557679
#> MntFruits         1.936992  1        1.391759
#> MntMeatProducts   2.633190  1        1.622711
#> MntFishProducts   2.080385  1        1.442354
#> MntSweetProducts  1.874324  1        1.369060
#> MntGoldProds      1.420849  1        1.191994
#> NumDealsPurchases 1.586058  1        1.259388
#> NumStorePurchases 2.307276  1        1.518972
#> NumWebVisitsMonth 2.293595  1        1.514462
#> Response          1.176880  1        1.084841

Kesimpulan: Dari uji VIF, prediktor di model_backward lolos uji asumsi multicollinearity (tidak ada nilai VIF > 10)

Uji statistik untuk Homoscedasticity

bptest() dari package lmtest

# Breusch-Pagan test on model_backward
# (null hypothesis: the residual variance is constant, i.e. homoscedasticity)
library(lmtest)
bptest(model_backward)
#> 
#>  studentized Breusch-Pagan test
#> 
#> data:  model_backward
#> BP = 137.21, df = 16, p-value < 0.00000000000000022

Kesimpulan: karena nilai p-value dari BP test < 0.05 (p-value < 0.00000000000000022), maka H0 ditolak: asumsi homoscedasticity tidak terpenuhi (residual bersifat heteroscedastic).

No Multicollinearity

# NOTE(review): this repeats the VIF check on model_backward that was
# already run earlier in this section.
library(car)
vif(model_backward)
#>                       GVIF Df GVIF^(1/(2*Df))
#> Income            2.141281  1        1.463311
#> Kidhome           1.896052  2        1.173445
#> Teenhome          1.518718  2        1.110118
#> Dt_Customer       1.244029  1        1.115360
#> MntWines          2.426365  1        1.557679
#> MntFruits         1.936992  1        1.391759
#> MntMeatProducts   2.633190  1        1.622711
#> MntFishProducts   2.080385  1        1.442354
#> MntSweetProducts  1.874324  1        1.369060
#> MntGoldProds      1.420849  1        1.191994
#> NumDealsPurchases 1.586058  1        1.259388
#> NumStorePurchases 2.307276  1        1.518972
#> NumWebVisitsMonth 2.293595  1        1.514462
#> Response          1.176880  1        1.084841

Kesimpulan: Dari uji VIF, prediktor di model_backward lolos uji asumsi multicollinearity (tidak ada nilai VIF > 10)