# Report setup ----
# NOTE: the former `rm(list = ls())` call was dropped. It only removes objects
# from the global environment (attached packages and options are left as they
# are), so it does not give a clean session; restart R or knit the document
# for a reproducible run.

# Chunk options applied to every chunk of the rendered report.
knitr::opts_chunk$set(
  message = FALSE,
  warning = FALSE,
  fig.align = "center",
  comment = "#>"
)

# Favour fixed over scientific notation when numbers are printed.
options(scipen = 99)

knitr::include_graphics("assets/store.png")

# Packages ----
library(tidyverse)
# mdy() is used for date parsing further down; attach lubridate explicitly
# because older tidyverse releases do not attach it.
library(lubridate)
library(caret)
library(plotly)
library(data.table)
library(GGally)
library(tidymodels)
library(car)
library(scales)
library(lmtest)
library(dplyr)

# Data import ----
# Raw customer data, read from a path relative to the working directory.
store <- read.csv("data_input/superstore_data.csv")
head(store)
Deskripsi kolom:
- Id - ID unik dari setiap pelanggan
- Year_Birth - tahun lahir pelanggan
- Complain - 1 jika pelanggan melakukan komplain dalam 2 tahun terakhir
- Dt_Customer - tanggal pendaftaran pelanggan dengan perusahaan
- Education - tingkat pendidikan pelanggan
- Marital_Status - status perkawinan pelanggan
- Kidhome - jumlah anak kecil di rumah pelanggan
- Teenhome - jumlah remaja dalam rumah tangga pelanggan
- Income - pendapatan rumah tangga tahunan pelanggan
- MntFishProducts - jumlah yang dibelanjakan untuk produk ikan dalam 2 tahun terakhir
- MntMeatProducts - jumlah yang dibelanjakan untuk produk daging dalam 2 tahun terakhir
- MntFruits - jumlah yang dibelanjakan untuk produk buah-buahan dalam 2 tahun terakhir
- MntSweetProducts - jumlah yang dibelanjakan untuk produk manis dalam 2 tahun terakhir
- MntWines - jumlah yang dibelanjakan untuk produk anggur dalam 2 tahun terakhir
- MntGoldProds - jumlah yang dibelanjakan untuk produk emas dalam 2 tahun terakhir
- NumDealsPurchases - jumlah pembelian yang dilakukan dengan diskon
- NumCatalogPurchases - jumlah pembelian yang dilakukan menggunakan katalog (membeli barang untuk dikirim melalui pos)
- NumStorePurchases - jumlah pembelian yang dilakukan langsung di toko
- NumWebPurchases - jumlah pembelian yang dilakukan melalui situs web perusahaan
- NumWebVisitsMonth - jumlah kunjungan ke situs web perusahaan dalam sebulan terakhir
- Recency - jumlah hari sejak pembelian terakhir
# Compact overview of the raw data: dimensions, column types and first values.
glimpse(store)
#> Rows: 2,240
#> Columns: 22
#> $ Id <int> 1826, 1, 10476, 1386, 5371, 7348, 4073, 1991, 4047~
#> $ Year_Birth <int> 1970, 1961, 1958, 1967, 1989, 1958, 1954, 1967, 19~
#> $ Education <chr> "Graduation", "Graduation", "Graduation", "Graduat~
#> $ Marital_Status <chr> "Divorced", "Single", "Married", "Together", "Sing~
#> $ Income <int> 84835, 57091, 67267, 32474, 21474, 71691, 63564, 4~
#> $ Kidhome <int> 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,~
#> $ Teenhome <int> 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1,~
#> $ Dt_Customer <chr> "6/16/2014", "6/15/2014", "5/13/2014", "11/5/2014"~
#> $ Recency <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,~
#> $ MntWines <int> 189, 464, 134, 10, 6, 336, 769, 78, 384, 384, 450,~
#> $ MntFruits <int> 104, 5, 11, 0, 16, 130, 80, 0, 0, 0, 26, 4, 82, 10~
#> $ MntMeatProducts <int> 379, 64, 59, 1, 24, 411, 252, 11, 102, 102, 535, 6~
#> $ MntFishProducts <int> 111, 7, 15, 0, 11, 240, 15, 0, 21, 21, 73, 0, 80, ~
#> $ MntSweetProducts <int> 189, 0, 2, 0, 0, 32, 34, 0, 32, 32, 98, 13, 20, 16~
#> $ MntGoldProds <int> 218, 37, 30, 0, 34, 43, 65, 7, 5, 5, 26, 4, 102, 3~
#> $ NumDealsPurchases <int> 1, 1, 1, 1, 2, 1, 1, 1, 3, 3, 1, 2, 1, 1, 0, 4, 4,~
#> $ NumWebPurchases <int> 4, 7, 3, 1, 3, 4, 10, 2, 6, 6, 5, 3, 3, 1, 25, 2, ~
#> $ NumCatalogPurchases <int> 4, 3, 2, 0, 1, 7, 10, 1, 2, 2, 6, 1, 6, 1, 0, 1, 1~
#> $ NumStorePurchases <int> 6, 7, 5, 2, 2, 5, 7, 3, 9, 9, 10, 6, 6, 2, 0, 5, 5~
#> $ NumWebVisitsMonth <int> 1, 5, 2, 7, 7, 2, 6, 5, 4, 4, 1, 4, 1, 6, 1, 4, 4,~
#> $ Response <int> 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1,~
#> $ Complain <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,~
Ubah kolom Education, Marital_Status, Kidhome, Teenhome, Response, dan Complain menjadi factor, serta Dt_Customer menjadi tipe tanggal.
# Type conversions on the raw data:
# - Dt_Customer is parsed from month/day/year text (e.g. "6/16/2014") into a
#   date;
# - the categorical columns are converted to factors so that lm() treats them
#   as categorical terms.
# The date column is referenced by name inside mutate() rather than as
# `store$Dt_Customer`, so the step stays correct if rows are filtered or
# reordered earlier in the pipe. across() replaces the superseded mutate_at().
store_Clean <- store %>%
  mutate(
    Dt_Customer = lubridate::mdy(Dt_Customer),
    across(
      c(Education, Marital_Status, Kidhome, Teenhome, Response, Complain),
      as.factor
    )
  )

# Does the converted data contain any missing values?
anyNA(store_Clean)
#> [1] TRUE
# Number of missing values in each column.
colSums(is.na(x = store_Clean))
#> Id Year_Birth Education Marital_Status
#> 0 0 0 0
#> Income Kidhome Teenhome Dt_Customer
#> 24 0 0 0
#> Recency MntWines MntFruits MntMeatProducts
#> 0 0 0 0
#> MntFishProducts MntSweetProducts MntGoldProds NumDealsPurchases
#> 0 0 0 0
#> NumWebPurchases NumCatalogPurchases NumStorePurchases NumWebVisitsMonth
#> 0 0 0 0
#> Response Complain
#> 0 0
# Drop the rows whose Income is missing (the only column with NAs), then
# re-count the missing values per column to confirm none are left.
Store_Clean1 <- store_Clean %>%
  filter(!is.na(Income))

colSums(is.na(x = Store_Clean1))
#> Id Year_Birth Education Marital_Status
#> 0 0 0 0
#> Income Kidhome Teenhome Dt_Customer
#> 0 0 0 0
#> Recency MntWines MntFruits MntMeatProducts
#> 0 0 0 0
#> MntFishProducts MntSweetProducts MntGoldProds NumDealsPurchases
#> 0 0 0 0
#> NumWebPurchases NumCatalogPurchases NumStorePurchases NumWebVisitsMonth
#> 0 0 0 0
#> Response Complain
#> 0 0
Jumlah yang dihabiskan untuk produk daging dalam 2 tahun terakhir (MntMeatProducts)
# Horizontal box plot of meat spending. TRUE is spelled out because `T` is an
# ordinary variable that can be reassigned.
boxplot(Store_Clean1$MntMeatProducts, horizontal = TRUE)
Persebaran Data Jumlah pembelian yang dilakukan melalui situs web
perusahaan (NumWebPurchases)
# Horizontal box plot of the number of web purchases. Drawn on Store_Clean1,
# the NA-free data that the correlation and the models below are based on
# (the earlier version used store_Clean here).
boxplot(Store_Clean1$NumWebPurchases, horizontal = TRUE)

# Correlation between meat spending and the number of web purchases.
cor(x = Store_Clean1$MntMeatProducts, y = Store_Clean1$NumWebPurchases)
#> [1] 0.3070904

# Scatter plot of the same pair of variables.
plot(x = Store_Clean1$MntMeatProducts, y = Store_Clean1$NumWebPurchases)

# Labelled correlation plot, on the same NA-free data as above.
ggcorr(Store_Clean1, label = TRUE, label_size = 2.9, hjust = 1, layout.exp = 2)
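Pada grafik korelasi, terlihat bahwa tidak semua variabel berkorelasi
positif dengan NumWebPurchases. MntMeatProducts memiliki korelasi positif
(r = 0,31) dan dipilih sebagai prediktor untuk model regresi sederhana,
namun bukan yang tertinggi; MntWines, misalnya, berkorelasi lebih kuat
(r ≈ 0,55, lihat langkah pertama seleksi forward di bagian selanjutnya).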
Pembuatan Model Regresi Linear Sederhana
# Simple linear regression: number of web purchases explained by meat spending
# alone, fitted on the data without missing Income.
model_clean1 <- lm(formula = NumWebPurchases ~ MntMeatProducts, data = Store_Clean1)
# Coefficients, residual summary and goodness of fit.
summary(model_clean1)
#>
#> Call:
#> lm(formula = NumWebPurchases ~ MntMeatProducts, data = Store_Clean1)
#>
#> Residuals:
#> Min 1Q Median 3Q Max
#> -9.9324 -1.7991 -0.5788 1.4415 23.5302
#>
#> Coefficients:
#> Estimate Std. Error t value Pr(>|t|)
#> (Intercept) 3.4585646 0.0691068 50.05 <0.0000000000000002 ***
#> MntMeatProducts 0.0037529 0.0002472 15.18 <0.0000000000000002 ***
#> ---
#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#>
#> Residual standard error: 2.609 on 2214 degrees of freedom
#> Multiple R-squared: 0.0943, Adjusted R-squared: 0.0939
#> F-statistic: 230.5 on 1 and 2214 DF, p-value: < 0.00000000000000022
# Scatter plot of the data with the fitted line of model_clean1 in red.
plot(x = Store_Clean1$MntMeatProducts, y = Store_Clean1$NumWebPurchases)
abline(model_clean1, col = "red")

# Model Interpretation ----
summary(model_clean1)
#>
#> Call:
#> lm(formula = NumWebPurchases ~ MntMeatProducts, data = Store_Clean1)
#>
#> Residuals:
#> Min 1Q Median 3Q Max
#> -9.9324 -1.7991 -0.5788 1.4415 23.5302
#>
#> Coefficients:
#> Estimate Std. Error t value Pr(>|t|)
#> (Intercept) 3.4585646 0.0691068 50.05 <0.0000000000000002 ***
#> MntMeatProducts 0.0037529 0.0002472 15.18 <0.0000000000000002 ***
#> ---
#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#>
#> Residual standard error: 2.609 on 2214 degrees of freedom
#> Multiple R-squared: 0.0943, Adjusted R-squared: 0.0939
#> F-statistic: 230.5 on 1 and 2214 DF, p-value: < 0.00000000000000022
# Keep only the observations with meat spending below 1000.
# The filter is applied to Store_Clean1, the same NA-free data model_clean1 was
# fitted on, so that the models with and without outliers differ only by the
# removed outliers. Filtering the raw `store` would also bring back the rows
# with missing Income that the cleaning step removed.
# NOTE: printed results further down that were produced from the raw-data
# version of this subset will change slightly when the analysis is re-run.
store_no_outlier <- Store_Clean1[Store_Clean1$MntMeatProducts < 1000, ]
range(store_no_outlier$MntMeatProducts)
#> [1] 0 984
Pembuatan Model tanpa Outlier
# Same formula as model_clean1, fitted on the data without the meat-spending
# outliers.
model_no_outlier <- lm(NumWebPurchases ~ MntMeatProducts, data = store_no_outlier)
summary(model_no_outlier)
#>
#> Call:
#> lm(formula = NumWebPurchases ~ MntMeatProducts, data = store_no_outlier)
#>
#> Residuals:
#> Min 1Q Median 3Q Max
#> -4.4800 -1.9942 -0.5666 1.4443 23.5968
#>
#> Coefficients:
#> Estimate Std. Error t value Pr(>|t|)
#> (Intercept) 3.3903011 0.0696175 48.70 <0.0000000000000002 ***
#> MntMeatProducts 0.0043004 0.0002579 16.67 <0.0000000000000002 ***
#> ---
#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#>
#> Residual standard error: 2.618 on 2233 degrees of freedom
#> Multiple R-squared: 0.1107, Adjusted R-squared: 0.1103
#> F-statistic: 278 on 1 and 2233 DF, p-value: < 0.00000000000000022
Bandingkan model_clean1 (dengan outlier) dan model_no_outlier
(tanpa outlier): apakah data outlier membuat garis regresi
jauh berbeda?
# Scatter plot with both fitted lines: red = model with outliers,
# blue = model without outliers. The points come from Store_Clean1, the data
# model_clean1 was fitted on (the earlier version plotted the raw `store`).
plot(x = Store_Clean1$MntMeatProducts, y = Store_Clean1$NumWebPurchases)
abline(model_clean1, col = "red")
abline(model_no_outlier, col = "blue")

# Check the R-squared values
summary(model_clean1)$r.squared # model with outliers
#> [1] 0.09430449
summary(model_no_outlier)$r.squared # model without outliers
#> [1] 0.110715
Kesimpulan: model tanpa outlier memiliki r-squared lebih tinggi (0.110715) dibandingkan model dengan outlier (0.09430449).
Buat model dengan keseluruhan prediktor
# Full model: number of web purchases regressed on every other column.
# Fitted on Store_Clean1 (rows with missing Income already removed) instead of
# store_Clean. lm() dropped those same rows on its own, so the estimates do not
# change; only the printed call and the "observations deleted due to
# missingness" note differ. Using the same NA-free data frame as the
# intercept-only model keeps the number of rows fixed during step().
# NOTE(review): `.` also pulls in Id, which looks like a pure identifier -
# consider excluding it from the predictors.
model_all_store <- lm(NumWebPurchases ~ ., data = Store_Clean1)
summary(model_all_store)
#>
#> Call:
#> lm(formula = NumWebPurchases ~ ., data = store_Clean)
#>
#> Residuals:
#> Min 1Q Median 3Q Max
#> -9.3529 -0.9637 -0.1509 0.9010 23.7637
#>
#> Coefficients:
#> Estimate Std. Error t value Pr(>|t|)
#> (Intercept) -2.5853882648 8.6944565182 -0.297 0.76622
#> Id -0.0000002729 0.0000128821 -0.021 0.98310
#> Year_Birth -0.0035361383 0.0039797050 -0.889 0.37435
#> EducationBasic -0.2938676511 0.3070924998 -0.957 0.33871
#> EducationGraduation 0.0550318599 0.1520761007 0.362 0.71748
#> EducationMaster 0.0433357009 0.1771672327 0.245 0.80679
#> EducationPhD 0.1993237966 0.1738623532 1.146 0.25174
#> Marital_StatusAlone 3.1861892497 1.8035787665 1.767 0.07744
#> Marital_StatusDivorced 1.6690236143 1.4087613993 1.185 0.23625
#> Marital_StatusMarried 1.6915117493 1.4045489052 1.204 0.22860
#> Marital_StatusSingle 1.6096473537 1.4051248829 1.146 0.25211
#> Marital_StatusTogether 1.7208509604 1.4051525403 1.225 0.22083
#> Marital_StatusWidow 1.5729772167 1.4205336628 1.107 0.26828
#> Marital_StatusYOLO 2.6866932688 1.9785289767 1.358 0.17463
#> Income 0.0000141856 0.0000024465 5.798 0.000000007667477707
#> Kidhome1 -0.8773656766 0.1202304716 -7.297 0.000000000000408809
#> Kidhome2 -0.8323827823 0.3064047138 -2.717 0.00665
#> Teenhome1 0.3085581387 0.1087095386 2.838 0.00458
#> Teenhome2 0.4276249649 0.2921233143 1.464 0.14338
#> Dt_Customer 0.0004265469 0.0002017842 2.114 0.03464
#> Recency 0.0002166133 0.0014891217 0.145 0.88436
#> MntWines 0.0020002709 0.0002027967 9.863 < 0.0000000000000002
#> MntFruits 0.0022930035 0.0014593869 1.571 0.11628
#> MntMeatProducts -0.0004735203 0.0003251580 -1.456 0.14546
#> MntFishProducts 0.0017930781 0.0011110932 1.614 0.10672
#> MntSweetProducts 0.0075534688 0.0014005875 5.393 0.000000076743683474
#> MntGoldProds 0.0078796751 0.0009801324 8.039 0.000000000000001464
#> NumDealsPurchases 0.2288500501 0.0278598811 8.214 0.000000000000000361
#> NumCatalogPurchases -0.0051399910 0.0248487795 -0.207 0.83615
#> NumStorePurchases 0.1710622355 0.0196050252 8.725 < 0.0000000000000002
#> NumWebVisitsMonth 0.3390451102 0.0266330644 12.730 < 0.0000000000000002
#> Response1 0.2964918904 0.1322867408 2.241 0.02511
#> Complain1 0.2585588708 0.4324802329 0.598 0.55000
#>
#> (Intercept)
#> Id
#> Year_Birth
#> EducationBasic
#> EducationGraduation
#> EducationMaster
#> EducationPhD
#> Marital_StatusAlone .
#> Marital_StatusDivorced
#> Marital_StatusMarried
#> Marital_StatusSingle
#> Marital_StatusTogether
#> Marital_StatusWidow
#> Marital_StatusYOLO
#> Income ***
#> Kidhome1 ***
#> Kidhome2 **
#> Teenhome1 **
#> Teenhome2
#> Dt_Customer *
#> Recency
#> MntWines ***
#> MntFruits
#> MntMeatProducts
#> MntFishProducts
#> MntSweetProducts ***
#> MntGoldProds ***
#> NumDealsPurchases ***
#> NumCatalogPurchases
#> NumStorePurchases ***
#> NumWebVisitsMonth ***
#> Response1 *
#> Complain1
#> ---
#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#>
#> Residual standard error: 1.958 on 2183 degrees of freedom
#> (24 observations deleted due to missingness)
#> Multiple R-squared: 0.497, Adjusted R-squared: 0.4897
#> F-statistic: 67.41 on 32 and 2183 DF, p-value: < 0.00000000000000022
Buat model dengan data Korelasi
# Model that uses only the six spending ("Mnt*") columns as predictors.
model_cor <- lm(formula = NumWebPurchases ~ MntWines + MntFruits + MntMeatProducts + MntFishProducts + MntSweetProducts + MntGoldProds, data = Store_Clean1)
# Print the call and the estimated coefficients.
model_cor
#>
#> Call:
#> lm(formula = NumWebPurchases ~ MntWines + MntFruits + MntMeatProducts +
#> MntFishProducts + MntSweetProducts + MntGoldProds, data = Store_Clean1)
#>
#> Coefficients:
#> (Intercept) MntWines MntFruits MntMeatProducts
#> 2.3844172 0.0039358 0.0022276 -0.0015250
#> MntFishProducts MntSweetProducts MntGoldProds
#> 0.0001332 0.0078853 0.0108705
# Store each model's predictions as new columns on a copy of the data,
# store_pred.
store_pred <- Store_Clean1
store_pred$pred_clean1 <- predict(model_clean1, newdata = Store_Clean1)
store_pred$pred_all_store <- predict(model_all_store, newdata = Store_Clean1)
store_pred$pred_model_cor <- predict(model_cor, newdata = Store_Clean1)
head(store_pred)

# Goal: find the best model for predicting the target variable.
# model_clean1 has a single predictor, so its multiple R-squared is shown
# (its adjusted value, 0.0939, is almost the same); the multi-predictor models
# are compared on adjusted R-squared.
summary(model_clean1)$r.squared
#> [1] 0.09430449
summary(model_all_store)$adj.r.squared
#> [1] 0.4896637
summary(model_cor)$adj.r.squared
#> [1] 0.3618629
metrik error yang digunakan pada regresi:
library(MLmetrics)

# Mean absolute error of the full model's predictions.
MAE(y_pred = store_pred$pred_all_store, y_true = Store_Clean1$NumWebPurchases)
#> [1] 1.312026

# MAE is expressed in the units of the target, so it is judged against the
# range of NumWebPurchases. The earlier version printed the range of the
# predictor MntMeatProducts (0 to 1725), which is on a different scale.
range(Store_Clean1$NumWebPurchases)
#> [1] 0 27
MAPE menunjukkan seberapa besar penyimpangannya dalam bentuk persentase.
# Mean absolute percentage error of the full model, scaled to percent.
# NOTE(review): the result is Inf, presumably because NumWebPurchases contains
# zeros (its range starts at 0) and a percentage error divides by the actual
# value - confirm, and rely on MAE/RMSE for this target instead.
MAPE(y_pred = store_pred$pred_all_store, y_true = Store_Clean1$NumWebPurchases)*100
#> [1] Inf
MSE: rata-rata dari selisih kuadrat antara hasil prediksi dan nilai aktual; RMSE adalah akar kuadrat dari MSE.
# Mean squared error of the full model's predictions.
MSE(y_pred = store_pred$pred_all_store, y_true = Store_Clean1$NumWebPurchases)
#> [1] 3.776966
# Root mean squared error of the same predictions.
RMSE(y_pred = store_pred$pred_all_store,
y_true = Store_Clean1$NumWebPurchases)
#> [1] 1.943442
# Range of the target, as a yardstick for the size of the errors.
range(Store_Clean1$NumWebPurchases)
#> [1] 0 27
Step-wise regression membantu kita memilih prediktor yang baik, dengan cara mencari kombinasi prediktor yang menghasilkan model terbaik berdasarkan nilai AIC
# Stepwise regression, backward elimination: start from the full model
# (model_all_store). trace = FALSE suppresses the step-by-step log.
model_backward <- stats::step(object = model_all_store,
direction = "backward",
trace = FALSE)
# Summary of the model chosen by backward elimination.
summary(model_backward)
#>
#> Call:
#> lm(formula = NumWebPurchases ~ Income + Kidhome + Teenhome +
#> Dt_Customer + MntWines + MntFruits + MntMeatProducts + MntFishProducts +
#> MntSweetProducts + MntGoldProds + NumDealsPurchases + NumStorePurchases +
#> NumWebVisitsMonth + Response, data = store_Clean)
#>
#> Residuals:
#> Min 1Q Median 3Q Max
#> -9.3212 -0.9458 -0.1334 0.9153 23.9251
#>
#> Coefficients:
#> Estimate Std. Error t value Pr(>|t|)
#> (Intercept) -8.313975021 3.239692525 -2.566 0.010345 *
#> Income 0.000014762 0.000002415 6.112 0.00000000116259593 ***
#> Kidhome1 -0.881187444 0.116349348 -7.574 0.00000000000005309 ***
#> Kidhome2 -0.809106905 0.304560467 -2.657 0.007949 **
#> Teenhome1 0.365722380 0.102975140 3.552 0.000391 ***
#> Teenhome2 0.514444923 0.287667672 1.788 0.073860 .
#> Dt_Customer 0.000455682 0.000199354 2.286 0.022361 *
#> MntWines 0.002062428 0.000191867 10.749 < 0.0000000000000002 ***
#> MntFruits 0.002156572 0.001453187 1.484 0.137945
#> MntMeatProducts -0.000482000 0.000300620 -1.603 0.109001
#> MntFishProducts 0.001564030 0.001094575 1.429 0.153177
#> MntSweetProducts 0.007408438 0.001385002 5.349 0.00000009758678639 ***
#> MntGoldProds 0.007550023 0.000955849 7.899 0.00000000000000441 ***
#> NumDealsPurchases 0.227970099 0.027201507 8.381 < 0.0000000000000002 ***
#> NumStorePurchases 0.171894790 0.019414933 8.854 < 0.0000000000000002 ***
#> NumWebVisitsMonth 0.339253666 0.025945182 13.076 < 0.0000000000000002 ***
#> Response1 0.301035924 0.126114476 2.387 0.017070 *
#> ---
#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#>
#> Residual standard error: 1.956 on 2199 degrees of freedom
#> (24 observations deleted due to missingness)
#> Multiple R-squared: 0.4947, Adjusted R-squared: 0.491
#> F-statistic: 134.5 on 16 and 2199 DF, p-value: < 0.00000000000000022
# Intercept-only model: the starting point for forward selection.
model_store_none <- lm( NumWebPurchases ~ 1, Store_Clean1)
# Forward selection: start from the intercept-only model and add terms, with
# the terms of model_all_store as the upper limit of the search. The log below
# lists the candidate terms and the resulting AIC at every step.
model_forward <- stats::step(object = model_store_none,
direction = "forward",
scope = list(lower = model_store_none, upper = model_all_store))
#> Start: AIC=4469.81
#> NumWebPurchases ~ 1
#>
#> Df Sum of Sq RSS AIC
#> + MntWines 1 5103.4 11538 3660.2
#> + NumStorePurchases 1 4434.9 12206 3785.0
#> + MntGoldProds 1 2757.4 13883 4070.3
#> + Income 1 2503.6 14137 4110.5
#> + NumCatalogPurchases 1 2490.6 14150 4112.5
#> + Kidhome 2 2447.3 14194 4121.3
#> + MntSweetProducts 1 1855.7 14785 4209.8
#> + MntMeatProducts 1 1569.3 15072 4252.3
#> + MntFruits 1 1518.1 15123 4259.8
#> + MntFishProducts 1 1494.6 15146 4263.3
#> + NumDealsPurchases 1 970.1 15671 4338.7
#> + Dt_Customer 1 494.2 16147 4405.0
#> + Teenhome 2 450.7 16190 4413.0
#> + Year_Birth 1 389.8 16251 4419.3
#> + Response 1 381.6 16259 4420.4
#> + Education 4 340.9 16300 4431.9
#> + NumWebVisitsMonth 1 43.7 16597 4466.0
#> <none> 16641 4469.8
#> + Id 1 5.7 16635 4471.1
#> + Complain 1 4.6 16636 4471.2
#> + Recency 1 0.5 16640 4471.7
#> + Marital_Status 7 79.5 16561 4473.2
#>
#> Step: AIC=3660.17
#> NumWebPurchases ~ MntWines
#>
#> Df Sum of Sq RSS AIC
#> + NumDealsPurchases 1 930.99 10606 3475.7
#> + NumStorePurchases 1 738.00 10800 3515.7
#> + MntGoldProds 1 707.13 10830 3522.0
#> + Teenhome 2 454.46 11083 3575.1
#> + NumWebVisitsMonth 1 299.82 11238 3603.8
#> + MntSweetProducts 1 272.34 11265 3609.2
#> + Kidhome 2 225.81 11312 3620.4
#> + MntFruits 1 150.57 11387 3633.1
#> + Dt_Customer 1 137.77 11400 3635.6
#> + MntFishProducts 1 124.73 11413 3638.1
#> + Income 1 113.75 11424 3640.2
#> + Year_Birth 1 71.59 11466 3648.4
#> + Education 4 77.50 11460 3653.2
#> + NumCatalogPurchases 1 34.83 11503 3655.5
#> <none> 11538 3660.2
#> + Response 1 4.00 11534 3661.4
#> + Recency 1 3.43 11534 3661.5
#> + MntMeatProducts 1 1.55 11536 3661.9
#> + Id 1 0.77 11537 3662.0
#> + Complain 1 0.45 11537 3662.1
#> + Marital_Status 7 47.43 11490 3665.0
#>
#> Step: AIC=3475.73
#> NumWebPurchases ~ MntWines + NumDealsPurchases
#>
#> Df Sum of Sq RSS AIC
#> + MntGoldProds 1 626.01 9980.5 3342.9
#> + NumStorePurchases 1 617.21 9989.3 3344.9
#> + Kidhome 2 587.67 10018.8 3353.4
#> + MntSweetProducts 1 434.09 10172.4 3385.1
#> + MntFruits 1 289.94 10316.5 3416.3
#> + MntFishProducts 1 264.36 10342.1 3421.8
#> + Income 1 197.38 10409.1 3436.1
#> + Teenhome 2 102.48 10504.0 3458.2
#> + Year_Birth 1 44.93 10561.6 3468.3
#> + NumCatalogPurchases 1 43.62 10562.9 3468.6
#> + Education 4 71.32 10535.2 3468.8
#> + NumWebVisitsMonth 1 42.64 10563.8 3468.8
#> + Dt_Customer 1 32.53 10574.0 3470.9
#> + MntMeatProducts 1 12.14 10594.3 3475.2
#> <none> 10606.5 3475.7
#> + Response 1 3.85 10602.6 3476.9
#> + Recency 1 3.65 10602.8 3477.0
#> + Complain 1 0.42 10606.1 3477.6
#> + Id 1 0.13 10606.4 3477.7
#> + Marital_Status 7 23.87 10582.6 3484.7
#>
#> Step: AIC=3342.92
#> NumWebPurchases ~ MntWines + NumDealsPurchases + MntGoldProds
#>
#> Df Sum of Sq RSS AIC
#> + NumStorePurchases 1 417.17 9563.3 3250.3
#> + Kidhome 2 361.40 9619.1 3265.2
#> + MntSweetProducts 1 226.13 9754.3 3294.1
#> + NumWebVisitsMonth 1 120.28 9860.2 3318.0
#> + Teenhome 2 128.79 9851.7 3318.1
#> + Income 1 114.34 9866.1 3319.4
#> + MntFruits 1 99.63 9880.8 3322.7
#> + MntFishProducts 1 70.17 9910.3 3329.3
#> + Year_Birth 1 45.37 9935.1 3334.8
#> + Dt_Customer 1 12.63 9967.8 3342.1
#> + Education 4 36.51 9944.0 3342.8
#> <none> 9980.5 3342.9
#> + Recency 1 4.94 9975.5 3343.8
#> + MntMeatProducts 1 1.70 9978.8 3344.5
#> + Complain 1 1.16 9979.3 3344.7
#> + Response 1 0.55 9979.9 3344.8
#> + Id 1 0.15 9980.3 3344.9
#> + NumCatalogPurchases 1 0.06 9980.4 3344.9
#> + Marital_Status 7 34.01 9946.5 3349.4
#>
#> Step: AIC=3250.3
#> NumWebPurchases ~ MntWines + NumDealsPurchases + MntGoldProds +
#> NumStorePurchases
#>
#> Df Sum of Sq RSS AIC
#> + NumWebVisitsMonth 1 369.04 9194.3 3165.1
#> + Kidhome 2 190.36 9372.9 3209.7
#> + Teenhome 2 107.62 9455.7 3229.2
#> + MntSweetProducts 1 97.76 9465.6 3229.5
#> + Year_Birth 1 37.41 9525.9 3243.6
#> + Income 1 34.44 9528.9 3244.3
#> + MntMeatProducts 1 25.23 9538.1 3246.4
#> + MntFruits 1 20.85 9542.5 3247.5
#> + Response 1 19.47 9543.8 3247.8
#> + Dt_Customer 1 15.76 9547.6 3248.6
#> + NumCatalogPurchases 1 10.67 9552.6 3249.8
#> + MntFishProducts 1 9.94 9553.4 3250.0
#> <none> 9563.3 3250.3
#> + Recency 1 3.55 9559.8 3251.5
#> + Complain 1 0.61 9562.7 3252.2
#> + Id 1 0.09 9563.2 3252.3
#> + Education 4 25.78 9537.5 3252.3
#> + Marital_Status 7 33.91 9529.4 3256.4
#>
#> Step: AIC=3165.09
#> NumWebPurchases ~ MntWines + NumDealsPurchases + MntGoldProds +
#> NumStorePurchases + NumWebVisitsMonth
#>
#> Df Sum of Sq RSS AIC
#> + Kidhome 2 317.23 8877.0 3091.3
#> + Income 1 227.67 8966.6 3111.5
#> + MntSweetProducts 1 204.55 8989.7 3117.2
#> + Teenhome 2 103.53 9090.7 3144.0
#> + MntFruits 1 72.22 9122.0 3149.6
#> + Year_Birth 1 66.71 9127.6 3151.0
#> + MntFishProducts 1 60.81 9133.5 3152.4
#> + NumCatalogPurchases 1 26.30 9168.0 3160.7
#> + Response 1 13.02 9181.2 3164.0
#> + Education 4 33.59 9160.7 3165.0
#> <none> 9194.3 3165.1
#> + MntMeatProducts 1 6.69 9187.6 3165.5
#> + Dt_Customer 1 4.09 9190.2 3166.1
#> + Recency 1 2.18 9192.1 3166.6
#> + Complain 1 0.34 9193.9 3167.0
#> + Id 1 0.08 9194.2 3167.1
#> + Marital_Status 7 27.83 9166.4 3172.4
#>
#> Step: AIC=3091.28
#> NumWebPurchases ~ MntWines + NumDealsPurchases + MntGoldProds +
#> NumStorePurchases + NumWebVisitsMonth + Kidhome
#>
#> Df Sum of Sq RSS AIC
#> + Income 1 213.803 8663.2 3039.3
#> + MntSweetProducts 1 179.055 8698.0 3048.1
#> + MntFruits 1 61.293 8815.7 3077.9
#> + MntFishProducts 1 50.212 8826.8 3080.7
#> + Teenhome 2 41.697 8835.3 3084.8
#> + Year_Birth 1 17.678 8859.4 3088.9
#> + Response 1 16.634 8860.4 3089.1
#> + Education 4 35.243 8841.8 3090.5
#> + Dt_Customer 1 10.492 8866.5 3090.7
#> <none> 8877.0 3091.3
#> + NumCatalogPurchases 1 5.379 8871.7 3091.9
#> + MntMeatProducts 1 3.225 8873.8 3092.5
#> + Recency 1 1.229 8875.8 3093.0
#> + Complain 1 0.953 8876.1 3093.1
#> + Id 1 0.039 8877.0 3093.3
#> + Marital_Status 7 23.664 8853.4 3099.4
#>
#> Step: AIC=3039.26
#> NumWebPurchases ~ MntWines + NumDealsPurchases + MntGoldProds +
#> NumStorePurchases + NumWebVisitsMonth + Kidhome + Income
#>
#> Df Sum of Sq RSS AIC
#> + MntSweetProducts 1 136.056 8527.2 3006.2
#> + MntFruits 1 39.098 8624.1 3031.2
#> + MntFishProducts 1 32.010 8631.2 3033.1
#> + Teenhome 2 29.172 8634.1 3035.8
#> + Year_Birth 1 12.461 8650.8 3038.1
#> + Response 1 11.426 8651.8 3038.3
#> + Dt_Customer 1 10.945 8652.3 3038.5
#> <none> 8663.2 3039.3
#> + MntMeatProducts 1 1.257 8662.0 3040.9
#> + Complain 1 1.017 8662.2 3041.0
#> + Recency 1 0.557 8662.7 3041.1
#> + Id 1 0.057 8663.2 3041.2
#> + NumCatalogPurchases 1 0.003 8663.2 3041.3
#> + Education 4 17.279 8646.0 3042.8
#> + Marital_Status 7 21.205 8642.0 3047.8
#>
#> Step: AIC=3006.18
#> NumWebPurchases ~ MntWines + NumDealsPurchases + MntGoldProds +
#> NumStorePurchases + NumWebVisitsMonth + Kidhome + Income +
#> MntSweetProducts
#>
#> Df Sum of Sq RSS AIC
#> + Teenhome 2 57.478 8469.7 2995.2
#> + Year_Birth 1 21.683 8505.5 3002.5
#> + Dt_Customer 1 20.715 8506.5 3002.8
#> + MntMeatProducts 1 19.558 8507.6 3003.1
#> <none> 8527.2 3006.2
#> + Response 1 6.705 8520.5 3006.4
#> + NumCatalogPurchases 1 5.179 8522.0 3006.8
#> + MntFruits 1 4.680 8522.5 3007.0
#> + Education 4 27.290 8499.9 3007.1
#> + MntFishProducts 1 2.035 8525.1 3007.7
#> + Complain 1 1.117 8526.1 3007.9
#> + Recency 1 1.066 8526.1 3007.9
#> + Id 1 0.022 8527.2 3008.2
#> + Marital_Status 7 20.623 8506.5 3014.8
#>
#> Step: AIC=2995.19
#> NumWebPurchases ~ MntWines + NumDealsPurchases + MntGoldProds +
#> NumStorePurchases + NumWebVisitsMonth + Kidhome + Income +
#> MntSweetProducts + Teenhome
#>
#> Df Sum of Sq RSS AIC
#> + Dt_Customer 1 15.6894 8454.0 2993.1
#> + Response 1 15.1688 8454.5 2993.2
#> + MntFruits 1 10.2831 8459.4 2994.5
#> + MntFishProducts 1 7.7193 8462.0 2995.2
#> <none> 8469.7 2995.2
#> + Year_Birth 1 5.3770 8464.3 2995.8
#> + MntMeatProducts 1 3.8232 8465.9 2996.2
#> + Recency 1 1.3610 8468.3 2996.8
#> + Complain 1 1.0788 8468.6 2996.9
#> + NumCatalogPurchases 1 0.6187 8469.1 2997.0
#> + Id 1 0.0465 8469.6 2997.2
#> + Education 4 18.8141 8450.9 2998.3
#> + Marital_Status 7 16.7856 8452.9 3004.8
#>
#> Step: AIC=2993.08
#> NumWebPurchases ~ MntWines + NumDealsPurchases + MntGoldProds +
#> NumStorePurchases + NumWebVisitsMonth + Kidhome + Income +
#> MntSweetProducts + Teenhome + Dt_Customer
#>
#> Df Sum of Sq RSS AIC
#> + Response 1 19.3974 8434.6 2990.0
#> + MntFruits 1 11.1183 8442.9 2992.2
#> + MntFishProducts 1 9.2602 8444.7 2992.7
#> <none> 8454.0 2993.1
#> + Year_Birth 1 5.1240 8448.9 2993.7
#> + MntMeatProducts 1 2.5636 8451.4 2994.4
#> + Complain 1 1.5395 8452.5 2994.7
#> + Recency 1 1.0405 8453.0 2994.8
#> + NumCatalogPurchases 1 0.2148 8453.8 2995.0
#> + Id 1 0.0351 8454.0 2995.1
#> + Education 4 16.4910 8437.5 2996.8
#> + Marital_Status 7 16.2500 8437.8 3002.8
#>
#> Step: AIC=2989.99
#> NumWebPurchases ~ MntWines + NumDealsPurchases + MntGoldProds +
#> NumStorePurchases + NumWebVisitsMonth + Kidhome + Income +
#> MntSweetProducts + Teenhome + Dt_Customer + Response
#>
#> Df Sum of Sq RSS AIC
#> + MntFruits 1 10.2930 8424.3 2989.3
#> + MntFishProducts 1 9.5043 8425.1 2989.5
#> <none> 8434.6 2990.0
#> + Year_Birth 1 4.9644 8429.6 2990.7
#> + MntMeatProducts 1 4.3963 8430.2 2990.8
#> + Complain 1 1.4794 8433.1 2991.6
#> + NumCatalogPurchases 1 0.6914 8433.9 2991.8
#> + Id 1 0.0132 8434.6 2992.0
#> + Recency 1 0.0007 8434.6 2992.0
#> + Education 4 13.3887 8421.2 2994.5
#> + Marital_Status 7 17.5333 8417.1 2999.4
#>
#> Step: AIC=2989.29
#> NumWebPurchases ~ MntWines + NumDealsPurchases + MntGoldProds +
#> NumStorePurchases + NumWebVisitsMonth + Kidhome + Income +
#> MntSweetProducts + Teenhome + Dt_Customer + Response + MntFruits
#>
#> Df Sum of Sq RSS AIC
#> <none> 8424.3 2989.3
#> + MntMeatProducts 1 7.4429 8416.9 2989.3
#> + MntFishProducts 1 5.4199 8418.9 2989.9
#> + Year_Birth 1 5.2340 8419.1 2989.9
#> + Complain 1 1.3382 8423.0 2990.9
#> + NumCatalogPurchases 1 1.2357 8423.1 2991.0
#> + Id 1 0.0248 8424.3 2991.3
#> + Recency 1 0.0011 8424.3 2991.3
#> + Education 4 14.4745 8409.8 2993.5
#> + Marital_Status 7 18.0344 8406.3 2998.5
# Summary of the intercept-only model.
summary(model_store_none)
#>
#> Call:
#> lm(formula = NumWebPurchases ~ 1, data = Store_Clean1)
#>
#> Residuals:
#> Min 1Q Median 3Q Max
#> -4.0853 -2.0853 -0.0853 1.9147 22.9147
#>
#> Coefficients:
#> Estimate Std. Error t value Pr(>|t|)
#> (Intercept) 4.08529 0.05823 70.16 <0.0000000000000002 ***
#> ---
#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#>
#> Residual standard error: 2.741 on 2215 degrees of freedom
# Summary of the model chosen by forward selection.
summary(model_forward)
#>
#> Call:
#> lm(formula = NumWebPurchases ~ MntWines + NumDealsPurchases +
#> MntGoldProds + NumStorePurchases + NumWebVisitsMonth + Kidhome +
#> Income + MntSweetProducts + Teenhome + Dt_Customer + Response +
#> MntFruits, data = Store_Clean1)
#>
#> Residuals:
#> Min 1Q Median 3Q Max
#> -9.3201 -0.9560 -0.1447 0.9101 23.9763
#>
#> Coefficients:
#> Estimate Std. Error t value Pr(>|t|)
#> (Intercept) -8.460004215 3.224269858 -2.624 0.00875 **
#> MntWines 0.001990440 0.000186305 10.684 < 0.0000000000000002 ***
#> NumDealsPurchases 0.221399793 0.026909244 8.228 0.000000000000000323 ***
#> MntGoldProds 0.007781722 0.000942617 8.255 0.000000000000000258 ***
#> NumStorePurchases 0.175033884 0.019354607 9.044 < 0.0000000000000002 ***
#> NumWebVisitsMonth 0.346219548 0.024995291 13.851 < 0.0000000000000002 ***
#> Kidhome1 -0.869893458 0.115772225 -7.514 0.000000000000083007 ***
#> Kidhome2 -0.805239062 0.304634404 -2.643 0.00827 **
#> Income 0.000014175 0.000002368 5.987 0.000000002486904851 ***
#> MntSweetProducts 0.007596427 0.001333106 5.698 0.000000013724060582 ***
#> Teenhome1 0.396645348 0.097814690 4.055 0.000051863498942279 ***
#> Teenhome2 0.539861619 0.286069018 1.887 0.05927 .
#> Dt_Customer 0.000462151 0.000198655 2.326 0.02009 *
#> Response1 0.276195265 0.125384542 2.203 0.02771 *
#> MntFruits 0.002279765 0.001390198 1.640 0.10117
#> ---
#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#>
#> Residual standard error: 1.956 on 2201 degrees of freedom
#> Multiple R-squared: 0.4938, Adjusted R-squared: 0.4905
#> F-statistic: 153.3 on 14 and 2201 DF, p-value: < 0.00000000000000022
Kombinasi antara backward dan forward
# Stepwise selection in both directions: start from the intercept-only model;
# at each step a term may be added ("+") or removed ("-"), with the terms of
# model_all_store as the upper limit of the search.
model_both <- stats::step(object = model_store_none,
direction = "both",
scope = list(upper = model_all_store))
#> Start: AIC=4469.81
#> NumWebPurchases ~ 1
#>
#> Df Sum of Sq RSS AIC
#> + MntWines 1 5103.4 11538 3660.2
#> + NumStorePurchases 1 4434.9 12206 3785.0
#> + MntGoldProds 1 2757.4 13883 4070.3
#> + Income 1 2503.6 14137 4110.5
#> + NumCatalogPurchases 1 2490.6 14150 4112.5
#> + Kidhome 2 2447.3 14194 4121.3
#> + MntSweetProducts 1 1855.7 14785 4209.8
#> + MntMeatProducts 1 1569.3 15072 4252.3
#> + MntFruits 1 1518.1 15123 4259.8
#> + MntFishProducts 1 1494.6 15146 4263.3
#> + NumDealsPurchases 1 970.1 15671 4338.7
#> + Dt_Customer 1 494.2 16147 4405.0
#> + Teenhome 2 450.7 16190 4413.0
#> + Year_Birth 1 389.8 16251 4419.3
#> + Response 1 381.6 16259 4420.4
#> + Education 4 340.9 16300 4431.9
#> + NumWebVisitsMonth 1 43.7 16597 4466.0
#> <none> 16641 4469.8
#> + Id 1 5.7 16635 4471.1
#> + Complain 1 4.6 16636 4471.2
#> + Recency 1 0.5 16640 4471.7
#> + Marital_Status 7 79.5 16561 4473.2
#>
#> Step: AIC=3660.17
#> NumWebPurchases ~ MntWines
#>
#> Df Sum of Sq RSS AIC
#> + NumDealsPurchases 1 931.0 10606 3475.7
#> + NumStorePurchases 1 738.0 10800 3515.7
#> + MntGoldProds 1 707.1 10830 3522.0
#> + Teenhome 2 454.5 11083 3575.1
#> + NumWebVisitsMonth 1 299.8 11238 3603.8
#> + MntSweetProducts 1 272.3 11265 3609.2
#> + Kidhome 2 225.8 11312 3620.4
#> + MntFruits 1 150.6 11387 3633.1
#> + Dt_Customer 1 137.8 11400 3635.6
#> + MntFishProducts 1 124.7 11413 3638.1
#> + Income 1 113.7 11424 3640.2
#> + Year_Birth 1 71.6 11466 3648.4
#> + Education 4 77.5 11460 3653.2
#> + NumCatalogPurchases 1 34.8 11503 3655.5
#> <none> 11538 3660.2
#> + Response 1 4.0 11534 3661.4
#> + Recency 1 3.4 11534 3661.5
#> + MntMeatProducts 1 1.5 11536 3661.9
#> + Id 1 0.8 11537 3662.0
#> + Complain 1 0.5 11537 3662.1
#> + Marital_Status 7 47.4 11490 3665.0
#> - MntWines 1 5103.4 16641 4469.8
#>
#> Step: AIC=3475.73
#> NumWebPurchases ~ MntWines + NumDealsPurchases
#>
#> Df Sum of Sq RSS AIC
#> + MntGoldProds 1 626.0 9980.5 3342.9
#> + NumStorePurchases 1 617.2 9989.3 3344.9
#> + Kidhome 2 587.7 10018.8 3353.4
#> + MntSweetProducts 1 434.1 10172.4 3385.1
#> + MntFruits 1 289.9 10316.5 3416.3
#> + MntFishProducts 1 264.4 10342.1 3421.8
#> + Income 1 197.4 10409.1 3436.1
#> + Teenhome 2 102.5 10504.0 3458.2
#> + Year_Birth 1 44.9 10561.6 3468.3
#> + NumCatalogPurchases 1 43.6 10562.9 3468.6
#> + Education 4 71.3 10535.2 3468.8
#> + NumWebVisitsMonth 1 42.6 10563.8 3468.8
#> + Dt_Customer 1 32.5 10574.0 3470.9
#> + MntMeatProducts 1 12.1 10594.3 3475.2
#> <none> 10606.5 3475.7
#> + Response 1 3.8 10602.6 3476.9
#> + Recency 1 3.7 10602.8 3477.0
#> + Complain 1 0.4 10606.1 3477.6
#> + Id 1 0.1 10606.4 3477.7
#> + Marital_Status 7 23.9 10582.6 3484.7
#> - NumDealsPurchases 1 931.0 11537.5 3660.2
#> - MntWines 1 5064.3 15670.8 4338.7
#>
#> Step: AIC=3342.92
#> NumWebPurchases ~ MntWines + NumDealsPurchases + MntGoldProds
#>
#> Df Sum of Sq RSS AIC
#> + NumStorePurchases 1 417.17 9563.3 3250.3
#> + Kidhome 2 361.40 9619.1 3265.2
#> + MntSweetProducts 1 226.13 9754.3 3294.1
#> + NumWebVisitsMonth 1 120.28 9860.2 3318.0
#> + Teenhome 2 128.79 9851.7 3318.1
#> + Income 1 114.34 9866.1 3319.4
#> + MntFruits 1 99.63 9880.8 3322.7
#> + MntFishProducts 1 70.17 9910.3 3329.3
#> + Year_Birth 1 45.37 9935.1 3334.8
#> + Dt_Customer 1 12.63 9967.8 3342.1
#> + Education 4 36.51 9944.0 3342.8
#> <none> 9980.5 3342.9
#> + Recency 1 4.94 9975.5 3343.8
#> + MntMeatProducts 1 1.70 9978.8 3344.5
#> + Complain 1 1.16 9979.3 3344.7
#> + Response 1 0.55 9979.9 3344.8
#> + Id 1 0.15 9980.3 3344.9
#> + NumCatalogPurchases 1 0.06 9980.4 3344.9
#> + Marital_Status 7 34.01 9946.5 3349.4
#> - MntGoldProds 1 626.01 10606.5 3475.7
#> - NumDealsPurchases 1 849.87 10830.3 3522.0
#> - MntWines 1 3093.09 13073.6 3939.2
#>
#> Step: AIC=3250.3
#> NumWebPurchases ~ MntWines + NumDealsPurchases + MntGoldProds +
#> NumStorePurchases
#>
#> Df Sum of Sq RSS AIC
#> + NumWebVisitsMonth 1 369.04 9194.3 3165.1
#> + Kidhome 2 190.36 9372.9 3209.7
#> + Teenhome 2 107.62 9455.7 3229.2
#> + MntSweetProducts 1 97.76 9465.6 3229.5
#> + Year_Birth 1 37.41 9525.9 3243.6
#> + Income 1 34.44 9528.9 3244.3
#> + MntMeatProducts 1 25.23 9538.1 3246.4
#> + MntFruits 1 20.85 9542.5 3247.5
#> + Response 1 19.47 9543.8 3247.8
#> + Dt_Customer 1 15.76 9547.6 3248.6
#> + NumCatalogPurchases 1 10.67 9552.6 3249.8
#> + MntFishProducts 1 9.94 9553.4 3250.0
#> <none> 9563.3 3250.3
#> + Recency 1 3.55 9559.8 3251.5
#> + Complain 1 0.61 9562.7 3252.2
#> + Id 1 0.09 9563.2 3252.3
#> + Education 4 25.78 9537.5 3252.3
#> + Marital_Status 7 33.91 9529.4 3256.4
#> - NumStorePurchases 1 417.17 9980.5 3342.9
#> - MntGoldProds 1 425.97 9989.3 3344.9
#> - NumDealsPurchases 1 764.84 10328.1 3418.8
#> - MntWines 1 1133.74 10697.0 3496.6
#>
#> Step: AIC=3165.09
#> NumWebPurchases ~ MntWines + NumDealsPurchases + MntGoldProds +
#> NumStorePurchases + NumWebVisitsMonth
#>
#> Df Sum of Sq RSS AIC
#> + Kidhome 2 317.23 8877.0 3091.3
#> + Income 1 227.67 8966.6 3111.5
#> + MntSweetProducts 1 204.55 8989.7 3117.2
#> + Teenhome 2 103.53 9090.7 3144.0
#> + MntFruits 1 72.22 9122.0 3149.6
#> + Year_Birth 1 66.71 9127.6 3151.0
#> + MntFishProducts 1 60.81 9133.5 3152.4
#> + NumCatalogPurchases 1 26.30 9168.0 3160.7
#> + Response 1 13.02 9181.2 3164.0
#> + Education 4 33.59 9160.7 3165.0
#> <none> 9194.3 3165.1
#> + MntMeatProducts 1 6.69 9187.6 3165.5
#> + Dt_Customer 1 4.09 9190.2 3166.1
#> + Recency 1 2.18 9192.1 3166.6
#> + Complain 1 0.34 9193.9 3167.0
#> + Id 1 0.08 9194.2 3167.1
#> + Marital_Status 7 27.83 9166.4 3172.4
#> - NumDealsPurchases 1 290.19 9484.5 3232.0
#> - NumWebVisitsMonth 1 369.04 9563.3 3250.3
#> - MntGoldProds 1 512.06 9706.3 3283.2
#> - NumStorePurchases 1 665.93 9860.2 3318.0
#> - MntWines 1 1170.00 10364.3 3428.5
#>
#> Step: AIC=3091.28
#> NumWebPurchases ~ MntWines + NumDealsPurchases + MntGoldProds +
#> NumStorePurchases + NumWebVisitsMonth + Kidhome
#>
#> Df Sum of Sq RSS AIC
#> + Income 1 213.80 8663.2 3039.3
#> + MntSweetProducts 1 179.05 8698.0 3048.1
#> + MntFruits 1 61.29 8815.7 3077.9
#> + MntFishProducts 1 50.21 8826.8 3080.7
#> + Teenhome 2 41.70 8835.3 3084.9
#> + Year_Birth 1 17.68 8859.4 3088.9
#> + Response 1 16.63 8860.4 3089.1
#> + Education 4 35.24 8841.8 3090.5
#> + Dt_Customer 1 10.49 8866.5 3090.7
#> <none> 8877.0 3091.3
#> + NumCatalogPurchases 1 5.38 8871.7 3091.9
#> + MntMeatProducts 1 3.23 8873.8 3092.5
#> + Recency 1 1.23 8875.8 3093.0
#> + Complain 1 0.95 8876.1 3093.0
#> + Id 1 0.04 8877.0 3093.3
#> + Marital_Status 7 23.66 8853.4 3099.4
#> - Kidhome 2 317.23 9194.3 3165.1
#> - MntGoldProds 1 377.46 9254.5 3181.6
#> - NumDealsPurchases 1 423.08 9300.1 3192.5
#> - NumStorePurchases 1 461.96 9339.0 3201.7
#> - NumWebVisitsMonth 1 495.91 9372.9 3209.7
#> - MntWines 1 834.10 9711.1 3288.3
#>
#> Step: AIC=3039.26
#> NumWebPurchases ~ MntWines + NumDealsPurchases + MntGoldProds +
#> NumStorePurchases + NumWebVisitsMonth + Kidhome + Income
#>
#> Df Sum of Sq RSS AIC
#> + MntSweetProducts 1 136.06 8527.2 3006.2
#> + MntFruits 1 39.10 8624.1 3031.2
#> + MntFishProducts 1 32.01 8631.2 3033.1
#> + Teenhome 2 29.17 8634.1 3035.8
#> + Year_Birth 1 12.46 8650.8 3038.1
#> + Response 1 11.43 8651.8 3038.3
#> + Dt_Customer 1 10.95 8652.3 3038.5
#> <none> 8663.2 3039.3
#> + MntMeatProducts 1 1.26 8662.0 3040.9
#> + Complain 1 1.02 8662.2 3041.0
#> + Recency 1 0.56 8662.7 3041.1
#> + Id 1 0.06 8663.2 3041.2
#> + NumCatalogPurchases 1 0.00 8663.2 3041.3
#> + Education 4 17.28 8646.0 3042.8
#> + Marital_Status 7 21.20 8642.0 3047.8
#> - Income 1 213.80 8877.0 3091.3
#> - Kidhome 2 303.36 8966.6 3111.5
#> - MntGoldProds 1 350.57 9013.8 3125.2
#> - NumDealsPurchases 1 388.72 9052.0 3134.5
#> - NumStorePurchases 1 399.36 9062.6 3137.1
#> - MntWines 1 482.53 9145.8 3157.4
#> - NumWebVisitsMonth 1 689.67 9352.9 3207.0
#>
#> Step: AIC=3006.18
#> NumWebPurchases ~ MntWines + NumDealsPurchases + MntGoldProds +
#> NumStorePurchases + NumWebVisitsMonth + Kidhome + Income +
#> MntSweetProducts
#>
#> Df Sum of Sq RSS AIC
#> + Teenhome 2 57.48 8469.7 2995.2
#> + Year_Birth 1 21.68 8505.5 3002.5
#> + Dt_Customer 1 20.72 8506.5 3002.8
#> + MntMeatProducts 1 19.56 8507.6 3003.1
#> <none> 8527.2 3006.2
#> + Response 1 6.70 8520.5 3006.4
#> + NumCatalogPurchases 1 5.18 8522.0 3006.8
#> + MntFruits 1 4.68 8522.5 3007.0
#> + Education 4 27.29 8499.9 3007.1
#> + MntFishProducts 1 2.03 8525.1 3007.7
#> + Complain 1 1.12 8526.1 3007.9
#> + Recency 1 1.07 8526.1 3007.9
#> + Id 1 0.02 8527.2 3008.2
#> + Marital_Status 7 20.62 8506.5 3014.8
#> - MntSweetProducts 1 136.06 8663.2 3039.3
#> - Income 1 170.80 8698.0 3048.1
#> - MntGoldProds 1 270.35 8797.5 3073.3
#> - Kidhome 2 282.67 8809.8 3074.4
#> - NumStorePurchases 1 316.40 8843.6 3084.9
#> - NumDealsPurchases 1 419.36 8946.5 3110.6
#> - MntWines 1 466.62 8993.8 3122.2
#> - NumWebVisitsMonth 1 762.43 9289.6 3194.0
#>
#> Step: AIC=2995.19
#> NumWebPurchases ~ MntWines + NumDealsPurchases + MntGoldProds +
#> NumStorePurchases + NumWebVisitsMonth + Kidhome + Income +
#> MntSweetProducts + Teenhome
#>
#> Df Sum of Sq RSS AIC
#> + Dt_Customer 1 15.69 8454.0 2993.1
#> + Response 1 15.17 8454.5 2993.2
#> + MntFruits 1 10.28 8459.4 2994.5
#> + MntFishProducts 1 7.72 8462.0 2995.2
#> <none> 8469.7 2995.2
#> + Year_Birth 1 5.38 8464.3 2995.8
#> + MntMeatProducts 1 3.82 8465.9 2996.2
#> + Recency 1 1.36 8468.3 2996.8
#> + Complain 1 1.08 8468.6 2996.9
#> + NumCatalogPurchases 1 0.62 8469.1 2997.0
#> + Id 1 0.05 8469.6 2997.2
#> + Education 4 18.81 8450.9 2998.3
#> + Marital_Status 7 16.79 8452.9 3004.8
#> - Teenhome 2 57.48 8527.2 3006.2
#> - Income 1 150.89 8620.6 3032.3
#> - MntSweetProducts 1 164.36 8634.1 3035.8
#> - Kidhome 2 214.88 8684.6 3046.7
#> - NumDealsPurchases 1 252.87 8722.6 3058.4
#> - MntGoldProds 1 285.36 8755.1 3066.6
#> - NumStorePurchases 1 309.50 8779.2 3072.7
#> - MntWines 1 490.64 8960.3 3118.0
#> - NumWebVisitsMonth 1 738.38 9208.1 3178.4
#>
#> Step: AIC=2993.08
#> NumWebPurchases ~ MntWines + NumDealsPurchases + MntGoldProds +
#> NumStorePurchases + NumWebVisitsMonth + Kidhome + Income +
#> MntSweetProducts + Teenhome + Dt_Customer
#>
#> Df Sum of Sq RSS AIC
#> + Response 1 19.40 8434.6 2990.0
#> + MntFruits 1 11.12 8442.9 2992.2
#> + MntFishProducts 1 9.26 8444.7 2992.7
#> <none> 8454.0 2993.1
#> + Year_Birth 1 5.12 8448.9 2993.7
#> + MntMeatProducts 1 2.56 8451.4 2994.4
#> + Complain 1 1.54 8452.5 2994.7
#> + Recency 1 1.04 8453.0 2994.8
#> + NumCatalogPurchases 1 0.21 8453.8 2995.0
#> + Id 1 0.04 8454.0 2995.1
#> - Dt_Customer 1 15.69 8469.7 2995.2
#> + Education 4 16.49 8437.5 2996.8
#> - Teenhome 2 52.45 8506.5 3002.8
#> + Marital_Status 7 16.25 8437.8 3002.8
#> - Income 1 150.84 8604.8 3030.3
#> - MntSweetProducts 1 172.21 8626.2 3035.8
#> - Kidhome 2 222.97 8677.0 3046.8
#> - NumDealsPurchases 1 263.33 8717.3 3059.1
#> - MntGoldProds 1 295.20 8749.2 3067.1
#> - NumStorePurchases 1 316.51 8770.5 3072.5
#> - MntWines 1 501.07 8955.1 3118.7
#> - NumWebVisitsMonth 1 733.05 9187.1 3175.4
#>
#> Step: AIC=2989.99
#> NumWebPurchases ~ MntWines + NumDealsPurchases + MntGoldProds +
#> NumStorePurchases + NumWebVisitsMonth + Kidhome + Income +
#> MntSweetProducts + Teenhome + Dt_Customer + Response
#>
#> Df Sum of Sq RSS AIC
#> + MntFruits 1 10.29 8424.3 2989.3
#> + MntFishProducts 1 9.50 8425.1 2989.5
#> <none> 8434.6 2990.0
#> + Year_Birth 1 4.96 8429.6 2990.7
#> + MntMeatProducts 1 4.40 8430.2 2990.8
#> + Complain 1 1.48 8433.1 2991.6
#> + NumCatalogPurchases 1 0.69 8433.9 2991.8
#> + Id 1 0.01 8434.6 2992.0
#> + Recency 1 0.00 8434.6 2992.0
#> - Response 1 19.40 8454.0 2993.1
#> - Dt_Customer 1 19.92 8454.5 2993.2
#> + Education 4 13.39 8421.2 2994.5
#> + Marital_Status 7 17.53 8417.1 2999.4
#> - Teenhome 2 61.50 8496.1 3002.1
#> - Income 1 144.35 8579.0 3025.6
#> - MntSweetProducts 1 168.83 8603.4 3031.9
#> - Kidhome 2 223.11 8657.7 3043.8
#> - NumDealsPurchases 1 256.77 8691.4 3054.4
#> - MntGoldProds 1 286.22 8720.8 3061.9
#> - NumStorePurchases 1 333.83 8768.4 3074.0
#> - MntWines 1 435.73 8870.3 3099.6
#> - NumWebVisitsMonth 1 724.90 9159.5 3170.7
#>
#> Step: AIC=2989.29
#> NumWebPurchases ~ MntWines + NumDealsPurchases + MntGoldProds +
#> NumStorePurchases + NumWebVisitsMonth + Kidhome + Income +
#> MntSweetProducts + Teenhome + Dt_Customer + Response + MntFruits
#>
#> Df Sum of Sq RSS AIC
#> <none> 8424.3 2989.3
#> + MntMeatProducts 1 7.44 8416.9 2989.3
#> + MntFishProducts 1 5.42 8418.9 2989.9
#> + Year_Birth 1 5.23 8419.1 2989.9
#> - MntFruits 1 10.29 8434.6 2990.0
#> + Complain 1 1.34 8423.0 2990.9
#> + NumCatalogPurchases 1 1.24 8423.1 2991.0
#> + Id 1 0.02 8424.3 2991.3
#> + Recency 1 0.00 8424.3 2991.3
#> - Response 1 18.57 8442.9 2992.2
#> - Dt_Customer 1 20.71 8445.0 2992.7
#> + Education 4 14.47 8409.8 2993.5
#> + Marital_Status 7 18.03 8406.3 2998.5
#> - Teenhome 2 66.94 8491.3 3002.8
#> - MntSweetProducts 1 124.28 8548.6 3019.7
#> - Income 1 137.20 8561.5 3023.1
#> - Kidhome 2 218.26 8642.6 3042.0
#> - NumDealsPurchases 1 259.10 8683.4 3054.4
#> - MntGoldProds 1 260.85 8685.2 3054.9
#> - NumStorePurchases 1 313.03 8737.3 3068.1
#> - MntWines 1 436.88 8861.2 3099.3
#> - NumWebVisitsMonth 1 734.35 9158.7 3172.5
# Coefficient table and fit statistics of model_both, the model reached at
# the last step of the stepwise trace above.
summary(model_both)#>
#> Call:
#> lm(formula = NumWebPurchases ~ MntWines + NumDealsPurchases +
#> MntGoldProds + NumStorePurchases + NumWebVisitsMonth + Kidhome +
#> Income + MntSweetProducts + Teenhome + Dt_Customer + Response +
#> MntFruits, data = Store_Clean1)
#>
#> Residuals:
#> Min 1Q Median 3Q Max
#> -9.3201 -0.9560 -0.1447 0.9101 23.9763
#>
#> Coefficients:
#> Estimate Std. Error t value Pr(>|t|)
#> (Intercept) -8.460004215 3.224269858 -2.624 0.00875 **
#> MntWines 0.001990440 0.000186305 10.684 < 0.0000000000000002 ***
#> NumDealsPurchases 0.221399793 0.026909244 8.228 0.000000000000000323 ***
#> MntGoldProds 0.007781722 0.000942617 8.255 0.000000000000000258 ***
#> NumStorePurchases 0.175033884 0.019354607 9.044 < 0.0000000000000002 ***
#> NumWebVisitsMonth 0.346219548 0.024995291 13.851 < 0.0000000000000002 ***
#> Kidhome1 -0.869893458 0.115772225 -7.514 0.000000000000083007 ***
#> Kidhome2 -0.805239062 0.304634404 -2.643 0.00827 **
#> Income 0.000014175 0.000002368 5.987 0.000000002486904851 ***
#> MntSweetProducts 0.007596427 0.001333106 5.698 0.000000013724060582 ***
#> Teenhome1 0.396645348 0.097814690 4.055 0.000051863498942279 ***
#> Teenhome2 0.539861619 0.286069018 1.887 0.05927 .
#> Dt_Customer 0.000462151 0.000198655 2.326 0.02009 *
#> Response1 0.276195265 0.125384542 2.203 0.02771 *
#> MntFruits 0.002279765 0.001390198 1.640 0.10117
#> ---
#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#>
#> Residual standard error: 1.956 on 2201 degrees of freedom
#> Multiple R-squared: 0.4938, Adjusted R-squared: 0.4905
#> F-statistic: 153.3 on 14 and 2201 DF, p-value: < 0.00000000000000022
Bandingkan nilai Adjusted R-squared untuk ketiga model:
# Adjusted R-squared of each stepwise model, extracted by exact element name.
summary(model_backward)[["adj.r.squared"]]#> [1] 0.4909976
summary(model_forward)[["adj.r.squared"]]#> [1] 0.4905378
summary(model_both)[["adj.r.squared"]]#> [1] 0.4905378
Bandingkan performa kelima model, yaitu `model_store_none`, `model_all_store`, dan ketiga model hasil stepwise regression (`model_backward`, `model_forward`, `model_both`), menggunakan fungsi `compare_performance()` dari package `performance`:
# Side-by-side performance indices for the five candidate models, displayed
# as a plain data frame.
library(performance)
comparison <- compare_performance(
  model_store_none,
  model_all_store,
  model_backward,
  model_forward,
  model_both
)
as.data.frame(comparison)
# ordinary (point) prediction
# Predictions from model_backward for every row of Store_Clean1
# (no interval requested, so the result is a plain numeric vector).
pred_model_step <- predict(model_backward, newdata = Store_Clean1)
head(pred_model_step)#> 1 2 3 4 5 6
#> 5.887996 4.582988 2.730530 2.080234 2.327834 3.881064
Prediksi dengan batas atas-bawah (prediction interval):
# Same predictions, now with the lower/upper bounds of a 95% prediction
# interval (columns fit, lwr, upr).
pred_model_step_interval <- predict(
  model_backward,
  newdata = Store_Clean1,
  level = 0.95,
  interval = "prediction"
)
head(pred_model_step_interval)#> fit lwr upr
#> 1 5.887996 2.01661236 9.759380
#> 2 4.582988 0.73401653 8.431960
#> 3 2.730530 -1.11145960 6.572519
#> 4 2.080234 -1.76427349 5.924742
#> 5 2.327834 -1.52011205 6.175780
#> 6 3.881064 0.01432816 7.747800
Ilustrasi prediction interval (level 90%) untuk
NumWebPurchases ~ MntMeatProducts
# Requires ggplot2 (install.packages("ggplot2") if it is missing).
# Build a 90% prediction band from model_clean1 and bind it column-wise to
# the data so that fit / lwr / upr can be drawn over the scatter plot.
# NOTE(review): cbind() assumes model_clean1 was fitted on every row of
# Store_Clean1 -- confirm where model_clean1 is defined.
pred.int <- predict(model_clean1, level = 0.90, interval = "prediction")
mydata <- cbind(Store_Clean1, pred.int)

# Points first, then the fitted line (blue) and the band limits (dashed red).
ggplot(mydata, aes(x = MntMeatProducts, y = NumWebPurchases)) +
  geom_point() +
  geom_line(aes(y = fit), color = "blue") +
  geom_line(aes(y = lwr), color = "red", linetype = "dashed") +
  geom_line(aes(y = upr), color = "red", linetype = "dashed") +
  labs(title = "Linear Regression jumlah pembelian melalui situs web perusahaan by Produk Daging") +
  theme_minimal()
# Asumsi Linear Regression
## Linearity
# Linearity check: diagnostic plot number 1 of the lm object
# (residuals against fitted values).
plot(model_backward, which = 1)
# Green horizontal reference line at +10, the upper edge of the tolerance
# band used in this report.
abline(h = 10, col = "green")
abline(h = -10, col = "green")
Nilai residual “bounce randomly” di sekitar nilai 0. Kesimpulan: garis merah masih berada di dalam batas toleransi kita (±10), sehingga asumsi linearity untuk model_backward terpenuhi.
## Normality of Residuals
Model linear regression diharapkan menghasilkan error yang berdistribusi normal. Dengan begitu, error lebih banyak berkumpul di sekitar angka nol.
# histogram residual
hist(model_backward$residuals)
Uji statistik dengan `shapiro.test()`:
# Shapiro-Wilk normality test on the residuals of model_backward
# (H0: the residuals are normally distributed).
# The recorded output below reports a p-value < 0.05, i.e. the residuals
# deviate from a normal distribution.
shapiro.test(model_backward$residuals)#>
#> Shapiro-Wilk normality test
#>
#> data: model_backward$residuals
#> W = 0.87072, p-value < 0.00000000000000022
# Homoscedasticity check: residuals against fitted values, with a red
# reference line at zero.
plot(x = model_backward$fitted.values, y = model_backward$residuals)
abline(h = 0, col = "red")

# Multicollinearity check: (generalised) variance inflation factors.
# library(car) used to sit on the same line as abline(), which R cannot
# parse; each statement now has its own line.
library(car)
vif(model_backward)#> GVIF Df GVIF^(1/(2*Df))
#> Income 2.141281 1 1.463311
#> Kidhome 1.896052 2 1.173445
#> Teenhome 1.518718 2 1.110118
#> Dt_Customer 1.244029 1 1.115360
#> MntWines 2.426365 1 1.557679
#> MntFruits 1.936992 1 1.391759
#> MntMeatProducts 2.633190 1 1.622711
#> MntFishProducts 2.080385 1 1.442354
#> MntSweetProducts 1.874324 1 1.369060
#> MntGoldProds 1.420849 1 1.191994
#> NumDealsPurchases 1.586058 1 1.259388
#> NumStorePurchases 2.307276 1 1.518972
#> NumWebVisitsMonth 2.293595 1 1.514462
#> Response 1.176880 1 1.084841
Kesimpulan: Dari uji VIF, prediktor di model_backward lolos uji asumsi multicollinearity (tidak ada nilai VIF > 10)
bptest() dari package lmtest
# Breusch-Pagan test on model_backward
# (H0: the error variance is constant, i.e. homoscedasticity).
library(lmtest)
bptest(model_backward)#>
#> studentized Breusch-Pagan test
#>
#> data: model_backward
#> BP = 137.21, df = 16, p-value < 0.00000000000000022
**Kesimpulan**: karena nilai p-value dari BP test < 0.05 (p-value < 0.00000000000000022), H0 ditolak, sehingga asumsi homoscedasticity tidak terpenuhi (error bersifat heteroscedastic).
# Variance inflation factors for the predictors of model_backward
# (vif() is provided by the car package).
library(car)
vif(model_backward)#> GVIF Df GVIF^(1/(2*Df))
#> Income 2.141281 1 1.463311
#> Kidhome 1.896052 2 1.173445
#> Teenhome 1.518718 2 1.110118
#> Dt_Customer 1.244029 1 1.115360
#> MntWines 2.426365 1 1.557679
#> MntFruits 1.936992 1 1.391759
#> MntMeatProducts 2.633190 1 1.622711
#> MntFishProducts 2.080385 1 1.442354
#> MntSweetProducts 1.874324 1 1.369060
#> MntGoldProds 1.420849 1 1.191994
#> NumDealsPurchases 1.586058 1 1.259388
#> NumStorePurchases 2.307276 1 1.518972
#> NumWebVisitsMonth 2.293595 1 1.514462
#> Response 1.176880 1 1.084841
**Kesimpulan**: Dari uji VIF, prediktor di model_backward lolos uji asumsi multicollinearity (tidak ada nilai VIF > 10)