Package

Beberapa package yang akan dipakai antara lain:

library(MASS)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ✖ dplyr::select() masks MASS::select()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(GGally)
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## 
## The following object is masked from 'package:purrr':
## 
##     lift
library(leaps)
library(knitr)

Pembangkitan data

Dengan n = 1000, dilakukan pembangkitan data dengan ketentuan:

Karena k = jumlahan 3 digit terakhir nim -> 0+2+3 = 5, maka:

Pembangkitan data berkorelasi sebagai berikut:

set.seed(1)
sigma12 <- rbind(c(1,0.65), c(0.65,1))
mu12 <- c(2, 2) 

sigma12 <- rbind(c(1,0.65), c(0.65,1))
mu12 <- c(2, 2) 

sigma45 <- rbind(c(1, -0.85), c(-0.85, 1))
mu45 <- c(2,2)

# generate the multivariate normal distribution
df12 <- as.data.frame(mvrnorm(n=1000, mu=mu12, Sigma=sigma12))
df45 <- as.data.frame(mvrnorm(n=1000, mu=mu45, Sigma=sigma45))

Untuk memeriksa apakah korelasinya sudah benar, diperoleh:

cor(df12$V1, df12$V2)
## [1] 0.6471789
cor(df45$V1, df45$V2)
## [1] -0.8479774
names(df12) <- c("x1", "x2")
names(df45) <- c("x4", "x5")

Syntax di atas digunakan untuk mengganti nama kolom hasil bangkitan, yang selanjutnya membangkitkan peubah sisanya yang tidak berkorelasi dan menyimpannya dalam objek data frame bernama df sebagai berikut:

set.seed(1)
df <- cbind(df12, data.frame(x3=rnorm(1000, 2, 1.5)), df45)
for(i in 6:100){
  df[paste0("x", toString(i))] <- rnorm(2, 1.5)
}

Mengganti nilai X11 menjadi X11 = 1.15(X1) + \(\mu\) dan X12 = -0.75X2 + \(\mu\), di mana \(\mu\) ~ N(0, 1.25) sebagai berikut:

set.seed(1)
miu <- rnorm(1000, 0, 1.25)
df$x11 <- 1.15*df$x1 + miu
df$x12 <- -0.75*df$x2 + miu

Membangkitkan nilai parameter \(\beta\), dengan ketentuan \(\beta_0\) = 0.5; \(\beta_1\) = … = \(\beta_25\) = 2.5; \(\beta_26\) = … = \(\beta_100\) = 0 sebagai berikut:

beta <- c(0.5, rep(2.5, 25), rep(0, 75))
length(beta)
## [1] 101

Parameter error/galat, \(\epsilon\) ~ N (0, 1.35) dibangkitkan dengan cara sebagai berikut:

set.seed(1)
eps <- rnorm(1000, 0, 1.35)

Kemudian dicari nilai peubah responnya, dan seluruh peubah Y, X1, .., X100 disimpan dalam objek data frame bernama df yang selanjutnya disimpan dalam file berformat .csv dengan nama “DAT.SIM.01.csv”.

df["y"] <- beta[1]
for(i in 1:100){
  df["y"] <- df["y"] + beta[i+1]*df[paste0("x", toString(i))]
}
df["y"] <- df["y"] + eps
kable(head(df))
x1 x2 x3 x4 x5 x6 x7 x8 x9 x10 x11 x12 x13 x14 x15 x16 x17 x18 x19 x20 x21 x22 x23 x24 x25 x26 x27 x28 x29 x30 x31 x32 x33 x34 x35 x36 x37 x38 x39 x40 x41 x42 x43 x44 x45 x46 x47 x48 x49 x50 x51 x52 x53 x54 x55 x56 x57 x58 x59 x60 x61 x62 x63 x64 x65 x66 x67 x68 x69 x70 x71 x72 x73 x74 x75 x76 x77 x78 x79 x80 x81 x82 x83 x84 x85 x86 x87 x88 x89 x90 x91 x92 x93 x94 x95 x96 x97 x98 x99 x100 y
0.9562051 1.9057850 1.0603193 2.6498564 0.9453137 2.634965 0.6292224 1.5693956 2.3108400 0.2532466 0.3165686 -2.2124060 1.850910 0.9085715 0.4027015 1.173510 2.285006 1.7948088 0.4904962 0.1916465 0.9664604 0.7104305 2.377185 1.267536 3.156004 1.970490 0.5220971 3.419770 2.242082 1.985389 1.541999 0.4895349 2.301962 2.7128894 3.211158 -0.8214909 2.6322291 0.0896250 1.2309865 0.685532 2.3555192 1.376397 3.2189263 -0.1043103 2.055737 2.272086 1.893094 1.5235420 2.762009 2.166764 3.281525 1.162309 1.3746908 3.197394 0.7333834 1.7418959 2.989907 1.683584 0.5058755 1.451457 1.573830 1.834980 0.0970941 0.7101996 1.3951479 1.400463 0.7814885 2.7454892 1.2846155 0.8258307 3.0423258 0.6278205 1.679805 0.3014664 2.8663086 2.185512 0.1946041 2.295174 0.5960065 2.3140634 -0.3742053 2.2719040 1.2620845 1.616806 1.4663152 0.7906365 1.605138 -1.7131885 2.262906 0.2916822 1.1244193 1.996613 2.488092 1.170133 0.5189242 3.685438 1.1946911 2.271269 1.912157 3.5737836 76.14073
1.7016479 2.6319568 2.2754650 3.7428884 0.0453573 2.611932 1.7107316 -0.1626489 -0.4123458 2.4981544 2.1864492 -1.7444134 1.325453 0.1659727 3.5361036 2.274005 2.263246 0.2476441 2.2513912 2.0275401 1.1016240 1.2698589 1.953733 2.370005 1.493631 1.778219 0.5734139 2.381278 1.647573 1.651856 1.723422 3.9012221 1.248792 0.8727419 1.105626 2.8641192 0.7256837 -0.3345276 -0.3339286 1.663572 0.6800369 1.754948 0.5414565 -0.3456094 1.439881 1.359161 1.724219 0.8770373 1.094226 1.664639 2.211214 1.490851 -0.5908461 2.563881 1.8820076 0.3672406 1.251753 1.904871 0.4145707 2.076086 2.205946 2.045388 2.1770539 1.0342711 -0.1478511 1.060142 0.9454024 0.2410786 -0.9719617 0.9987028 0.5379819 0.1023704 2.654092 1.0742756 0.8157026 1.889504 2.7168880 1.011797 1.1095816 0.9357507 1.3570953 0.3408821 0.2778067 1.367500 0.8776736 2.3714454 1.313065 0.2243813 1.093185 1.0606773 0.9985576 3.020881 2.746122 2.344345 1.3607786 1.487172 0.9157865 3.606189 1.238735 0.7211698 99.16138
1.6052750 0.8767302 0.7465571 0.0871887 3.2027453 2.634965 0.6292224 1.5693956 2.3108400 0.2532466 0.8015305 -1.7020834 1.850910 0.9085715 0.4027015 1.173510 2.285006 1.7948088 0.4904962 0.1916465 0.9664604 0.7104305 2.377185 1.267536 3.156004 1.970490 0.5220971 3.419770 2.242082 1.985389 1.541999 0.4895349 2.301962 2.7128894 3.211158 -0.8214909 2.6322291 0.0896250 1.2309865 0.685532 2.3555192 1.376397 3.2189263 -0.1043103 2.055737 2.272086 1.893094 1.5235420 2.762009 2.166764 3.281525 1.162309 1.3746908 3.197394 0.7333834 1.7418959 2.989907 1.683584 0.5058755 1.451457 1.573830 1.834980 0.0970941 0.7101996 1.3951479 1.400463 0.7814885 2.7454892 1.2846155 0.8258307 3.0423258 0.6278205 1.679805 0.3014664 2.8663086 2.185512 0.1946041 2.295174 0.5960065 2.3140634 -0.3742053 2.2719040 1.2620845 1.616806 1.4663152 0.7906365 1.605138 -1.7131885 2.262906 0.2916822 1.1244193 1.996613 2.488092 1.170133 0.5189242 3.685438 1.1946911 2.271269 1.912157 3.5737836 75.84910
3.3608304 3.5371411 4.3929212 1.7206457 2.7194813 2.611932 1.7107316 -0.1626489 -0.4123458 2.4981544 5.8590560 -0.6587548 1.325453 0.1659727 3.5361036 2.274005 2.263246 0.2476441 2.2513912 2.0275401 1.1016240 1.2698589 1.953733 2.370005 1.493631 1.778219 0.5734139 2.381278 1.647573 1.651856 1.723422 3.9012221 1.248792 0.8727419 1.105626 2.8641192 0.7256837 -0.3345276 -0.3339286 1.663572 0.6800369 1.754948 0.5414565 -0.3456094 1.439881 1.359161 1.724219 0.8770373 1.094226 1.664639 2.211214 1.490851 -0.5908461 2.563881 1.8820076 0.3672406 1.251753 1.904871 0.4145707 2.076086 2.205946 2.045388 2.1770539 1.0342711 -0.1478511 1.060142 0.9454024 0.2410786 -0.9719617 0.9987028 0.5379819 0.1023704 2.654092 1.0742756 0.8157026 1.889504 2.7168880 1.011797 1.1095816 0.9357507 1.3570953 0.3408821 0.2778067 1.367500 0.8776736 2.3714454 1.313065 0.2243813 1.093185 1.0606773 0.9985576 3.020881 2.746122 2.344345 1.3607786 1.487172 0.9157865 3.606189 1.238735 0.7211698 126.29701
2.2702600 2.3283206 2.4942617 2.4926119 2.3851824 2.634965 0.6292224 1.5693956 2.3108400 0.2532466 3.0226837 -1.3343557 1.850910 0.9085715 0.4027015 1.173510 2.285006 1.7948088 0.4904962 0.1916465 0.9664604 0.7104305 2.377185 1.267536 3.156004 1.970490 0.5220971 3.419770 2.242082 1.985389 1.541999 0.4895349 2.301962 2.7128894 3.211158 -0.8214909 2.6322291 0.0896250 1.2309865 0.685532 2.3555192 1.376397 3.2189263 -0.1043103 2.055737 2.272086 1.893094 1.5235420 2.762009 2.166764 3.281525 1.162309 1.3746908 3.197394 0.7333834 1.7418959 2.989907 1.683584 0.5058755 1.451457 1.573830 1.834980 0.0970941 0.7101996 1.3951479 1.400463 0.7814885 2.7454892 1.2846155 0.8258307 3.0423258 0.6278205 1.679805 0.3014664 2.8663086 2.185512 0.1946041 2.295174 0.5960065 2.3140634 -0.3742053 2.2719040 1.2620845 1.616806 1.4663152 0.7906365 1.605138 -1.7131885 2.262906 0.2916822 1.1244193 1.996613 2.488092 1.170133 0.5189242 3.685438 1.1946911 2.271269 1.912157 3.5737836 97.52459
1.9503085 0.5592367 0.7692974 1.0746257 2.4142117 2.611932 1.7107316 -0.1626489 -0.4123458 2.4981544 1.2172693 -1.4450130 1.325453 0.1659727 3.5361036 2.274005 2.263246 0.2476441 2.2513912 2.0275401 1.1016240 1.2698589 1.953733 2.370005 1.493631 1.778219 0.5734139 2.381278 1.647573 1.651856 1.723422 3.9012221 1.248792 0.8727419 1.105626 2.8641192 0.7256837 -0.3345276 -0.3339286 1.663572 0.6800369 1.754948 0.5414565 -0.3456094 1.439881 1.359161 1.724219 0.8770373 1.094226 1.664639 2.211214 1.490851 -0.5908461 2.563881 1.8820076 0.3672406 1.251753 1.904871 0.4145707 2.076086 2.205946 2.045388 2.1770539 1.0342711 -0.1478511 1.060142 0.9454024 0.2410786 -0.9719617 0.9987028 0.5379819 0.1023704 2.654092 1.0742756 0.8157026 1.889504 2.7168880 1.011797 1.1095816 0.9357507 1.3570953 0.3408821 0.2778067 1.367500 0.8776736 2.3714454 1.313065 0.2243813 1.093185 1.0606773 0.9985576 3.020881 2.746122 2.344345 1.3607786 1.487172 0.9157865 3.606189 1.238735 0.7211698 87.05729

Menyimpan Data

#write_csv(df, "DAT.SIM.01.csv")

Pembagian Data

Data kemudian dibagi/split menjadi 2 bagian, yaitu 80% data latih dan 20% data uji sebagai berikut:

set.seed(1)
sample <- sample(c(TRUE, FALSE), nrow(df), replace=TRUE, prob=c(0.8,0.2))
train  <- df[sample, ]
test   <- df[!sample, ]

Kemudian pada kedua data, akan dilakukan pemilihan model terbaik dengan menggunakan 5 metode: 1. Regresi Linier 2. Best-subset selecyion 3. Ridge Regression 4. LASSO 5. Elastic Net

Regresi Linier

Pembentukan model dilakukan dengan cara:

model_lm <- lm(y~., data=train)

Objek sum_lm menyimpan ringkasan model regresi linier. Selanjutnya, prediksi dengan data uji dilakukan dengan:

predict_lm <- predict(model_lm, test)

Dan diperoleh nilai R Square model ini sebesar 1.

rsq_lm <- R2(predict_lm, test$y)
rsq_lm
## [1] 1

Hal ini terjadi karena data yang digunakan merupakan data simulasi, dan data ini bisa memungkinkan terjadi over-fitting, selain metode LS sendiri juga tidak kekar terhadap overfit.

Kemudian, koefisien regresi linier diperoleh sebagai berikut:

lm_coef <- model_lm$coefficients
kable(lm_coef)
x
(Intercept) 1059.111217
x1 11.622969
x2 6.872969
x3 NA
x4 2.500000
x5 2.500000
x6 -385.648660
x7 NA
x8 NA
x9 NA
x10 NA
x11 NA
x12 NA
x13 NA
x14 NA
x15 NA
x16 NA
x17 NA
x18 NA
x19 NA
x20 NA
x21 NA
x22 NA
x23 NA
x24 NA
x25 NA
x26 NA
x27 NA
x28 NA
x29 NA
x30 NA
x31 NA
x32 NA
x33 NA
x34 NA
x35 NA
x36 NA
x37 NA
x38 NA
x39 NA
x40 NA
x41 NA
x42 NA
x43 NA
x44 NA
x45 NA
x46 NA
x47 NA
x48 NA
x49 NA
x50 NA
x51 NA
x52 NA
x53 NA
x54 NA
x55 NA
x56 NA
x57 NA
x58 NA
x59 NA
x60 NA
x61 NA
x62 NA
x63 NA
x64 NA
x65 NA
x66 NA
x67 NA
x68 NA
x69 NA
x70 NA
x71 NA
x72 NA
x73 NA
x74 NA
x75 NA
x76 NA
x77 NA
x78 NA
x79 NA
x80 NA
x81 NA
x82 NA
x83 NA
x84 NA
x85 NA
x86 NA
x87 NA
x88 NA
x89 NA
x90 NA
x91 NA
x92 NA
x93 NA
x94 NA
x95 NA
x96 NA
x97 NA
x98 NA
x99 NA
x100 NA

Best-Subset Selection

Best-subset selection dilakukan dengan menggunakan validasi silang 10 lipat sebagai berikut:

k = 10
set.seed(1)

folds = sample(1:k, nrow(train), replace = TRUE) # lipatan menggunakan data latih
cv_errors = matrix(NA, k, 100, dimnames = list(NULL, paste(1:100)))

Karena menggunakan function regsubsets yang mana function ini tidak memiliki method predict yang digunakan untuk prediksi, maka akan dilakukan prediksi dengan menambah/memodifikasi method dengan cara:

predict.regsubsets = function(object,newdata,id,...){
      form = as.formula(object$call[[2]])
      mat = model.matrix(form,newdata)
      coefi = coef(object,id=id) 
      xvars = names(coefi)   
      mat[,xvars]%*%coefi
}
mean_cv_errors = apply(cv_errors, 2, mean)
min = which.min(mean_cv_errors) # model dibentuk dengan model lipatan dengan. galat terkecil

# membuat plot error untuk masing-masing model yang terbentuk
plot(mean_cv_errors, type='b')
points(min, mean_cv_errors[min][1], col = "red", cex = 2, pch = 20)

Dari plot di atas, diperoleh berdasrkan validasi silang 10 lipat, bahwa model terbaik adalah yang memiliki prediktor sebanyak 5. Diperoleh koefisien sebagai berikut:

reg_best = regsubsets(y~., data = test, nvmax = 5, really.big = T)
## Reordering variables and trying again:
bs_coef <- coef(reg_best, 5)
kable(bs_coef)
x
(Intercept) 1092.4017690
x1 11.6434360
x2 6.6627637
x5 0.3084692
x6 -394.6256878
x21 0.0000000
bs_sum <- summary(reg_best)
cat("RSS Best Subset:", bs_sum$rss); print("")
## RSS Best Subset: 4997.053 803.7508 380.7993 337.0469 -1.364242e-12
## [1] ""
cat("R2 Adj Best Subset:", bs_sum$adjr2)
## R2 Adj Best Subset: 0.9267663 0.9881603 0.9943617 0.9949837 1
plot(bs_sum$rss, xlab='No. of Variables', ylab='RSS',type='l'); points(which.min(bs_sum$rss), bs_sum$rss[which.min(bs_sum$rss)],pch=19,col='red')

plot(bs_sum$adjr2, xlab='No. of Variables', ylab='Adj R2',type='l'); points(which.max(bs_sum$adjr2), bs_sum$adjr2[which.max(bs_sum$adjr2)],pch=19,col='red')

Hal ini selaras dengan data uji yang dimodelkan dengan peubah prediktor sebanyak 5, bahwa model yang terbentuk memiliki galat terkecil dengan R-Square 1.

rsq_bs <- bs_sum$adjr2[5]
rsq_bs
## [1] 1

Ridge Regression

Ridge regression dibentuk pula dengan menggunakan validasi silang sebagai berikut:

ctrlspecs <- trainControl(method="cv", 
                          number=10,
                          savePredictions="all")

Tuning parameter lambda dibentuk dengan menggunakan bilangan antara 0.00001 hingga 100000 sebagai berikut:

lambda_v <- 10^seq(5, -5, length=500)

Model Ridge dibentuk dengan parameter alpha = 0 dan diperoleh nilai lambda terbaik sebesar 1.7.

set.seed(1)
rg_model <- train(y ~ ., 
                data=train,
                preProcess=c("center","scale"),
                method="glmnet", 
                tuneGrid=expand.grid(alpha=0, lambda=lambda_v),
                trControl=ctrlspecs,
                na.action=na.omit)
kable(rg_model$bestTune)
alpha lambda
262 0 1.700047

Dan berikut merupakan koefisien dari ridge regression.

rg_coef <- as.vector(coef(rg_model$finalModel, rg_model$bestTune$lambda))
kable(rg_coef)
x
94.4604767
3.3744737
3.8164987
3.9476767
1.5724895
1.5896071
-0.0467993
0.0470949
-0.0472370
-0.0473332
0.0474111
3.7379309
3.1764261
-0.0476868
-0.0477034
0.0477412
0.0477737
-0.0478016
-0.0478271
0.0478510
0.0478731
0.0478932
0.0479111
-0.0479270
0.0479408
-0.0479529
-0.0479634
0.0479725
-0.0479803
-0.0479871
-0.0479931
0.0479984
0.0480032
-0.0480076
-0.0480119
-0.0480161
0.0480202
-0.0480243
-0.0480284
-0.0480324
0.0480361
-0.0480395
0.0480424
-0.0480447
-0.0480463
-0.0480470
-0.0480468
-0.0480457
-0.0480437
-0.0480408
-0.0480371
-0.0480326
0.0480274
-0.0480217
-0.0480156
0.0480091
-0.0480025
-0.0479957
0.0479890
-0.0479823
0.0479758
0.0479696
0.0479636
0.0479579
0.0479526
-0.0479476
-0.0479430
0.0479387
-0.0479346
-0.0479308
0.0479272
-0.0479237
-0.0479203
0.0479168
0.0479132
-0.0479096
-0.0479057
0.0479015
-0.0478971
0.0478924
-0.0478873
0.0478818
-0.0478760
-0.0478699
-0.0478634
-0.0478565
0.0478494
-0.0478419
0.0478342
-0.0478263
0.0478182
-0.0478099
0.0478014
0.0477928
0.0477841
0.0477753
-0.0477665
-0.0477575
0.0477485
-0.0477395
-0.0477304
plot(log(rg_model$results$lambda),
     rg_model$results$RMSE,
     xlab="log(lambda)",
     ylab="Root Mean-Squared Error (RMSE)"
     )

rsq_ridge <- function(x, y, z){
  temp <- data.frame(x, y)
  colnames(temp) <- c("Rsquared", "lambda_val")
  rownum <- which(temp$lambda_val==z)
  print(temp[rownum,]$Rsquared)
}
rsq_ridge(x=rg_model$results$Rsquared,
               y=rg_model$results$lambda,
               z=rg_model$bestTune$lambda)
## [1] 0.9990451

Ridge regression ini menghasilkan koefisien R-Squared sebesar 99% untuk data latih.

rg_pred <- predict(rg_model, newdata=test)
rsq_rg <- R2(rg_pred, test$y)
rsq_rg
## [1] 0.9991881

Ridge regression ini menghasilkan koefisien R-Squared sebesar 99% untuk data uji namun nilainya lebih besar dari R-Squared data latih.

Lasso Regression

Pembentukan model regresi LASSO juga menggunakan validasi silang 10 lipat, sebagai berikut:

ctrlspecs <- trainControl(method="cv", 
                          number=10,
                          savePredictions="all")

dengan nilai yang sama dengan Ridge Regression, parameter lambda pada LASSO dibentuk dari nilai 0.00001 hingga 100000 dengan parameter alpha = 1 sebagai berikut:

lambda_v <- 10^seq(5, -5, length=500)
set.seed(1)
ls_model <- train(y ~ ., 
                data=train,
                preProcess=c("center","scale"),
                method="glmnet", 
                tuneGrid=expand.grid(alpha=1, lambda=lambda_v),
                trControl=ctrlspecs,
                na.action=na.omit)
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo,
## : There were missing values in resampled performance measures.
kable(ls_model$bestTune)
alpha lambda
207 1 0.1343558

Diperoleh nilai lambda optimum sebesar 0.134.

ls_coef <- as.vector(coef(ls_model$finalModel, ls_model$bestTune$lambda))
kable(ls_coef)
x
94.4604767
0.0000000
0.0000000
7.5159390
1.6806389
1.6870959
-4.3136900
0.0000000
0.0000000
0.0000000
0.0000000
9.8447784
0.0000000
-0.0017206
0.0000000
0.0000000
0.0000000
0.0000000
0.0000000
0.0000000
0.0000000
0.0000000
0.0000000
0.0000000
0.0000000
0.0000000
0.0000000
0.0000000
0.0000000
0.0000000
0.0000000
0.0000000
0.0000000
0.0000000
0.0000000
0.0000000
0.0000000
0.0000000
0.0000000
0.0000000
0.0000000
0.0000000
0.0000000
0.0000000
0.0000000
0.0000000
0.0000000
0.0000000
0.0000000
0.0000000
0.0000000
0.0000000
0.0000000
0.0000000
0.0000000
0.0000000
0.0000000
0.0000000
0.0000000
0.0000000
0.0000000
0.0000000
0.0000000
0.0000000
0.0000000
0.0000000
0.0000000
0.0000000
0.0000000
0.0000000
0.0000000
0.0000000
0.0000000
0.0000000
0.0000000
0.0000000
0.0000000
0.0000000
0.0000000
0.0000000
0.0000000
0.0000000
0.0000000
0.0000000
0.0000000
0.0000000
0.0000000
0.0000000
0.0000000
0.0000000
0.0000000
0.0000000
0.0000000
0.0000000
0.0000000
0.0000000
0.0000000
0.0000000
0.0000000
0.0000000
0.0000000
plot(log(ls_model$results$lambda),
     ls_model$results$RMSE,
     xlab="log(lambda)",
     ylab="Root Mean-Squared Error (RMSE)"
     )

rsq_lasso <- function(x, y, z){
  temp <- data.frame(x, y)
  colnames(temp) <- c("Rsquared", "lambda_val")
  rownum <- which(temp$lambda_val==z)
  print(temp[rownum,]$Rsquared)
}
rsq_lasso(x=ls_model$results$Rsquared,
               y=ls_model$results$lambda,
               z=ls_model$bestTune$lambda)
## [1] 0.9992226

Model dengan data latih LASSO memberikan hasil RSquared sebesar 99.922%.

ls_pred <- predict(ls_model, newdata=test)
rsq_ls <- R2(ls_pred, test$y)
rsq_ls
## [1] 0.999281

Sedangkan prediksi dengan data uji LASSO memberikan nilai R Squared sebesar 99.928%

Elastic Net

Dengan cara yang sama, model Elastic Net dibentuk dengan menggunakan validasi silang 10 lipat sebagai berikut:

ctrlspecs <- trainControl(method="cv", 
                          number=10,
                          savePredictions="all")

Elastic net memiliki nilai alpha antara 0 dan 1, sehingga nilai alpha dicari dengan dilakukan tuning parameter dari 0.1 hingga 0.9 dengan nilai lambda yang sama sebagaimana pada Ridge dan LASSO.

lambda_v <- 10^seq(5, -5, length=500)
alpha_v <- seq(0.1, 0.9, 0.1)
set.seed(1)
en_model <- train(y ~ ., 
                data=train,
                preProcess=c("center","scale"),
                method="glmnet", 
                tuneGrid=expand.grid(alpha=alpha_v, lambda=lambda_v),
                trControl=ctrlspecs,
                na.action=na.omit)
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo,
## : There were missing values in resampled performance measures.
kable(en_model$bestTune)
alpha lambda
242 0.1 0.6755528
en_coef <- as.vector(coef(rg_model$finalModel, rg_model$bestTune$lambda))
kable(en_coef)
x
94.4604767
3.3744737
3.8164987
3.9476767
1.5724895
1.5896071
-0.0467993
0.0470949
-0.0472370
-0.0473332
0.0474111
3.7379309
3.1764261
-0.0476868
-0.0477034
0.0477412
0.0477737
-0.0478016
-0.0478271
0.0478510
0.0478731
0.0478932
0.0479111
-0.0479270
0.0479408
-0.0479529
-0.0479634
0.0479725
-0.0479803
-0.0479871
-0.0479931
0.0479984
0.0480032
-0.0480076
-0.0480119
-0.0480161
0.0480202
-0.0480243
-0.0480284
-0.0480324
0.0480361
-0.0480395
0.0480424
-0.0480447
-0.0480463
-0.0480470
-0.0480468
-0.0480457
-0.0480437
-0.0480408
-0.0480371
-0.0480326
0.0480274
-0.0480217
-0.0480156
0.0480091
-0.0480025
-0.0479957
0.0479890
-0.0479823
0.0479758
0.0479696
0.0479636
0.0479579
0.0479526
-0.0479476
-0.0479430
0.0479387
-0.0479346
-0.0479308
0.0479272
-0.0479237
-0.0479203
0.0479168
0.0479132
-0.0479096
-0.0479057
0.0479015
-0.0478971
0.0478924
-0.0478873
0.0478818
-0.0478760
-0.0478699
-0.0478634
-0.0478565
0.0478494
-0.0478419
0.0478342
-0.0478263
0.0478182
-0.0478099
0.0478014
0.0477928
0.0477841
0.0477753
-0.0477665
-0.0477575
0.0477485
-0.0477395
-0.0477304
plot(log(en_model$results$lambda),
     en_model$results$RMSE,
     xlab="log(lambda)",
     ylab="Root Mean-Squared Error (RMSE)",
     col=factor(en_model$results$alpha)
     );

rsq_elnet <- function(x, y, z){
  temp <- data.frame(x, y)
  colnames(temp) <- c("Rsquared", "lambda_val")
  rownum <- which(temp$lambda_val==z)
  print(temp[rownum,]$Rsquared)
}
rsq_elnet(x=en_model$results$Rsquared,
               y=en_model$results$lambda,
               z=en_model$bestTune$lambda)
## [1] 0.9993004 0.9987293 0.9978274 0.9966181 0.9950506 0.9936096 0.9934032
## [8] 0.9932845 0.9931450
en_pred <- predict(en_model, newdata=test)
rsq_en <- R2(en_pred, test$y)
rsq_en
## [1] 0.9993807

Diperoleh nilai R Squared Elastic Net untuk data uji sebesar 99.938%.

Hasil

coefs <- data.frame(cbind(linear = lm_coef,
                    best_subset = bs_coef,
                    ridge = rg_coef,
                    lasso = ls_coef,
                    elastic_net = en_coef))
## Warning in cbind(linear = lm_coef, best_subset = bs_coef, ridge = rg_coef, :
## number of rows of result is not a multiple of vector length (arg 2)
kable(coefs)
linear best_subset ridge lasso elastic_net
(Intercept) 1059.111217 1092.4017690 94.4604767 94.4604767 94.4604767
x1 11.622969 11.6434360 3.3744737 0.0000000 3.3744737
x2 6.872969 6.6627637 3.8164987 0.0000000 3.8164987
x3 NA 0.3084692 3.9476767 7.5159390 3.9476767
x4 2.500000 -394.6256878 1.5724895 1.6806389 1.5724895
x5 2.500000 0.0000000 1.5896071 1.6870959 1.5896071
x6 -385.648660 1092.4017690 -0.0467993 -4.3136900 -0.0467993
x7 NA 11.6434360 0.0470949 0.0000000 0.0470949
x8 NA 6.6627637 -0.0472370 0.0000000 -0.0472370
x9 NA 0.3084692 -0.0473332 0.0000000 -0.0473332
x10 NA -394.6256878 0.0474111 0.0000000 0.0474111
x11 NA 0.0000000 3.7379309 9.8447784 3.7379309
x12 NA 1092.4017690 3.1764261 0.0000000 3.1764261
x13 NA 11.6434360 -0.0476868 -0.0017206 -0.0476868
x14 NA 6.6627637 -0.0477034 0.0000000 -0.0477034
x15 NA 0.3084692 0.0477412 0.0000000 0.0477412
x16 NA -394.6256878 0.0477737 0.0000000 0.0477737
x17 NA 0.0000000 -0.0478016 0.0000000 -0.0478016
x18 NA 1092.4017690 -0.0478271 0.0000000 -0.0478271
x19 NA 11.6434360 0.0478510 0.0000000 0.0478510
x20 NA 6.6627637 0.0478731 0.0000000 0.0478731
x21 NA 0.3084692 0.0478932 0.0000000 0.0478932
x22 NA -394.6256878 0.0479111 0.0000000 0.0479111
x23 NA 0.0000000 -0.0479270 0.0000000 -0.0479270
x24 NA 1092.4017690 0.0479408 0.0000000 0.0479408
x25 NA 11.6434360 -0.0479529 0.0000000 -0.0479529
x26 NA 6.6627637 -0.0479634 0.0000000 -0.0479634
x27 NA 0.3084692 0.0479725 0.0000000 0.0479725
x28 NA -394.6256878 -0.0479803 0.0000000 -0.0479803
x29 NA 0.0000000 -0.0479871 0.0000000 -0.0479871
x30 NA 1092.4017690 -0.0479931 0.0000000 -0.0479931
x31 NA 11.6434360 0.0479984 0.0000000 0.0479984
x32 NA 6.6627637 0.0480032 0.0000000 0.0480032
x33 NA 0.3084692 -0.0480076 0.0000000 -0.0480076
x34 NA -394.6256878 -0.0480119 0.0000000 -0.0480119
x35 NA 0.0000000 -0.0480161 0.0000000 -0.0480161
x36 NA 1092.4017690 0.0480202 0.0000000 0.0480202
x37 NA 11.6434360 -0.0480243 0.0000000 -0.0480243
x38 NA 6.6627637 -0.0480284 0.0000000 -0.0480284
x39 NA 0.3084692 -0.0480324 0.0000000 -0.0480324
x40 NA -394.6256878 0.0480361 0.0000000 0.0480361
x41 NA 0.0000000 -0.0480395 0.0000000 -0.0480395
x42 NA 1092.4017690 0.0480424 0.0000000 0.0480424
x43 NA 11.6434360 -0.0480447 0.0000000 -0.0480447
x44 NA 6.6627637 -0.0480463 0.0000000 -0.0480463
x45 NA 0.3084692 -0.0480470 0.0000000 -0.0480470
x46 NA -394.6256878 -0.0480468 0.0000000 -0.0480468
x47 NA 0.0000000 -0.0480457 0.0000000 -0.0480457
x48 NA 1092.4017690 -0.0480437 0.0000000 -0.0480437
x49 NA 11.6434360 -0.0480408 0.0000000 -0.0480408
x50 NA 6.6627637 -0.0480371 0.0000000 -0.0480371
x51 NA 0.3084692 -0.0480326 0.0000000 -0.0480326
x52 NA -394.6256878 0.0480274 0.0000000 0.0480274
x53 NA 0.0000000 -0.0480217 0.0000000 -0.0480217
x54 NA 1092.4017690 -0.0480156 0.0000000 -0.0480156
x55 NA 11.6434360 0.0480091 0.0000000 0.0480091
x56 NA 6.6627637 -0.0480025 0.0000000 -0.0480025
x57 NA 0.3084692 -0.0479957 0.0000000 -0.0479957
x58 NA -394.6256878 0.0479890 0.0000000 0.0479890
x59 NA 0.0000000 -0.0479823 0.0000000 -0.0479823
x60 NA 1092.4017690 0.0479758 0.0000000 0.0479758
x61 NA 11.6434360 0.0479696 0.0000000 0.0479696
x62 NA 6.6627637 0.0479636 0.0000000 0.0479636
x63 NA 0.3084692 0.0479579 0.0000000 0.0479579
x64 NA -394.6256878 0.0479526 0.0000000 0.0479526
x65 NA 0.0000000 -0.0479476 0.0000000 -0.0479476
x66 NA 1092.4017690 -0.0479430 0.0000000 -0.0479430
x67 NA 11.6434360 0.0479387 0.0000000 0.0479387
x68 NA 6.6627637 -0.0479346 0.0000000 -0.0479346
x69 NA 0.3084692 -0.0479308 0.0000000 -0.0479308
x70 NA -394.6256878 0.0479272 0.0000000 0.0479272
x71 NA 0.0000000 -0.0479237 0.0000000 -0.0479237
x72 NA 1092.4017690 -0.0479203 0.0000000 -0.0479203
x73 NA 11.6434360 0.0479168 0.0000000 0.0479168
x74 NA 6.6627637 0.0479132 0.0000000 0.0479132
x75 NA 0.3084692 -0.0479096 0.0000000 -0.0479096
x76 NA -394.6256878 -0.0479057 0.0000000 -0.0479057
x77 NA 0.0000000 0.0479015 0.0000000 0.0479015
x78 NA 1092.4017690 -0.0478971 0.0000000 -0.0478971
x79 NA 11.6434360 0.0478924 0.0000000 0.0478924
x80 NA 6.6627637 -0.0478873 0.0000000 -0.0478873
x81 NA 0.3084692 0.0478818 0.0000000 0.0478818
x82 NA -394.6256878 -0.0478760 0.0000000 -0.0478760
x83 NA 0.0000000 -0.0478699 0.0000000 -0.0478699
x84 NA 1092.4017690 -0.0478634 0.0000000 -0.0478634
x85 NA 11.6434360 -0.0478565 0.0000000 -0.0478565
x86 NA 6.6627637 0.0478494 0.0000000 0.0478494
x87 NA 0.3084692 -0.0478419 0.0000000 -0.0478419
x88 NA -394.6256878 0.0478342 0.0000000 0.0478342
x89 NA 0.0000000 -0.0478263 0.0000000 -0.0478263
x90 NA 1092.4017690 0.0478182 0.0000000 0.0478182
x91 NA 11.6434360 -0.0478099 0.0000000 -0.0478099
x92 NA 6.6627637 0.0478014 0.0000000 0.0478014
x93 NA 0.3084692 0.0477928 0.0000000 0.0477928
x94 NA -394.6256878 0.0477841 0.0000000 0.0477841
x95 NA 0.0000000 0.0477753 0.0000000 0.0477753
x96 NA 1092.4017690 -0.0477665 0.0000000 -0.0477665
x97 NA 11.6434360 -0.0477575 0.0000000 -0.0477575
x98 NA 6.6627637 0.0477485 0.0000000 0.0477485
x99 NA 0.3084692 -0.0477395 0.0000000 -0.0477395
x100 NA -394.6256878 -0.0477304 0.0000000 -0.0477304

Dari kelima model, diperoleh bahwa tidak semua koefisien diperoleh pada model regresi linier dengan metode Least Square, sedangkan pada best-subset, semua keofisien muncul kecuali yang memiliki nilai 0.000 merupakan koefisien yang bisa dibuang. Pada Ridge Regression dan Elastic Net, semua koefisien memiliki nilai, sedangkan pada LASSO hanya beberapa koefisien yang memiliki nilai, selebihnya bisa dibuang karena bernilai 0.

Setiap model dugaan memiliki nilai R-Suared sebagai berikut:

rsq <- data.frame(method = c("linear", "best subset", "ridge", "lasso", "elastic net"),
                  rsq = c(rsq_lm, rsq_bs, rsq_rg, rsq_ls, rsq_en))
kable(rsq)
method rsq
linear 1.0000000
best subset 1.0000000
ridge 0.9991881
lasso 0.9992810
elastic net 0.9993807
plot(rsq$rsq, xlab='Method', ylab='Adj R2',type='l'); points(which.max(rsq$rsq), rsq$rsq[which.max(rsq$rsq)],pch=19,col='red')

Nilai R-Squared tertinggi dimiliki oleh model Regresi Linier dan Best-Subset, namun pada kenyataannya model ini cukup rumit karena menyertakan semua peubah prediktor yang tersedia. Sehingga, disimpulkan bahwa model terbaik merupakan model LASSO karena memiliki R-Squared yang cukup tinggi, selain modelnya tidak rumit dan interpretable.