set.seed(1582)
# Ten independent predictors, each drawn from N(mean = 2.5, variance = 1.5)
x1 <- rnorm(1000, 2.5, sqrt(1.5))
x2 <- rnorm(1000, 2.5, sqrt(1.5))
x3 <- rnorm(1000, 2.5, sqrt(1.5))
x4 <- rnorm(1000, 2.5, sqrt(1.5))
x5 <- rnorm(1000, 2.5, sqrt(1.5))
x6 <- rnorm(1000, 2.5, sqrt(1.5))
x7 <- rnorm(1000, 2.5, sqrt(1.5))
x8 <- rnorm(1000, 2.5, sqrt(1.5))
x9 <- rnorm(1000, 2.5, sqrt(1.5))
x10 <- rnorm(1000, 2.5, sqrt(1.5))
# Reusing seed 1582 restarts the same RNG stream, so `a` is the same
# standard-normal draw behind x1, only rescaled; x11 = 1.25*x1 + a is
# therefore an exact linear function of x1 (perfect collinearity)
set.seed(1582)
a <- rnorm(1000, 0, sqrt(1.2))
x11 <- 1.25*x1 + a
# True coefficients: x1-x5 are active (beta = 3), x6-x10 are pure noise
# (beta = 0), and x11 has the largest effect (beta = 5)
b0 <- 1
b1 <- 3; b2 <- 3; b3 <- 3; b4 <- 3; b5 <- 3
b6 <- 0; b7 <- 0; b8 <- 0; b9 <- 0; b10 <- 0
b11 <- 5
# The error term also reuses seed 1582, so e is likewise an exact linear
# function of x1: y below ends up a noiseless linear combination of the predictors
set.seed(1582)
e <- rnorm(1000, 0, sqrt(1.3))
# Assemble the data frame: the response y follows the true linear model plus e
data01 <- data.frame(x1=x1, x2=x2, x3=x3, x4=x4, x5=x5,
                     x6=x6, x7=x7, x8=x8, x9=x9, x10=x10,
                     x11=x11,
                     y=b0+b1*x1+b2*x2+b3*x3+b4*x4+b5*x5+
                       b6*x6+b7*x7+b8*x8+b9*x9+b10*x10+b11*x11+e)
head(data01)
##         x1       x2        x3         x4         x5        x6        x7
## 1 2.693285 1.762560 2.3535134  3.8779682  1.8569311 1.1371476 0.2621351
## 2 2.264971 3.160536 0.8569413  1.4502991  4.6089542 2.3854111 1.6384203
## 3 1.048398 1.541537 3.2408019  2.8003274  5.7362363 5.4866556 2.0765968
## 4 2.628367 3.264328 0.9171354  2.3839689  2.7714423 0.6897054 1.9180326
## 5 3.133178 1.571463 1.7678898 -0.2749999 -0.1815739 3.1068927 4.5009249
## 6 4.180858 1.707175 2.6665232  1.9081290  1.8204561 2.4488011 1.7186883
##         x8        x9       x10        x11        y
## 1 1.869777 2.3550704 2.3031984 3.53948503 56.51013
## 2 3.611768 2.3460464 2.0787794 2.62099660 50.91129
## 3 3.105945 0.6223807 0.6351724 0.01214546 42.81126
## 4 3.458304 3.1707753 2.8823807 3.40027367 54.01660
## 5 2.899300 1.8705880 1.2974981 4.48280463 42.05135
## 6 1.244865 3.8367066 1.7024299 6.72947753 73.06160
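Because every call to set.seed(1582) restarts the same random stream, a and e are just rescaled copies of the standard-normal draws underlying x1, which makes x11 exactly collinear with x1. A quick sanity check (a sketch, assuming the vectors above are still in the workspace):

# x11 and e should both be exact linear functions of x1
cor(x1, x11)                              # 1, up to floating-point error
max(abs(a - sqrt(1.2/1.5) * (x1 - 2.5)))  # ~0
max(abs(e - sqrt(1.3/1.5) * (x1 - 2.5)))  # ~0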
# Save a copy of the simulated data to disk
write.csv(data01, "DATA01.csv")
# Train/test split via Bernoulli draws with 80/20 probabilities
# (the name `idx` avoids masking base::sample)
set.seed(1582)
idx <- sample(c(TRUE, FALSE), nrow(data01), replace=TRUE, prob=c(0.8,0.2))
train <- data01[idx, ]
test <- data01[!idx, ]
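The 80/20 probabilities hold only in expectation, so the realized split sizes are worth a quick check (the caret output further below reports 794 training rows, leaving 206 for testing):

# Realized sizes of the Bernoulli 80/20 split
c(train = nrow(train), test = nrow(test))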
# Ordinary least squares on all predictors
# (the name `fit_ls` avoids masking base::ls)
fit_ls <- lm(y ~ ., data = train)
summary(fit_ls)
## 
## Call:
## lm(formula = y ~ ., data = train)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -1.619e-12 -4.550e-15  1.560e-15  7.390e-15  5.945e-13 
## 
## Coefficients: (1 not defined because of singularities)
##               Estimate Std. Error    t value Pr(>|t|)    
## (Intercept) -1.251e+01  1.525e-14 -8.203e+14   <2e-16 ***
## x1           1.465e+01  1.859e-15  7.884e+15   <2e-16 ***
## x2           3.000e+00  1.814e-15  1.654e+15   <2e-16 ***
## x3           3.000e+00  1.830e-15  1.640e+15   <2e-16 ***
## x4           3.000e+00  1.775e-15  1.690e+15   <2e-16 ***
## x5           3.000e+00  1.837e-15  1.633e+15   <2e-16 ***
## x6           1.787e-15  1.889e-15  9.460e-01    0.345    
## x7           2.825e-15  1.818e-15  1.554e+00    0.121    
## x8           1.904e-15  1.894e-15  1.005e+00    0.315    
## x9          -8.678e-17  1.762e-15 -4.900e-02    0.961    
## x10          5.648e-19  1.832e-15  0.000e+00    1.000    
## x11                 NA         NA         NA       NA    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.248e-14 on 783 degrees of freedom
## Multiple R-squared:      1,  Adjusted R-squared:      1 
## F-statistic: 7.197e+30 on 10 and 783 DF,  p-value: < 2.2e-16
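The x11 row is NA because x11 is an exact linear function of x1, so lm() drops it; its contribution, together with that of e, is folded into the x1 estimate. The algebra checks out against the summary (a sketch; 1.25 + sqrt(1.2/1.5) is the exact slope of x11 on x1):

# Effective x1 coefficient: b1 + b11*(slope of x11 on x1) + (slope of e on x1)
3 + 5 * (1.25 + sqrt(1.2/1.5)) + sqrt(1.3/1.5)  # = 14.65309, matching x1 above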
# Evaluate the OLS fit on the held-out test set
pred_ls <- predict(fit_ls, test)
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
rmse_ls <- RMSE(pred_ls, test$y)
rmse_ls
## [1] 8.591058e-14
library(glmnet)
## Loading required package: Matrix
## Loaded glmnet 4.1-8
# Ridge regression: alpha = 0; column 12 of the data is the response y,
# so train[,-12] is the predictor matrix
fit_ridge <- cv.glmnet(as.matrix(train[,-12]), train$y,
                       family = "gaussian",
                       alpha = 0,
                       nfolds = 10)
plot(fit_ridge)
pred_ridge <- predict(fit_ridge, as.matrix(test[,-12]))
rmse_ridge <- RMSE(pred_ridge, test$y)
rmse_ridge
## [1] 0.9977031
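Note that predict() on a cv.glmnet object evaluates at s = "lambda.1se" by default; the CV-minimizing penalty can be requested explicitly (a sketch):

# Ridge predictions at lambda.min instead of the default lambda.1se
pred_ridge_min <- predict(fit_ridge, as.matrix(test[,-12]), s = "lambda.min")
RMSE(pred_ridge_min, test$y)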
# Lasso: alpha = 1 (glmnet is already loaded above)
fit_lass <- cv.glmnet(as.matrix(train[,-12]), train$y,
                      family = "gaussian",
                      alpha = 1,
                      nfolds = 10)
plot(fit_lass)
pred_lass <- predict(fit_lass, as.matrix(test[,-12]))
rmse_lass <- RMSE(pred_lass, test$y)
rmse_lass
## [1] 0.5491951
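The two penalty values that cv.glmnet stores are easy to inspect; lambda.1se (the default used by predict() and coef()) is the largest lambda whose CV error lies within one standard error of the minimum:

# CV-selected penalties for the lasso fit
c(lambda.min = fit_lass$lambda.min, lambda.1se = fit_lass$lambda.1se)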
# Tune elastic net with caret: default grid over (alpha, lambda), 10-fold CV
cv_10 <- trainControl(method = "cv", number = 10)
hit_elnet <- train(
  y ~ ., data = train,
  method = "glmnet",
  trControl = cv_10
)
hit_elnet
## glmnet
##
## 794 samples
## 11 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 714, 716, 715, 715, 714, 715, ...
## Resampling results across tuning parameters:
##
##   alpha  lambda      RMSE       Rsquared   MAE      
##   0.10   0.03455158  0.5801612  0.9998368  0.4623268
##   0.10   0.34551584  0.5801612  0.9998368  0.4623268
##   0.10   3.45515842  2.3649312  0.9971458  1.8834607
##   0.55   0.03455158  0.5831204  0.9996499  0.4649532
##   0.55   0.34551584  0.5831204  0.9996499  0.4649532
##   0.55   3.45515842  4.8395364  0.9659674  3.8560493
##   1.00   0.03455158  0.5741954  0.9995935  0.4585698
##   1.00   0.34551584  0.7794088  0.9992425  0.6219733
##   1.00   3.45515842  7.7632618  0.8687375  6.1962958
##
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were alpha = 1 and lambda = 0.03455158.
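The winning pair is also stored on the object, and the final caret model can be scored on the same test set (a sketch; `pred_caret` is an illustrative name):

# Best (alpha, lambda) found by the grid search
hit_elnet$bestTune
# Test-set RMSE of caret's final refit
pred_caret <- predict(hit_elnet, test)
RMSE(pred_caret, test$y)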
# Elastic net via cv.glmnet with a fixed mixing parameter alpha = 0.1
fit_elnet <- cv.glmnet(as.matrix(train[,-12]), train$y,
                       family = "gaussian",
                       alpha = 0.1,
                       nfolds = 10)
plot(fit_elnet)
pred_elnet <- predict(fit_elnet, as.matrix(test[,-12]))
rmse_elnet <- RMSE(pred_elnet, test$y)
rmse_elnet
## [1] 0.5639554
# Collect the test-set RMSEs of the four fits
res <- data.frame(Method = c("LS", "Ridge", "Lasso", "Elastic Net"),
                  RMSE = c(rmse_ls, rmse_ridge, rmse_lass, rmse_elnet))
res
##        Method         RMSE
## 1          LS 8.591058e-14
## 2       Ridge 9.977031e-01
## 3       Lasso 5.491951e-01
## 4 Elastic Net 5.639554e-01
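Sorting the table makes the ranking immediate:

# Order methods by test RMSE, best first
res[order(res$RMSE), ]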
The lowest test RMSE among the four models belongs to least squares, followed by the lasso, then elastic net, and finally ridge. The near-zero least-squares RMSE (about 8.6e-14) is an artifact of the simulation: because set.seed(1582) is reused, the error term e is an exact linear function of x1, so y is a noiseless linear combination of the predictors and OLS can reproduce it exactly, while the penalized fits pay a small bias for their shrinkage. More generally, ridge, lasso, and elastic net are better suited to data with a very large number of (or strongly collinear) predictors.
# Side-by-side coefficients; glmnet's coef() defaults to s = "lambda.1se"
coefs <- data.frame(LS = coef(fit_ls),
                    Ridge = as.array(coef(fit_ridge)),
                    Lasso = as.array(coef(fit_lass)),
                    ElNet = as.array(coef(fit_elnet)))
names(coefs) <- c("LS", "Ridge", "Lasso", "Elastic Net")
coefs
##                        LS        Ridge        Lasso Elastic Net
## (Intercept) -1.250771e+01 -0.667577922 -9.734426644   -2.493581
## x1           1.465309e+01  7.013600139 14.400066916    7.209136
## x2           3.000000e+00  2.712300407  2.776112712    2.813031
## x3           3.000000e+00  2.741926487  2.788804422    2.829085
## x4           3.000000e+00  2.739853100  2.789228145    2.828149
## x5           3.000000e+00  2.766033458  2.791921968    2.839879
## x6           1.786934e-15 -0.022038287  0.000000000    0.000000
## x7           2.825401e-15 -0.008324225  0.000000000    0.000000
## x8           1.904124e-15 -0.012092336  0.000000000    0.000000
## x9          -8.677660e-17 -0.041598585  0.000000000    0.000000
## x10          5.647635e-19 -0.023815331  0.000000000    0.000000
## x11                    NA  3.252148164  0.007657115    3.309891
The table above compares the coefficients of the four models: the lasso and elastic net shrink the pure-noise predictors x6-x10 exactly to zero, ridge only pulls them toward zero, and OLS drops x11 entirely (NA) because it is perfectly collinear with x1.