Nama anggota kelompok 4:
1. M. Syarlan
(F1F023005)
2. A’inun Fauziyah (F1F023017)
3. Zahra Hana Andrea
(F1F023027)
4. Chesa Nabilla Azzahra (F1F023040)
library(readxl)
## Warning: package 'readxl' was built under R version 4.3.3
library(car)
## Warning: package 'car' was built under R version 4.3.3
## Loading required package: carData
library(lmtest)
## Warning: package 'lmtest' was built under R version 4.3.3
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
library(caret)
## Warning: package 'caret' was built under R version 4.3.3
## Loading required package: ggplot2
## Loading required package: lattice
library(glmnet)
## Warning: package 'glmnet' was built under R version 4.3.3
## Loading required package: Matrix
## Loaded glmnet 4.1-8
library(tidyr)
##
## Attaching package: 'tidyr'
## The following objects are masked from 'package:Matrix':
##
## expand, pack, unpack
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.3.3
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:car':
##
## recode
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
data <- read_excel(
"D:/SEMESETER 6/KAPITA SELEKTA/house_prices_bridge.xlsx"
)
head(data)
## # A tibble: 6 × 10
## SalePrice GrLivArea X1stFlrSF X2ndFlrSF TotalBsmtSF GarageArea GarageCars
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 208500 1710 856 854 856 548 2
## 2 181500 1262 1262 0 1262 460 2
## 3 223500 1786 920 866 920 608 2
## 4 140000 1717 961 756 756 642 3
## 5 250000 2198 1145 1053 1145 836 3
## 6 143000 1362 796 566 796 480 2
## # ℹ 3 more variables: OverallQual <dbl>, YearBuilt <dbl>, YearRemodAdd <dbl>
X <- as.matrix(data[, c(
"GrLivArea", "X1stFlrSF", "X2ndFlrSF",
"TotalBsmtSF", "GarageArea", "GarageCars",
"OverallQual", "YearBuilt", "YearRemodAdd"
)])
Y <- data$SalePrice
n <- length(Y)
# Standardisasi
X_std <- scale(X)
X_design <- cbind(Intercept = 1, X_std)
model_ols <- lm(SalePrice ~ GrLivArea + X1stFlrSF + X2ndFlrSF +
TotalBsmtSF + GarageArea + GarageCars +
OverallQual + YearBuilt + YearRemodAdd,
data = data)
yhat_ols <- predict(model_ols)
coef_ols <- coef(model_ols)
max(vif(model_ols))
## [1] 122.6089
vif(model_ols)
## GrLivArea X1stFlrSF X2ndFlrSF TotalBsmtSF GarageArea GarageCars
## 122.608908 70.202223 87.143487 3.545633 4.855265 5.178548
## OverallQual YearBuilt YearRemodAdd
## 2.736800 2.159474 1.730681
bridge_objective <- function(beta, X, y, lambda, gamma){
residual <- y - X %*% beta
SSE <- sum(residual^2)
penalty <- lambda * sum(abs(beta[-1])^gamma)
return(SSE + penalty)
}
bridge_cv <- function(X, y, lambda, gamma, K = 5){
folds <- createFolds(y, k = K, list = TRUE)
cv_error <- rep(0, K)
for(i in 1:K){
test_idx <- folds[[i]]
train_idx <- setdiff(1:length(y), test_idx)
fit <- optim(
par = rep(0, ncol(X)),
fn = bridge_objective,
X = X[train_idx, ],
y = y[train_idx],
lambda = lambda,
gamma = gamma,
method = "BFGS"
)
beta_hat <- fit$par
y_pred <- X[test_idx, ] %*% beta_hat
cv_error[i] <- mean((y[test_idx] - y_pred)^2)
}
return(mean(cv_error))
}
hitung_vif_post <- function(beta, X_std, y){
vars <- colnames(X_std)[abs(beta[-1]) > 1e-6]
if(length(vars) < 2){
return(NA)
}
df <- as.data.frame(X_std)
colnames(df) <- colnames(X_std)
df$Y <- y
model_subset <- lm(
as.formula(paste("Y ~", paste(vars, collapse="+"))),
data=df
)
return(max(vif(model_subset)))
}
gamma_grid <- c(0.5, 1, 1.5, 2)
results <- data.frame()
coef_list <- list()
pred_list <- list()
############################
# OLS
############################
SSE_ols <- sum((Y - yhat_ols)^2)
df_ols <- length(coef_ols)
results <- rbind(results,
data.frame(
Model="OLS",
Lambda=0,
MSE=mean((Y-yhat_ols)^2),
RMSE=sqrt(mean((Y-yhat_ols)^2)),
R2=summary(model_ols)$r.squared,
R2_adj=summary(model_ols)$adj.r.squared,
AIC=AIC(model_ols),
BIC=BIC(model_ols),
VIF=max(vif(model_ols))
))
coef_list[["OLS"]] <- coef_ols
pred_list[["OLS"]] <- yhat_ols
############################
# BRIDGE
############################
for(g in gamma_grid){
cat("Proses Bridge gamma =", g, "\n")
lambda_grid <- seq(0.01,100,length=25)
cv_vals <- sapply(lambda_grid,
function(lam) bridge_cv(X_design,Y,lam,g))
best_lambda <- lambda_grid[which.min(cv_vals)]
fit <- optim(
par = rep(0,ncol(X_design)),
fn = bridge_objective,
X = X_design,
y = Y,
lambda = best_lambda,
gamma = g,
method="BFGS"
)
beta_hat <- fit$par
yhat <- X_design %*% beta_hat
SSE <- sum((Y-yhat)^2)
df_eff <- sum(abs(beta_hat[-1])>1e-6) + 1
AIC_pen <- n*log(SSE/n) + 2*df_eff
BIC_pen <- n*log(SSE/n) + log(n)*df_eff
r2 <- 1 - SSE/sum((Y-mean(Y))^2)
r2_adj <- 1 - ((1-r2)*(n-1)/(n-df_eff))
vif_post <- hitung_vif_post(beta_hat, X_std, Y)
results <- rbind(results,
data.frame(
Model=paste0("Bridge γ=",g),
Lambda=round(best_lambda,4),
MSE=mean((Y-yhat)^2),
RMSE=sqrt(mean((Y-yhat)^2)),
R2=r2,
R2_adj=r2_adj,
AIC=AIC_pen,
BIC=BIC_pen,
VIF=vif_post
))
coef_list[[paste0("Bridge_",g)]] <- beta_hat
pred_list[[paste0("Bridge γ=",g)]] <- yhat
}
## Proses Bridge gamma = 0.5
## Proses Bridge gamma = 1
## Proses Bridge gamma = 1.5
## Proses Bridge gamma = 2
############################
# RIDGE
############################
cv_ridge <- cv.glmnet(X_std,Y,alpha=0)
lambda_ridge <- cv_ridge$lambda.min
ridge_pred <- predict(cv_ridge,X_std,s="lambda.min")
coef_ridge <- as.vector(coef(cv_ridge,s="lambda.min"))
SSE_ridge <- sum((Y-ridge_pred)^2)
df_ridge <- sum(abs(coef_ridge[-1])>1e-6)+1
AIC_ridge <- n*log(SSE_ridge/n)+2*df_ridge
BIC_ridge <- n*log(SSE_ridge/n)+log(n)*df_ridge
r2_ridge <- 1-SSE_ridge/sum((Y-mean(Y))^2)
r2adj_ridge <- 1-((1-r2_ridge)*(n-1)/(n-df_ridge))
results <- rbind(results,
data.frame(
Model="Ridge",
Lambda=round(lambda_ridge,4),
MSE=mean((Y-ridge_pred)^2),
RMSE=sqrt(mean((Y-ridge_pred)^2)),
R2=r2_ridge,
R2_adj=r2adj_ridge,
AIC=AIC_ridge,
BIC=BIC_ridge,
VIF=hitung_vif_post(coef_ridge,X_std,Y)
))
coef_list[["Ridge"]] <- coef_ridge
pred_list[["Ridge"]] <- ridge_pred
############################
# LASSO
############################
cv_lasso <- cv.glmnet(X_std,Y,alpha=1)
lambda_lasso <- cv_lasso$lambda.min
lasso_pred <- predict(cv_lasso,X_std,s="lambda.min")
coef_lasso <- as.vector(coef(cv_lasso,s="lambda.min"))
SSE_lasso <- sum((Y-lasso_pred)^2)
df_lasso <- sum(abs(coef_lasso[-1])>1e-6)+1
AIC_lasso <- n*log(SSE_lasso/n)+2*df_lasso
BIC_lasso <- n*log(SSE_lasso/n)+log(n)*df_lasso
r2_lasso <- 1-SSE_lasso/sum((Y-mean(Y))^2)
r2adj_lasso <- 1-((1-r2_lasso)*(n-1)/(n-df_lasso))
results <- rbind(results,
data.frame(
Model="LASSO",
Lambda=round(lambda_lasso,4),
MSE=mean((Y-lasso_pred)^2),
RMSE=sqrt(mean((Y-lasso_pred)^2)),
R2=r2_lasso,
R2_adj=r2adj_lasso,
AIC=AIC_lasso,
BIC=BIC_lasso,
VIF=hitung_vif_post(coef_lasso,X_std,Y)
))
coef_list[["LASSO"]] <- coef_lasso
pred_list[["LASSO"]] <- lasso_pred
results[sapply(results,is.numeric)] <-
round(results[sapply(results,is.numeric)],4)
print(results)
## Model Lambda MSE RMSE R2 R2_adj AIC BIC
## 1 OLS 0.0000 1430838497 37826.43 0.7731 0.7717 34944.33 35002.48
## 2 Bridge γ=0.5 16.6750 1430838497 37826.43 0.7731 0.7717 30799.03 30851.89
## 3 Bridge γ=1 91.6675 1430838497 37826.43 0.7731 0.7717 30799.03 30851.89
## 4 Bridge γ=1.5 41.6725 1430839360 37826.44 0.7731 0.7717 30799.03 30851.89
## 5 Bridge γ=2 58.3375 1432868767 37853.25 0.7728 0.7714 30801.10 30853.96
## 6 Ridge 6281.6035 1436374214 37899.53 0.7722 0.7708 30804.67 30857.53
## 7 LASSO 453.5826 1434089879 37869.38 0.7726 0.7714 30800.34 30847.92
## VIF
## 1 122.6089
## 2 122.6089
## 3 122.6089
## 4 122.6089
## 5 122.6089
## 6 122.6089
## 7 5.1610
var_names <- c("Intercept", colnames(X_std))
coef_table <- do.call(cbind, coef_list)
coef_table <- as.data.frame(coef_table)
coef_table$Variable <- var_names
coef_table <- coef_table[, c(ncol(coef_table), 1:(ncol(coef_table)-1))]
coef_table[,-1] <- round(coef_table[,-1], 4)
print(coef_table)
## Variable OLS Bridge_0.5 Bridge_1 Bridge_1.5
## (Intercept) Intercept -1064422.5895 180921.196 180921.196 180921.196
## GrLivArea GrLivArea 12.2613 6443.093 6445.258 6763.834
## X1stFlrSF X1stFlrSF 49.1155 18987.463 18985.834 18745.888
## X2ndFlrSF X2ndFlrSF 36.0581 15740.367 15738.511 15469.158
## TotalBsmtSF TotalBsmtSF 21.2048 9302.661 9302.644 9303.965
## GarageArea GarageArea 18.0058 3849.724 3849.709 3850.723
## GarageCars GarageCars 9342.3599 6981.686 6981.710 6984.376
## OverallQual OverallQual 19504.7172 26974.957 26974.966 26970.597
## YearBuilt YearBuilt 219.0575 6616.173 6616.197 6621.986
## YearRemodAdd YearRemodAdd 281.5872 5813.482 5813.460 5812.789
## Bridge_2 Ridge LASSO
## (Intercept) 180921.196 180921.196 180921.196
## GrLivArea 14756.809 15005.942 24696.551
## X1stFlrSF 12341.885 11803.292 5134.090
## X2ndFlrSF 8754.677 8471.892 0.000
## TotalBsmtSF 9739.483 9975.497 9106.692
## GarageArea 4386.472 4783.326 3666.452
## GarageCars 7043.119 7095.080 7201.203
## OverallQual 25343.284 24076.715 27110.760
## YearBuilt 6921.872 7020.704 6734.110
## YearRemodAdd 6119.993 6359.966 5536.105
hitung_vif_per_variabel <- function(beta, X_std, y, nama_model){
vars <- colnames(X_std)[abs(beta[-1]) > 1e-6]
if(length(vars) < 2){
cat("\nModel", nama_model,
"hanya memiliki <=1 variabel aktif. VIF tidak dapat dihitung.\n")
return(NULL)
}
df <- as.data.frame(X_std)
colnames(df) <- colnames(X_std)
df$Y <- y
model_subset <- lm(
as.formula(paste("Y ~", paste(vars, collapse="+"))),
data=df
)
vif_vals <- vif(model_subset)
hasil <- data.frame(
Model = nama_model,
Variabel = names(vif_vals),
VIF = round(as.numeric(vif_vals),4)
)
return(hasil)
}
############################################################
# MEMBUAT DATA VIF DETAIL SEMUA MODEL
############################################################
vif_all_detail <- data.frame()
# OLS
vif_all_detail <- rbind(
vif_all_detail,
data.frame(
Model = "OLS",
Variabel = names(vif(model_ols)),
VIF = as.numeric(vif(model_ols))
)
)
# Bridge
for(g in gamma_grid){
vif_temp <- hitung_vif_per_variabel(
coef_list[[paste0("Bridge_",g)]],
X_std,
Y,
paste0("Bridge γ=",g)
)
if(!is.null(vif_temp)){
vif_all_detail <- rbind(vif_all_detail, vif_temp)
}
}
# Ridge
vif_ridge <- hitung_vif_per_variabel(
coef_list[["Ridge"]],
X_std,
Y,
"Ridge"
)
if(!is.null(vif_ridge)){
vif_all_detail <- rbind(vif_all_detail, vif_ridge)
}
# LASSO
vif_lasso <- hitung_vif_per_variabel(
coef_list[["LASSO"]],
X_std,
Y,
"LASSO"
)
if(!is.null(vif_lasso)){
vif_all_detail <- rbind(vif_all_detail, vif_lasso)
}
vif_all_detail$VIF <- round(vif_all_detail$VIF,4)
############################################################
# PIVOT TABEL VIF
############################################################
vif_pivot <- vif_all_detail %>%
pivot_wider(
names_from = Model,
values_from = VIF
)
print(vif_pivot)
## # A tibble: 9 × 8
## Variabel OLS `Bridge γ=0.5` `Bridge γ=1` `Bridge γ=1.5` `Bridge γ=2` Ridge
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 GrLivAr… 123. 123. 123. 123. 123. 123.
## 2 X1stFlr… 70.2 70.2 70.2 70.2 70.2 70.2
## 3 X2ndFlr… 87.1 87.1 87.1 87.1 87.1 87.1
## 4 TotalBs… 3.55 3.55 3.55 3.55 3.55 3.55
## 5 GarageA… 4.86 4.86 4.86 4.86 4.86 4.86
## 6 GarageC… 5.18 5.18 5.18 5.18 5.18 5.18
## 7 Overall… 2.74 2.74 2.74 2.74 2.74 2.74
## 8 YearBui… 2.16 2.16 2.16 2.16 2.16 2.16
## 9 YearRem… 1.73 1.73 1.73 1.73 1.73 1.73
## # ℹ 1 more variable: LASSO <dbl>
par(mfrow=c(3,3))
for(name in names(pred_list)){
plot(Y, pred_list[[name]],
main=name,
pch=16)
abline(0,1,col="red",lwd=2)
}
par(mfrow=c(1,1))