Nama anggota kelompok 4:
1. M. Syarlan (F1F023005)
2. A’inun Fauziyah (F1F023017)
3. Zahra Hana Andrea (F1F023027)
4. Chesa Nabilla Azzahra (F1F023040)

1. PERSIAPAN

library(readxl)
## Warning: package 'readxl' was built under R version 4.3.3
library(car)
## Warning: package 'car' was built under R version 4.3.3
## Loading required package: carData
library(lmtest)
## Warning: package 'lmtest' was built under R version 4.3.3
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
library(caret)
## Warning: package 'caret' was built under R version 4.3.3
## Loading required package: ggplot2
## Loading required package: lattice
library(glmnet)
## Warning: package 'glmnet' was built under R version 4.3.3
## Loading required package: Matrix
## Loaded glmnet 4.1-8
library(tidyr)
## 
## Attaching package: 'tidyr'
## The following objects are masked from 'package:Matrix':
## 
##     expand, pack, unpack
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.3.3
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:car':
## 
##     recode
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

2. LOAD DATA

data <- read_excel(
  "D:/SEMESETER 6/KAPITA SELEKTA/house_prices_bridge.xlsx"
)
head(data)
## # A tibble: 6 × 10
##   SalePrice GrLivArea X1stFlrSF X2ndFlrSF TotalBsmtSF GarageArea GarageCars
##       <dbl>     <dbl>     <dbl>     <dbl>       <dbl>      <dbl>      <dbl>
## 1    208500      1710       856       854         856        548          2
## 2    181500      1262      1262         0        1262        460          2
## 3    223500      1786       920       866         920        608          2
## 4    140000      1717       961       756         756        642          3
## 5    250000      2198      1145      1053        1145        836          3
## 6    143000      1362       796       566         796        480          2
## # ℹ 3 more variables: OverallQual <dbl>, YearBuilt <dbl>, YearRemodAdd <dbl>

3. DEFINISI VARIABEL

X <- as.matrix(data[, c(
  "GrLivArea", "X1stFlrSF", "X2ndFlrSF",
  "TotalBsmtSF", "GarageArea", "GarageCars",
  "OverallQual", "YearBuilt", "YearRemodAdd"
)])

Y <- data$SalePrice
n <- length(Y)

# Standardisasi
X_std <- scale(X)
X_design <- cbind(Intercept = 1, X_std)

4. MODEL OLS

model_ols <- lm(SalePrice ~ GrLivArea + X1stFlrSF + X2ndFlrSF +
                  TotalBsmtSF + GarageArea + GarageCars +
                  OverallQual + YearBuilt + YearRemodAdd,
                data = data)

yhat_ols <- predict(model_ols)
coef_ols <- coef(model_ols)

max(vif(model_ols))
## [1] 122.6089
vif(model_ols)
##    GrLivArea    X1stFlrSF    X2ndFlrSF  TotalBsmtSF   GarageArea   GarageCars 
##   122.608908    70.202223    87.143487     3.545633     4.855265     5.178548 
##  OverallQual    YearBuilt YearRemodAdd 
##     2.736800     2.159474     1.730681

5. OBJECTIVE FUNCTION BRIDGE

bridge_objective <- function(beta, X, y, lambda, gamma){
  residual <- y - X %*% beta
  SSE <- sum(residual^2)
  penalty <- lambda * sum(abs(beta[-1])^gamma)
  return(SSE + penalty)
}

6. CROSS VALIDATION FUNCTION

bridge_cv <- function(X, y, lambda, gamma, K = 5){
  
  folds <- createFolds(y, k = K, list = TRUE)
  cv_error <- rep(0, K)
  
  for(i in 1:K){
    
    test_idx  <- folds[[i]]
    train_idx <- setdiff(1:length(y), test_idx)
    
    fit <- optim(
      par = rep(0, ncol(X)),
      fn = bridge_objective,
      X = X[train_idx, ],
      y = y[train_idx],
      lambda = lambda,
      gamma = gamma,
      method = "BFGS"
    )
    
    beta_hat <- fit$par
    y_pred <- X[test_idx, ] %*% beta_hat
    
    cv_error[i] <- mean((y[test_idx] - y_pred)^2)
  }
  
  return(mean(cv_error))
}

7. FUNGSI HITUNG VIF PASCA PENALIZED

hitung_vif_post <- function(beta, X_std, y){
  
  vars <- colnames(X_std)[abs(beta[-1]) > 1e-6]
  
  if(length(vars) < 2){
    return(NA)
  }
  
  df <- as.data.frame(X_std)
  colnames(df) <- colnames(X_std)
  df$Y <- y
  
  model_subset <- lm(
    as.formula(paste("Y ~", paste(vars, collapse="+"))),
    data=df
  )
  
  return(max(vif(model_subset)))
}

8. ESTIMASI SEMUA MODEL

gamma_grid <- c(0.5, 1, 1.5, 2)

results <- data.frame()
coef_list <- list()
pred_list <- list()

############################
# OLS
############################
SSE_ols <- sum((Y - yhat_ols)^2)
df_ols <- length(coef_ols)

results <- rbind(results,
                 data.frame(
                   Model="OLS",
                   Lambda=0,
                   MSE=mean((Y-yhat_ols)^2),
                   RMSE=sqrt(mean((Y-yhat_ols)^2)),
                   R2=summary(model_ols)$r.squared,
                   R2_adj=summary(model_ols)$adj.r.squared,
                   AIC=AIC(model_ols),
                   BIC=BIC(model_ols),
                   VIF=max(vif(model_ols))
                 ))

coef_list[["OLS"]] <- coef_ols
pred_list[["OLS"]] <- yhat_ols

############################
# BRIDGE
############################
for(g in gamma_grid){
  
  cat("Proses Bridge gamma =", g, "\n")
  
  lambda_grid <- seq(0.01,100,length=25)
  cv_vals <- sapply(lambda_grid,
                    function(lam) bridge_cv(X_design,Y,lam,g))
  
  best_lambda <- lambda_grid[which.min(cv_vals)]
  
  fit <- optim(
    par = rep(0,ncol(X_design)),
    fn = bridge_objective,
    X = X_design,
    y = Y,
    lambda = best_lambda,
    gamma = g,
    method="BFGS"
  )
  
  beta_hat <- fit$par
  yhat <- X_design %*% beta_hat
  
  SSE <- sum((Y-yhat)^2)
  df_eff <- sum(abs(beta_hat[-1])>1e-6) + 1
  
  AIC_pen <- n*log(SSE/n) + 2*df_eff
  BIC_pen <- n*log(SSE/n) + log(n)*df_eff
  
  r2 <- 1 - SSE/sum((Y-mean(Y))^2)
  r2_adj <- 1 - ((1-r2)*(n-1)/(n-df_eff))
  
  vif_post <- hitung_vif_post(beta_hat, X_std, Y)
  
  results <- rbind(results,
                   data.frame(
                     Model=paste0("Bridge γ=",g),
                     Lambda=round(best_lambda,4),
                     MSE=mean((Y-yhat)^2),
                     RMSE=sqrt(mean((Y-yhat)^2)),
                     R2=r2,
                     R2_adj=r2_adj,
                     AIC=AIC_pen,
                     BIC=BIC_pen,
                     VIF=vif_post
                   ))
  
  coef_list[[paste0("Bridge_",g)]] <- beta_hat
  pred_list[[paste0("Bridge γ=",g)]] <- yhat
}
## Proses Bridge gamma = 0.5 
## Proses Bridge gamma = 1 
## Proses Bridge gamma = 1.5 
## Proses Bridge gamma = 2
############################
# RIDGE
############################
cv_ridge <- cv.glmnet(X_std,Y,alpha=0)
lambda_ridge <- cv_ridge$lambda.min
ridge_pred <- predict(cv_ridge,X_std,s="lambda.min")
coef_ridge <- as.vector(coef(cv_ridge,s="lambda.min"))

SSE_ridge <- sum((Y-ridge_pred)^2)
df_ridge <- sum(abs(coef_ridge[-1])>1e-6)+1

AIC_ridge <- n*log(SSE_ridge/n)+2*df_ridge
BIC_ridge <- n*log(SSE_ridge/n)+log(n)*df_ridge

r2_ridge <- 1-SSE_ridge/sum((Y-mean(Y))^2)
r2adj_ridge <- 1-((1-r2_ridge)*(n-1)/(n-df_ridge))

results <- rbind(results,
                 data.frame(
                   Model="Ridge",
                   Lambda=round(lambda_ridge,4),
                   MSE=mean((Y-ridge_pred)^2),
                   RMSE=sqrt(mean((Y-ridge_pred)^2)),
                   R2=r2_ridge,
                   R2_adj=r2adj_ridge,
                   AIC=AIC_ridge,
                   BIC=BIC_ridge,
                   VIF=hitung_vif_post(coef_ridge,X_std,Y)
                 ))

coef_list[["Ridge"]] <- coef_ridge
pred_list[["Ridge"]] <- ridge_pred

############################
# LASSO
############################
cv_lasso <- cv.glmnet(X_std,Y,alpha=1)
lambda_lasso <- cv_lasso$lambda.min
lasso_pred <- predict(cv_lasso,X_std,s="lambda.min")
coef_lasso <- as.vector(coef(cv_lasso,s="lambda.min"))

SSE_lasso <- sum((Y-lasso_pred)^2)
df_lasso <- sum(abs(coef_lasso[-1])>1e-6)+1

AIC_lasso <- n*log(SSE_lasso/n)+2*df_lasso
BIC_lasso <- n*log(SSE_lasso/n)+log(n)*df_lasso

r2_lasso <- 1-SSE_lasso/sum((Y-mean(Y))^2)
r2adj_lasso <- 1-((1-r2_lasso)*(n-1)/(n-df_lasso))

results <- rbind(results,
                 data.frame(
                   Model="LASSO",
                   Lambda=round(lambda_lasso,4),
                   MSE=mean((Y-lasso_pred)^2),
                   RMSE=sqrt(mean((Y-lasso_pred)^2)),
                   R2=r2_lasso,
                   R2_adj=r2adj_lasso,
                   AIC=AIC_lasso,
                   BIC=BIC_lasso,
                   VIF=hitung_vif_post(coef_lasso,X_std,Y)
                 ))

coef_list[["LASSO"]] <- coef_lasso
pred_list[["LASSO"]] <- lasso_pred

9. TABEL FINAL

results[sapply(results,is.numeric)] <-
  round(results[sapply(results,is.numeric)],4)

print(results)
##          Model    Lambda        MSE     RMSE     R2 R2_adj      AIC      BIC
## 1          OLS    0.0000 1430838497 37826.43 0.7731 0.7717 34944.33 35002.48
## 2 Bridge γ=0.5   16.6750 1430838497 37826.43 0.7731 0.7717 30799.03 30851.89
## 3   Bridge γ=1   91.6675 1430838497 37826.43 0.7731 0.7717 30799.03 30851.89
## 4 Bridge γ=1.5   41.6725 1430839360 37826.44 0.7731 0.7717 30799.03 30851.89
## 5   Bridge γ=2   58.3375 1432868767 37853.25 0.7728 0.7714 30801.10 30853.96
## 6        Ridge 6281.6035 1436374214 37899.53 0.7722 0.7708 30804.67 30857.53
## 7        LASSO  453.5826 1434089879 37869.38 0.7726 0.7714 30800.34 30847.92
##        VIF
## 1 122.6089
## 2 122.6089
## 3 122.6089
## 4 122.6089
## 5 122.6089
## 6 122.6089
## 7   5.1610

MENAMPILKAN NILAI KOEFISIEN

var_names <- c("Intercept", colnames(X_std))
coef_table <- do.call(cbind, coef_list)
coef_table <- as.data.frame(coef_table)
coef_table$Variable <- var_names
coef_table <- coef_table[, c(ncol(coef_table), 1:(ncol(coef_table)-1))]
coef_table[,-1] <- round(coef_table[,-1], 4)

print(coef_table)
##                  Variable           OLS Bridge_0.5   Bridge_1 Bridge_1.5
## (Intercept)     Intercept -1064422.5895 180921.196 180921.196 180921.196
## GrLivArea       GrLivArea       12.2613   6443.093   6445.258   6763.834
## X1stFlrSF       X1stFlrSF       49.1155  18987.463  18985.834  18745.888
## X2ndFlrSF       X2ndFlrSF       36.0581  15740.367  15738.511  15469.158
## TotalBsmtSF   TotalBsmtSF       21.2048   9302.661   9302.644   9303.965
## GarageArea     GarageArea       18.0058   3849.724   3849.709   3850.723
## GarageCars     GarageCars     9342.3599   6981.686   6981.710   6984.376
## OverallQual   OverallQual    19504.7172  26974.957  26974.966  26970.597
## YearBuilt       YearBuilt      219.0575   6616.173   6616.197   6621.986
## YearRemodAdd YearRemodAdd      281.5872   5813.482   5813.460   5812.789
##                Bridge_2      Ridge      LASSO
## (Intercept)  180921.196 180921.196 180921.196
## GrLivArea     14756.809  15005.942  24696.551
## X1stFlrSF     12341.885  11803.292   5134.090
## X2ndFlrSF      8754.677   8471.892      0.000
## TotalBsmtSF    9739.483   9975.497   9106.692
## GarageArea     4386.472   4783.326   3666.452
## GarageCars     7043.119   7095.080   7201.203
## OverallQual   25343.284  24076.715  27110.760
## YearBuilt      6921.872   7020.704   6734.110
## YearRemodAdd   6119.993   6359.966   5536.105

VIF DETAIL PER VARIABEL

hitung_vif_per_variabel <- function(beta, X_std, y, nama_model){
  
  vars <- colnames(X_std)[abs(beta[-1]) > 1e-6]
  
  if(length(vars) < 2){
    cat("\nModel", nama_model, 
        "hanya memiliki <=1 variabel aktif. VIF tidak dapat dihitung.\n")
    return(NULL)
  }
  
  df <- as.data.frame(X_std)
  colnames(df) <- colnames(X_std)
  df$Y <- y
  
  model_subset <- lm(
    as.formula(paste("Y ~", paste(vars, collapse="+"))),
    data=df
  )
  
  vif_vals <- vif(model_subset)
  
  hasil <- data.frame(
    Model = nama_model,
    Variabel = names(vif_vals),
    VIF = round(as.numeric(vif_vals),4)
  )
  
  return(hasil)
}

############################################################
# MEMBUAT DATA VIF DETAIL SEMUA MODEL
############################################################

vif_all_detail <- data.frame()

# OLS
vif_all_detail <- rbind(
  vif_all_detail,
  data.frame(
    Model = "OLS",
    Variabel = names(vif(model_ols)),
    VIF = as.numeric(vif(model_ols))
  )
)

# Bridge
for(g in gamma_grid){
  vif_temp <- hitung_vif_per_variabel(
    coef_list[[paste0("Bridge_",g)]],
    X_std,
    Y,
    paste0("Bridge γ=",g)
  )
  
  if(!is.null(vif_temp)){
    vif_all_detail <- rbind(vif_all_detail, vif_temp)
  }
}

# Ridge
vif_ridge <- hitung_vif_per_variabel(
  coef_list[["Ridge"]],
  X_std,
  Y,
  "Ridge"
)

if(!is.null(vif_ridge)){
  vif_all_detail <- rbind(vif_all_detail, vif_ridge)
}

# LASSO
vif_lasso <- hitung_vif_per_variabel(
  coef_list[["LASSO"]],
  X_std,
  Y,
  "LASSO"
)

if(!is.null(vif_lasso)){
  vif_all_detail <- rbind(vif_all_detail, vif_lasso)
}

vif_all_detail$VIF <- round(vif_all_detail$VIF,4)

############################################################
# PIVOT TABEL VIF
############################################################

vif_pivot <- vif_all_detail %>%
  pivot_wider(
    names_from = Model,
    values_from = VIF
  )

print(vif_pivot)
## # A tibble: 9 × 8
##   Variabel    OLS `Bridge γ=0.5` `Bridge γ=1` `Bridge γ=1.5` `Bridge γ=2`  Ridge
##   <chr>     <dbl>          <dbl>        <dbl>          <dbl>        <dbl>  <dbl>
## 1 GrLivAr… 123.           123.         123.           123.         123.   123.  
## 2 X1stFlr…  70.2           70.2         70.2           70.2         70.2   70.2 
## 3 X2ndFlr…  87.1           87.1         87.1           87.1         87.1   87.1 
## 4 TotalBs…   3.55           3.55         3.55           3.55         3.55   3.55
## 5 GarageA…   4.86           4.86         4.86           4.86         4.86   4.86
## 6 GarageC…   5.18           5.18         5.18           5.18         5.18   5.18
## 7 Overall…   2.74           2.74         2.74           2.74         2.74   2.74
## 8 YearBui…   2.16           2.16         2.16           2.16         2.16   2.16
## 9 YearRem…   1.73           1.73         1.73           1.73         1.73   1.73
## # ℹ 1 more variable: LASSO <dbl>

10. PLOT AKTUAL VS PREDIKSI

par(mfrow=c(3,3))

for(name in names(pred_list)){
  plot(Y, pred_list[[name]],
       main=name,
       pch=16)
  abline(0,1,col="red",lwd=2)
}

par(mfrow=c(1,1))