library(car)
library(lmtest) 
library(sandwich) 
library(nlme) 
library(splines) 
library(caret) 
library(ggplot2)
library(knitr)
library(broom)
library(kableExtra)
set.seed(123)

1) Diagnostik lengkap OLS pada data mtcars

———————————————————————–

- Heteroskedastisitas (Breusch–Pagan)

- Multikolinearitas (VIF)

- Pengaruh (Cook’s D)

- Normalitas residual (QQ-plot)

- Laporkan temuan & rekomendasi

———————————————————————–

data(mtcars)
head(mtcars[,c("mpg", "wt")])
##                    mpg    wt
## Mazda RX4         21.0 2.620
## Mazda RX4 Wag     21.0 2.875
## Datsun 710        22.8 2.320
## Hornet 4 Drive    21.4 3.215
## Hornet Sportabout 18.7 3.440
## Valiant           18.1 3.460
m_ols <- lm(mpg ~ wt + hp + disp + cyl, data = mtcars)
kable(tidy(m_ols), caption = "Hasil Regresi Linear OLS (mtcars)")
Hasil Regresi Linear OLS (mtcars)
term estimate std.error statistic p.value
(Intercept) 40.8285367 2.7574679 14.8065319 0.0000000
wt -3.8539035 1.0154736 -3.7951783 0.0007589
hp -0.0205384 0.0121468 -1.6908508 0.1023791
disp 0.0115992 0.0117268 0.9891215 0.3313856
cyl -1.2933197 0.6558768 -1.9718944 0.0589468

(a) Uji Breusch–Pagan untuk heteroskedastisitas

bp_test <- bptest(m_ols)
kable(tidy(bp_test), caption = "Hasil Uji Breusch–Pagan")
Hasil Uji Breusch–Pagan
statistic p.value parameter method
2.762312 0.5983558 4 studentized Breusch-Pagan test

(b) VIF untuk mengecek multikolinearitas

VIF

vif_tbl <- vif(m_ols)
kable(as.data.frame(t(vif_tbl)), caption = "Tabel VIF untuk Mengecek Multikolinearitas")
Tabel VIF untuk Mengecek Multikolinearitas
wt hp disp cyl
4.848015 3.405982 10.37329 6.737706

(c) Observasi berpengaruh: Cook’s Distance

cooks <- cooks.distance(m_ols)
inf_tbl <- data.frame(obs = names(cooks), cooksD = as.numeric(cooks))
inf_tbl <- inf_tbl[order(-inf_tbl$cooksD), ]

kable(head(inf_tbl, 5), caption = "Top 5 Cook's Distance")
Top 5 Cook’s Distance
obs cooksD
17 Chrysler Imperial 0.2553628
31 Maserati Bora 0.1866417
20 Toyota Corolla 0.1335216
18 Fiat 128 0.1020489
21 Toyota Corona 0.0598308

Plot Cook’s D dan leverage

plot(m_ols, which = 4) # Cook's distance plot

influencePlot(m_ols, id = list(method = "identify"), main = "Influence Plot")

(d) Normalitas residual: QQ-plot + Shapiro-Wilk (opsional)

plot(m_ols, which = 2) # QQ-plot

## Shapiro–Wilk Residual Normality (opsional)
shapiro.test(residuals(m_ols))
## 
##  Shapiro-Wilk normality test
## 
## data:  residuals(m_ols)
## W = 0.93705, p-value = 0.06177

Catatan & Rekomendasi

  • Jika BP-test signifikan → gunakan robust SE (HC3) atau WLS.
  • Jika VIF > 5–10 → pertimbangkan reduksi/transformasi variabel.
  • Jika ada outlier/influential → cek data atau gunakan model robust.
  • Jika residual tidak normal → pertimbangkan transformasi (mis. log).

2) Robust SE (HC3) vs WLS pada data simulasi heteroskedastik

———————————————————————–

Bandingkan standard error dan p-value

———————————————————————–

# Simulasi data heteroskedastik
set.seed(123)
n <- 400
x <- runif(n, 0.5, 3)
sigma2 <- 0.5^2 * x^2       # Varian error ∝ x^2
err <- rnorm(n, 0, sqrt(sigma2))
y <- 1 + 2*x + err
sim_df <- data.frame(y, x, w = 1/sigma2)

OLS biasa

m_sim_ols <- lm(y ~ x, data = sim_df)
summary(m_sim_ols)
## 
## Call:
## lm(formula = y ~ x, data = sim_df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.1350 -0.4566 -0.0147  0.4700  2.8978 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.07165    0.12505    8.57 2.32e-16 ***
## x            1.97592    0.06642   29.75  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.9408 on 398 degrees of freedom
## Multiple R-squared:  0.6898, Adjusted R-squared:  0.689 
## F-statistic: 884.9 on 1 and 398 DF,  p-value: < 2.2e-16

OLS + Robust SE (HC3)

robust_se <- coeftest(m_sim_ols, vcov. = vcovHC(m_sim_ols, type = "HC3"))
robust_se
## 
## t test of coefficients:
## 
##             Estimate Std. Error t value  Pr(>|t|)    
## (Intercept) 1.071648   0.108689  9.8598 < 2.2e-16 ***
## x           1.975917   0.077693 25.4323 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

WLS

m_sim_wls <- lm(y ~ x, data = sim_df, weights = w)
summary(m_sim_wls);m_sim_wls<- lm(y ~ x, data = sim_df, weights = w)
## 
## Call:
## lm(formula = y ~ x, data = sim_df, weights = w)
## 
## Weighted Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.87524 -0.63752  0.00532  0.66145  2.55400 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.96110    0.06716   14.31   <2e-16 ***
## x            2.04595    0.05340   38.32   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.9801 on 398 degrees of freedom
## Multiple R-squared:  0.7867, Adjusted R-squared:  0.7862 
## F-statistic:  1468 on 1 and 398 DF,  p-value: < 2.2e-16
summary(m_sim_wls)
## 
## Call:
## lm(formula = y ~ x, data = sim_df, weights = w)
## 
## Weighted Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.87524 -0.63752  0.00532  0.66145  2.55400 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.96110    0.06716   14.31   <2e-16 ***
## x            2.04595    0.05340   38.32   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.9801 on 398 degrees of freedom
## Multiple R-squared:  0.7867, Adjusted R-squared:  0.7862 
## F-statistic:  1468 on 1 and 398 DF,  p-value: < 2.2e-16

Bandingkan SE & p-value dalam satu tabel ringkas

cmp <- data.frame(
  Model = c("OLS", "OLS + HC3", "WLS"),
  SE_x = c(coef(summary(m_sim_ols))["x","Std. Error"],
           robust_se["x","Std. Error"],
           coef(summary(m_sim_wls))["x","Std. Error"]),
  p_x = c(coef(summary(m_sim_ols))["x","Pr(>|t|)"],
          robust_se["x","Pr(>|t|)"],
          coef(summary(m_sim_wls))["x","Pr(>|t|)"])
)

cmp$Signifikansi <- ifelse(cmp$p_x < 0.05, "Signifikan", "Tidak Signifikan")

cmp$Interpretasi <- c(
  "OLS biasa (tidak koreksi heteroskedastisitas)",
  "OLS + robust SE (lebih reliabel jika heteroskedastis)",
  "WLS (memperhitungkan varian error, efisien jika bobot benar)"
)

cmp %>%
  kbl(caption = "Perbandingan Standard Error, p-value, dan Interpretasi (x)", digits = 4) %>%
  kable_paper("hover", full_width = F) %>%
  column_spec(4, 
              background = ifelse(cmp$Signifikansi == "Signifikan", "#b2df8a", "#fb9a99"),
              color = "black", bold = TRUE)
Perbandingan Standard Error, p-value, dan Interpretasi (x)
Model SE_x p_x Signifikansi Interpretasi
OLS 0.0664 0 Signifikan OLS biasa (tidak koreksi heteroskedastisitas)
OLS + HC3 0.0777 0 Signifikan OLS + robust SE (lebih reliabel jika heteroskedastis)
WLS 0.0534 0 Signifikan WLS (memperhitungkan varian error, efisien jika bobot benar)

3) Spline vs Polinomial: mpg ~ wt

———————————————————————–

- Bandingkan AIC/BIC

- 5-fold Cross-Validation (RMSE)

- Visualisasi fit

———————————————————————–

# Polinomial derajat 3 (raw=TRUE agar koefisien mudah dibaca)
poly_m3 <- lm(mpg ~ poly(wt, 3, raw = TRUE), data = mtcars)

# Spline B-spline dengan df=4
library(splines)
spline_df4 <- lm(mpg ~ bs(wt, df = 4), data = mtcars)

# ---------------------------
# AIC / BIC
# ---------------------------
AIC_BIC <- rbind(
  Poly3   = c(AIC(poly_m3),    BIC(poly_m3)),
  BS_df4  = c(AIC(spline_df4), BIC(spline_df4))
)
colnames(AIC_BIC) <- c("AIC","BIC")
print(AIC_BIC)
##             AIC      BIC
## Poly3  160.0365 167.3652
## BS_df4 161.7642 170.5586
# ---------------------------
# 5-fold Cross-Validation
# ---------------------------
library(caret)
set.seed(123)
ctrl <- trainControl(method = "cv", number = 5)

cv_poly <- train(mpg ~ poly(wt, 3, raw = TRUE), data = mtcars,
                 method = "lm", trControl = ctrl)

cv_spline <- train(mpg ~ bs(wt, df = 4), data = mtcars,
                   method = "lm", trControl = ctrl)

# Ringkasan hasil CV
cv_cmp <- data.frame(
  Model = c("Poly (deg=3)", "B-spline (df=4)"),
  RMSE  = c(cv_poly$results$RMSE, cv_spline$results$RMSE),
  Rsq   = c(cv_poly$results$Rsquared, cv_spline$results$Rsquared)
)
print(cv_cmp)
##             Model     RMSE       Rsq
## 1    Poly (deg=3) 2.983168 0.8052949
## 2 B-spline (df=4) 3.079210 0.8253110
# ---------------------------
# Visualisasi
# ---------------------------
library(ggplot2)

wt_seq <- data.frame(wt = seq(min(mtcars$wt), max(mtcars$wt), length.out = 200))
wt_seq$fit_poly <- predict(poly_m3, newdata = wt_seq)
wt_seq$fit_spl  <- predict(spline_df4, newdata = wt_seq)

p <- ggplot(mtcars, aes(x = wt, y = mpg)) +
  geom_point(color = "black") +
  geom_line(data = wt_seq, aes(y = fit_poly), color = "blue", linewidth = 0.7) +
  geom_line(data = wt_seq, aes(y = fit_spl), color = "red", linetype = 2, linewidth = 0.7) +
  labs(title = "mpg ~ wt : Poly(3) vs B-spline(df=4)",
       subtitle = "Garis biru = Poly3; Garis merah putus-putus = Spline",
       y = "mpg", x = "wt")
print(p)

4) GLS AR(1): gunakan sinyal musiman x = sin(2 * pi * t / 12)

————————————————————–

- Bandingkan SE OLS vs GLS

- Interpretasikan rho-hat

————————————————————–

# Simulasi data
T <- 120 # 10 tahun data bulanan
t <- 1:T
x <- sin(2 * pi * t / 12)

# Error AR(1)
rho_true <- 0.6
e <- numeric(T)
e[1] <- rnorm(1, 0, 1)
for (i in 2:T) {
  e[i] <- rho_true * e[i-1] + rnorm(1, 0, 1)
}

# Respon
y <- 10 + 2*x + e
ts_df <- data.frame(t, y, x)

# ---------------------------
# OLS
# ---------------------------
ols_ts <- lm(y ~ x, data = ts_df)
summary(ols_ts)
## 
## Call:
## lm(formula = y ~ x, data = ts_df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.5877 -0.8313 -0.0135  0.7255  3.9272 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  10.5535     0.1098   96.14   <2e-16 ***
## x             2.1677     0.1552   13.96   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.202 on 118 degrees of freedom
## Multiple R-squared:  0.623,  Adjusted R-squared:  0.6198 
## F-statistic:   195 on 1 and 118 DF,  p-value: < 2.2e-16
# ---------------------------
# GLS dengan AR(1)
# ---------------------------
library(nlme)
gls_ts <- gls(y ~ x, data = ts_df,
              correlation = corAR1(form = ~ t))
summary(gls_ts)
## Generalized least squares fit by REML
##   Model: y ~ x 
##   Data: ts_df 
##        AIC      BIC    logLik
##   347.0178 358.1005 -169.5089
## 
## Correlation Structure: AR(1)
##  Formula: ~t 
##  Parameter estimate(s):
##       Phi 
## 0.6186848 
## 
## Coefficients:
##                 Value Std.Error  t-value p-value
## (Intercept) 10.579103 0.2340985 45.19081       0
## x            2.252921 0.2299342  9.79811       0
## 
##  Correlation: 
##   (Intr)
## x -0.006
## 
## Standardized residuals:
##          Min           Q1          Med           Q3          Max 
## -2.806166876 -0.719723144  0.001562097  0.583375820  3.093204586 
## 
## Residual standard error: 1.261349 
## Degrees of freedom: 120 total; 118 residual
# ---------------------------
# Estimasi rho (AR1)
# ---------------------------
rho_hat <- coef(gls_ts$modelStruct$corStruct, unconstrained = FALSE)
cat("\nEstimasi rho (AR1) dari GLS = ", round(rho_hat, 3), "\n")
## 
## Estimasi rho (AR1) dari GLS =  0.619
# ---------------------------
# Bandingkan SE koefisien x
# ---------------------------
SE_x_ols <- coef(summary(ols_ts))["x", "Std. Error"]
SE_x_gls <- summary(gls_ts)$tTable["x", "Std.Error"]

SE_cmp <- data.frame(
  Model = c("OLS", "GLS AR(1)"),
  SE_x  = c(SE_x_ols, SE_x_gls)
)
print(SE_cmp)
##       Model      SE_x
## 1       OLS 0.1552368
## 2 GLS AR(1) 0.2299342
# --------------------------------------------------------------
# Interpretasi:
# - Rho-hat > 0  → autokorelasi positif pada error.
# - Jika SE(GLS) < SE(OLS), berarti GLS lebih efisien di hadapan AR(1).
# - Koefisien x ≈ amplitudo sinyal musiman sinus 12-bulanan.
# --------------------------------------------------------------