library(car)
library(lmtest)
library(sandwich)
library(nlme)
library(splines)
library(caret)
library(ggplot2)
library(knitr)
library(broom)
library(kableExtra)
set.seed(123)
1) Diagnostik lengkap OLS pada data mtcars
———————————————————————–
- Heteroskedastisitas (Breusch–Pagan)
- Multikolinearitas (VIF)
- Pengaruh (Cook’s D)
- Normalitas residual (QQ-plot)
- Laporkan temuan & rekomendasi
———————————————————————–
data(mtcars)
head(mtcars[,c("mpg", "wt")])
## mpg wt
## Mazda RX4 21.0 2.620
## Mazda RX4 Wag 21.0 2.875
## Datsun 710 22.8 2.320
## Hornet 4 Drive 21.4 3.215
## Hornet Sportabout 18.7 3.440
## Valiant 18.1 3.460
m_ols <- lm(mpg ~ wt + hp + disp + cyl, data = mtcars)
kable(tidy(m_ols), caption = "Hasil Regresi Linear OLS (mtcars)")
Hasil Regresi Linear OLS (mtcars)
| (Intercept) |
40.8285367 |
2.7574679 |
14.8065319 |
0.0000000 |
| wt |
-3.8539035 |
1.0154736 |
-3.7951783 |
0.0007589 |
| hp |
-0.0205384 |
0.0121468 |
-1.6908508 |
0.1023791 |
| disp |
0.0115992 |
0.0117268 |
0.9891215 |
0.3313856 |
| cyl |
-1.2933197 |
0.6558768 |
-1.9718944 |
0.0589468 |
(a) Uji Breusch–Pagan untuk heteroskedastisitas
bp_test <- bptest(m_ols)
kable(tidy(bp_test), caption = "Hasil Uji Breusch–Pagan")
Hasil Uji Breusch–Pagan
| 2.762312 |
0.5983558 |
4 |
studentized Breusch-Pagan test |
(b) VIF untuk mengecek multikolinearitas
VIF
vif_tbl <- vif(m_ols)
kable(as.data.frame(t(vif_tbl)), caption = "Tabel VIF untuk Mengecek Multikolinearitas")
Tabel VIF untuk Mengecek Multikolinearitas
| 4.848015 |
3.405982 |
10.37329 |
6.737706 |
(c) Observasi berpengaruh: Cook’s Distance
cooks <- cooks.distance(m_ols)
inf_tbl <- data.frame(obs = names(cooks), cooksD = as.numeric(cooks))
inf_tbl <- inf_tbl[order(-inf_tbl$cooksD), ]
kable(head(inf_tbl, 5), caption = "Top 5 Cook's Distance")
Top 5 Cook’s Distance
| 17 |
Chrysler Imperial |
0.2553628 |
| 31 |
Maserati Bora |
0.1866417 |
| 20 |
Toyota Corolla |
0.1335216 |
| 18 |
Fiat 128 |
0.1020489 |
| 21 |
Toyota Corona |
0.0598308 |
Plot Cook’s D dan leverage
plot(m_ols, which = 4) # Cook's distance plot

influencePlot(m_ols, id = list(method = "identify"), main = "Influence Plot")

(d) Normalitas residual: QQ-plot + Shapiro-Wilk (opsional)
plot(m_ols, which = 2) # QQ-plot

## Shapiro–Wilk Residual Normality (opsional)
shapiro.test(residuals(m_ols))
##
## Shapiro-Wilk normality test
##
## data: residuals(m_ols)
## W = 0.93705, p-value = 0.06177
Catatan & Rekomendasi
- Jika BP-test signifikan → gunakan robust SE (HC3)
atau WLS.
- Jika VIF > 5–10 → pertimbangkan
reduksi/transformasi variabel.
- Jika ada outlier/influential → cek data atau
gunakan model robust.
- Jika residual tidak normal → pertimbangkan
transformasi (mis. log).
2) Robust SE (HC3) vs WLS pada data simulasi
heteroskedastik
———————————————————————–
Bandingkan standard error dan p-value
———————————————————————–
# Simulasi data heteroskedastik
set.seed(123)
n <- 400
x <- runif(n, 0.5, 3)
sigma2 <- 0.5^2 * x^2 # Varian error ∝ x^2
err <- rnorm(n, 0, sqrt(sigma2))
y <- 1 + 2*x + err
sim_df <- data.frame(y, x, w = 1/sigma2)
OLS biasa
m_sim_ols <- lm(y ~ x, data = sim_df)
summary(m_sim_ols)
##
## Call:
## lm(formula = y ~ x, data = sim_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.1350 -0.4566 -0.0147 0.4700 2.8978
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.07165 0.12505 8.57 2.32e-16 ***
## x 1.97592 0.06642 29.75 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.9408 on 398 degrees of freedom
## Multiple R-squared: 0.6898, Adjusted R-squared: 0.689
## F-statistic: 884.9 on 1 and 398 DF, p-value: < 2.2e-16
OLS + Robust SE (HC3)
robust_se <- coeftest(m_sim_ols, vcov. = vcovHC(m_sim_ols, type = "HC3"))
robust_se
##
## t test of coefficients:
##
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.071648 0.108689 9.8598 < 2.2e-16 ***
## x 1.975917 0.077693 25.4323 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
WLS
m_sim_wls <- lm(y ~ x, data = sim_df, weights = w)
summary(m_sim_wls);m_sim_wls<- lm(y ~ x, data = sim_df, weights = w)
##
## Call:
## lm(formula = y ~ x, data = sim_df, weights = w)
##
## Weighted Residuals:
## Min 1Q Median 3Q Max
## -2.87524 -0.63752 0.00532 0.66145 2.55400
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.96110 0.06716 14.31 <2e-16 ***
## x 2.04595 0.05340 38.32 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.9801 on 398 degrees of freedom
## Multiple R-squared: 0.7867, Adjusted R-squared: 0.7862
## F-statistic: 1468 on 1 and 398 DF, p-value: < 2.2e-16
summary(m_sim_wls)
##
## Call:
## lm(formula = y ~ x, data = sim_df, weights = w)
##
## Weighted Residuals:
## Min 1Q Median 3Q Max
## -2.87524 -0.63752 0.00532 0.66145 2.55400
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.96110 0.06716 14.31 <2e-16 ***
## x 2.04595 0.05340 38.32 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.9801 on 398 degrees of freedom
## Multiple R-squared: 0.7867, Adjusted R-squared: 0.7862
## F-statistic: 1468 on 1 and 398 DF, p-value: < 2.2e-16
Bandingkan SE & p-value dalam satu tabel ringkas
cmp <- data.frame(
Model = c("OLS", "OLS + HC3", "WLS"),
SE_x = c(coef(summary(m_sim_ols))["x","Std. Error"],
robust_se["x","Std. Error"],
coef(summary(m_sim_wls))["x","Std. Error"]),
p_x = c(coef(summary(m_sim_ols))["x","Pr(>|t|)"],
robust_se["x","Pr(>|t|)"],
coef(summary(m_sim_wls))["x","Pr(>|t|)"])
)
cmp$Signifikansi <- ifelse(cmp$p_x < 0.05, "Signifikan", "Tidak Signifikan")
cmp$Interpretasi <- c(
"OLS biasa (tidak koreksi heteroskedastisitas)",
"OLS + robust SE (lebih reliabel jika heteroskedastis)",
"WLS (memperhitungkan varian error, efisien jika bobot benar)"
)
cmp %>%
kbl(caption = "Perbandingan Standard Error, p-value, dan Interpretasi (x)", digits = 4) %>%
kable_paper("hover", full_width = F) %>%
column_spec(4,
background = ifelse(cmp$Signifikansi == "Signifikan", "#b2df8a", "#fb9a99"),
color = "black", bold = TRUE)
Perbandingan Standard Error, p-value, dan Interpretasi (x)
|
Model
|
SE_x
|
p_x
|
Signifikansi
|
Interpretasi
|
|
OLS
|
0.0664
|
0
|
Signifikan
|
OLS biasa (tidak koreksi heteroskedastisitas)
|
|
OLS + HC3
|
0.0777
|
0
|
Signifikan
|
OLS + robust SE (lebih reliabel jika heteroskedastis)
|
|
WLS
|
0.0534
|
0
|
Signifikan
|
WLS (memperhitungkan varian error, efisien jika bobot benar)
|
3) Spline vs Polinomial: mpg ~ wt
———————————————————————–
- Bandingkan AIC/BIC
- 5-fold Cross-Validation (RMSE)
- Visualisasi fit
———————————————————————–
# Polinomial derajat 3 (raw=TRUE agar koefisien mudah dibaca)
poly_m3 <- lm(mpg ~ poly(wt, 3, raw = TRUE), data = mtcars)
# Spline B-spline dengan df=4
library(splines)
spline_df4 <- lm(mpg ~ bs(wt, df = 4), data = mtcars)
# ---------------------------
# AIC / BIC
# ---------------------------
AIC_BIC <- rbind(
Poly3 = c(AIC(poly_m3), BIC(poly_m3)),
BS_df4 = c(AIC(spline_df4), BIC(spline_df4))
)
colnames(AIC_BIC) <- c("AIC","BIC")
print(AIC_BIC)
## AIC BIC
## Poly3 160.0365 167.3652
## BS_df4 161.7642 170.5586
# ---------------------------
# 5-fold Cross-Validation
# ---------------------------
library(caret)
set.seed(123)
ctrl <- trainControl(method = "cv", number = 5)
cv_poly <- train(mpg ~ poly(wt, 3, raw = TRUE), data = mtcars,
method = "lm", trControl = ctrl)
cv_spline <- train(mpg ~ bs(wt, df = 4), data = mtcars,
method = "lm", trControl = ctrl)
# Ringkasan hasil CV
cv_cmp <- data.frame(
Model = c("Poly (deg=3)", "B-spline (df=4)"),
RMSE = c(cv_poly$results$RMSE, cv_spline$results$RMSE),
Rsq = c(cv_poly$results$Rsquared, cv_spline$results$Rsquared)
)
print(cv_cmp)
## Model RMSE Rsq
## 1 Poly (deg=3) 2.983168 0.8052949
## 2 B-spline (df=4) 3.079210 0.8253110
# ---------------------------
# Visualisasi
# ---------------------------
library(ggplot2)
wt_seq <- data.frame(wt = seq(min(mtcars$wt), max(mtcars$wt), length.out = 200))
wt_seq$fit_poly <- predict(poly_m3, newdata = wt_seq)
wt_seq$fit_spl <- predict(spline_df4, newdata = wt_seq)
p <- ggplot(mtcars, aes(x = wt, y = mpg)) +
geom_point(color = "black") +
geom_line(data = wt_seq, aes(y = fit_poly), color = "blue", linewidth = 0.7) +
geom_line(data = wt_seq, aes(y = fit_spl), color = "red", linetype = 2, linewidth = 0.7) +
labs(title = "mpg ~ wt : Poly(3) vs B-spline(df=4)",
subtitle = "Garis biru = Poly3; Garis merah putus-putus = Spline",
y = "mpg", x = "wt")
print(p)

4) GLS AR(1): gunakan sinyal musiman x = sin(2 * pi * t /
12)
————————————————————–
- Bandingkan SE OLS vs GLS
- Interpretasikan rho-hat
————————————————————–
# Simulasi data
T <- 120 # 10 tahun data bulanan
t <- 1:T
x <- sin(2 * pi * t / 12)
# Error AR(1)
rho_true <- 0.6
e <- numeric(T)
e[1] <- rnorm(1, 0, 1)
for (i in 2:T) {
e[i] <- rho_true * e[i-1] + rnorm(1, 0, 1)
}
# Respon
y <- 10 + 2*x + e
ts_df <- data.frame(t, y, x)
# ---------------------------
# OLS
# ---------------------------
ols_ts <- lm(y ~ x, data = ts_df)
summary(ols_ts)
##
## Call:
## lm(formula = y ~ x, data = ts_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.5877 -0.8313 -0.0135 0.7255 3.9272
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 10.5535 0.1098 96.14 <2e-16 ***
## x 2.1677 0.1552 13.96 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.202 on 118 degrees of freedom
## Multiple R-squared: 0.623, Adjusted R-squared: 0.6198
## F-statistic: 195 on 1 and 118 DF, p-value: < 2.2e-16
# ---------------------------
# GLS dengan AR(1)
# ---------------------------
library(nlme)
gls_ts <- gls(y ~ x, data = ts_df,
correlation = corAR1(form = ~ t))
summary(gls_ts)
## Generalized least squares fit by REML
## Model: y ~ x
## Data: ts_df
## AIC BIC logLik
## 347.0178 358.1005 -169.5089
##
## Correlation Structure: AR(1)
## Formula: ~t
## Parameter estimate(s):
## Phi
## 0.6186848
##
## Coefficients:
## Value Std.Error t-value p-value
## (Intercept) 10.579103 0.2340985 45.19081 0
## x 2.252921 0.2299342 9.79811 0
##
## Correlation:
## (Intr)
## x -0.006
##
## Standardized residuals:
## Min Q1 Med Q3 Max
## -2.806166876 -0.719723144 0.001562097 0.583375820 3.093204586
##
## Residual standard error: 1.261349
## Degrees of freedom: 120 total; 118 residual
# ---------------------------
# Estimasi rho (AR1)
# ---------------------------
rho_hat <- coef(gls_ts$modelStruct$corStruct, unconstrained = FALSE)
cat("\nEstimasi rho (AR1) dari GLS = ", round(rho_hat, 3), "\n")
##
## Estimasi rho (AR1) dari GLS = 0.619
# ---------------------------
# Bandingkan SE koefisien x
# ---------------------------
SE_x_ols <- coef(summary(ols_ts))["x", "Std. Error"]
SE_x_gls <- summary(gls_ts)$tTable["x", "Std.Error"]
SE_cmp <- data.frame(
Model = c("OLS", "GLS AR(1)"),
SE_x = c(SE_x_ols, SE_x_gls)
)
print(SE_cmp)
## Model SE_x
## 1 OLS 0.1552368
## 2 GLS AR(1) 0.2299342
# --------------------------------------------------------------
# Interpretasi:
# - Rho-hat > 0 → autokorelasi positif pada error.
# - Jika SE(GLS) < SE(OLS), berarti GLS lebih efisien di hadapan AR(1).
# - Koefisien x ≈ amplitudo sinyal musiman sinus 12-bulanan.
# --------------------------------------------------------------