The model equation for each observation:
\[ Y_i = \beta_0 + \beta_1 X_{i} + \varepsilon_i, \qquad i=1,\ldots,n \]
where \(\beta_0\) is the intercept, \(\beta_1\) the slope, and \(\varepsilon_i\) an error term (assumed i.i.d. with mean zero).
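For reference, the least-squares estimates of these parameters have the familiar closed form:
\[ \hat{\beta}_1 = \frac{\sum_{i=1}^{n} (X_i - \bar{X})(Y_i - \bar{Y})}{\sum_{i=1}^{n} (X_i - \bar{X})^2}, \qquad \hat{\beta}_0 = \bar{Y} - \hat{\beta}_1 \bar{X} \]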
Data
We use the mtcars dataset to examine the relationship between mpg (miles per gallon) and wt (vehicle weight).
data(mtcars)
head(mtcars[, c("mpg", "wt")])
## mpg wt
## Mazda RX4 21.0 2.620
## Mazda RX4 Wag 21.0 2.875
## Datsun 710 22.8 2.320
## Hornet 4 Drive 21.4 3.215
## Hornet Sportabout 18.7 3.440
## Valiant 18.1 3.460
Correlation
cor.test(mtcars$wt, mtcars$mpg)
##
## Pearson's product-moment correlation
##
## data: mtcars$wt and mtcars$mpg
## t = -9.559, df = 30, p-value = 1.294e-10
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.9338264 -0.7440872
## sample estimates:
## cor
## -0.8676594
Regression
model <- lm(mpg ~ wt, data = mtcars)
summary(model)
##
## Call:
## lm(formula = mpg ~ wt, data = mtcars)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.5432 -2.3647 -0.1252 1.4096 6.8727
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 37.2851 1.8776 19.858 < 2e-16 ***
## wt -5.3445 0.5591 -9.559 1.29e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.046 on 30 degrees of freedom
## Multiple R-squared: 0.7528, Adjusted R-squared: 0.7446
## F-statistic: 91.38 on 1 and 30 DF, p-value: 1.294e-10
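Consistency check: in simple linear regression, the Multiple R-squared equals the squared Pearson correlation between the two variables, which ties the two outputs above together.
cor(mtcars$wt, mtcars$mpg)^2 # (-0.8676594)^2 ≈ 0.7528, matching Multiple R-squared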
Visualization
library(ggplot2)
ggplot(mtcars, aes(x = wt, y = mpg)) +
  geom_point() +
  geom_smooth(method = "lm", se = TRUE, color = "blue") +
  labs(title = "Relationship between Vehicle Weight (wt) and Fuel Efficiency (mpg)",
       x = "Vehicle Weight (1000 lbs)",
       y = "Miles per Gallon (mpg)")
library(dplyr)
dat1 <- mtcars %>%
  as_tibble(rownames = "car") %>%
  mutate(across(c(mpg, disp, hp, wt, qsec), as.double))
# Initial relationship plot
GG1 <- ggplot(dat1, aes(wt, mpg)) +
  geom_point(alpha = 0.7) +
  geom_smooth(method = "lm", se = FALSE) +
  labs(x = "Weight (1000 lbs)", y = "Miles per Gallon (mpg)",
       title = "mpg vs wt in mtcars")
GG1
fit1 <- lm(mpg ~ wt + hp + disp, data = dat1)
summary(fit1)
##
## Call:
## lm(formula = mpg ~ wt + hp + disp, data = dat1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.891 -1.640 -0.172 1.061 5.861
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 37.105505 2.110815 17.579 < 2e-16 ***
## wt -3.800891 1.066191 -3.565 0.00133 **
## hp -0.031157 0.011436 -2.724 0.01097 *
## disp -0.000937 0.010350 -0.091 0.92851
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.639 on 28 degrees of freedom
## Multiple R-squared: 0.8268, Adjusted R-squared: 0.8083
## F-statistic: 44.57 on 3 and 28 DF, p-value: 8.65e-11
library(broom)
# Coefficients with 95% CI
tidy(fit1, conf.int = TRUE)
## # A tibble: 4 × 7
## term estimate std.error statistic p.value conf.low conf.high
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 37.1 2.11 17.6 1.16e-16 32.8 41.4
## 2 wt -3.80 1.07 -3.56 1.33e- 3 -5.98 -1.62
## 3 hp -0.0312 0.0114 -2.72 1.10e- 2 -0.0546 -0.00773
## 4 disp -0.000937 0.0103 -0.0905 9.29e- 1 -0.0221 0.0203
diag_df <- tibble(
  fitted = fitted(fit1), resid = resid(fit1),
  stdres = rstandard(fit1), hat = hatvalues(fit1), cook = cooks.distance(fit1)
)
p1 <- ggplot(diag_df, aes(fitted, resid)) +
  geom_point(alpha = 0.7) +
  geom_hline(yintercept = 0, linetype = 2) +
  labs(x = "Fitted", y = "Residuals")
p2 <- ggplot(diag_df, aes(sample = stdres)) +
  stat_qq() + stat_qq_line() + labs(x = "Theoretical", y = "Standardized Residuals")
p1; p2
# qplot() is deprecated; the equivalent ggplot() call:
ggplot(diag_df, aes(seq_along(cook), cook)) +
  geom_line() + geom_point() +
  labs(x = "Index", y = "Cook's D")
car::vif(fit1)
## wt hp disp
## 4.844618 2.736633 7.324517
coeftest(fit1)
##
## t test of coefficients:
##
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 37.10550527 2.11081525 17.5788 < 2.2e-16 ***
## wt -3.80089058 1.06619064 -3.5649 0.001331 **
## hp -0.03115655 0.01143579 -2.7245 0.010971 *
## disp -0.00093701 0.01034974 -0.0905 0.928507
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
coeftest(fit1, vcov. = vcovHC(fit1, type = "HC3")) # robust SEs (HC3)
##
## t test of coefficients:
##
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 37.10550527 2.71026981 13.6907 6.257e-14 ***
## wt -3.80089058 1.08086207 -3.5165 0.00151 **
## hp -0.03115655 0.01437457 -2.1675 0.03886 *
## disp -0.00093701 0.01012138 -0.0926 0.92690
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
library(splines)
fit_poly <- lm(mpg ~ poly(wt, 2, raw = TRUE) + hp + disp, data = dat1)
fit_spl <- lm(mpg ~ bs(wt, df = 5) + hp + disp, data = dat1)
AIC(fit1, fit_poly, fit_spl) %>% arrange(AIC)
## df AIC
## fit_poly 6 150.7305
## fit_spl 9 154.3323
## fit1 5 158.6430
BIC(fit1, fit_poly, fit_spl) %>% arrange(BIC)
## df BIC
## fit_poly 6 159.5250
## fit1 5 165.9717
## fit_spl 9 167.5239
# Prediction curves
grid_wt <- tibble(wt = seq(min(dat1$wt), max(dat1$wt), length.out = 200),
                  hp = mean(dat1$hp), disp = mean(dat1$disp))
pred <- grid_wt %>%
  mutate(OLS = predict(fit1, newdata = grid_wt),
         Poly = predict(fit_poly, newdata = grid_wt),
         Spline = predict(fit_spl, newdata = grid_wt)) %>%
  pivot_longer(OLS:Spline, names_to = "model", values_to = "yhat")
GG2 <- ggplot() +
  geom_point(data = dat1, aes(wt, mpg), alpha = 0.3) +
  geom_line(data = pred, aes(wt, yhat, linetype = model), linewidth = 0.9) +
  labs(x = "wt", y = "mpg", title = "OLS vs Polynomial vs Spline")
GG2
library(modelr)
library(purrr)
k <- 5
cv_res <- crossv_kfold(dat1, k = k)
rmse_fun <- function(train, test, formula){
  m <- lm(formula, data = as.data.frame(train))
  sqrt(mean((as.data.frame(test)$mpg - predict(m, newdata = as.data.frame(test)))^2))
}
forms <- list(
  OLS = mpg ~ wt + hp + disp,
  Poly = mpg ~ poly(wt, 2, raw = TRUE) + hp + disp,
  Spline = mpg ~ bs(wt, df = 5) + hp + disp
)
cv_tbl <- map_dfr(names(forms), function(nm){
  fml <- forms[[nm]]
  tibble(model = nm, fold = seq_len(k),
         rmse = map2_dbl(cv_res$train, cv_res$test, ~ rmse_fun(.x, .y, fml)))
})
cv_tbl %>% group_by(model) %>% summarise(mean_rmse = mean(rmse), sd_rmse = sd(rmse)) %>% arrange(mean_rmse)
## # A tibble: 3 × 3
## model mean_rmse sd_rmse
## <chr> <dbl> <dbl>
## 1 Poly 2.35 0.980
## 2 Spline 2.74 0.807
## 3 OLS 2.92 1.01
n <- 400
x1 <- runif(n, 0, 10)
x2 <- rnorm(n, 5, 2)
sigma_i <- 0.6 + 0.2 * x1 # variance increases with x1
err <- rnorm(n, 0, sigma_i)
y <- 2 + 1.5*x1 - 1.2*x2 + err
sim <- tibble(y, x1, x2, w = 1/sigma_i^2)
fit_ols <- lm(y ~ x1 + x2, data = sim)
fit_wls <- lm(y ~ x1 + x2, data = sim, weights = w)
# SE comparison table
comp_se <- tibble(
  term = names(coef(fit_ols)),
  OLS_SE = coef(summary(fit_ols))[, "Std. Error"],
  Robust_SE = sqrt(diag(vcovHC(fit_ols, type = "HC3"))),
  WLS_SE = coef(summary(fit_wls))[, "Std. Error"]
)
comp_se
## # A tibble: 3 × 4
## term OLS_SE Robust_SE WLS_SE
## <chr> <dbl> <dbl> <dbl>
## 1 (Intercept) 0.250 0.212 0.175
## 2 x1 0.0293 0.0282 0.0261
## 3 x2 0.0415 0.0399 0.0313
# Breusch-Pagan test for heteroskedasticity
lmtest::bptest(fit_ols)
##
## studentized Breusch-Pagan test
##
## data: fit_ols
## BP = 29.415, df = 2, p-value = 4.098e-07
ggplot(data.frame(fitted = fitted(fit_ols), resid = resid(fit_ols)), aes(fitted, resid)) +
  geom_point(alpha = .6) + geom_smooth(se = FALSE, method = "loess") +
  geom_hline(yintercept = 0, linetype = 2) + labs(x = "Fitted", y = "Residuals")
set.seed(42)
T <- 400
x <- runif(T,-2, 2)
rho <- 0.6
u <- rnorm(T, 0, 1)
err <- numeric(T)
for (t in 2:T) err[t] <- rho*err[t-1] + u[t]
y <- 1 + 2*x + err
dar1 <- tibble::tibble(t = 1:T, x, y)
head(dar1)
## # A tibble: 6 × 3
## t x y
## <int> <dbl> <dbl>
## 1 1 1.66 4.32
## 2 2 1.75 4.83
## 3 3 -0.855 0.661
## 4 4 1.32 6.53
## 5 5 0.567 2.49
## 6 6 0.0764 0.213
library(nlme)
# Plain OLS
fit_ols_ar1 <- lm(y ~ x, data = dar1)
# GLS with AR(1) correlation in the residuals
fit_gls_ar1 <- gls(y ~ x, data = dar1, correlation = corAR1(form = ~ t))
summary(fit_ols_ar1)
##
## Call:
## lm(formula = y ~ x, data = dar1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.3360 -0.8231 0.0026 0.8211 3.9013
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.95441 0.06043 15.79 <2e-16 ***
## x 2.06045 0.05162 39.92 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.209 on 398 degrees of freedom
## Multiple R-squared: 0.8002, Adjusted R-squared: 0.7997
## F-statistic: 1594 on 1 and 398 DF, p-value: < 2.2e-16
summary(fit_gls_ar1)
## Generalized least squares fit by REML
## Model: y ~ x
## Data: dar1
## AIC BIC logLik
## 1139.541 1155.487 -565.7706
##
## Correlation Structure: AR(1)
## Formula: ~t
## Parameter estimate(s):
## Phi
## 0.5772332
##
## Coefficients:
## Value Std.Error t-value p-value
## (Intercept) 0.9552097 0.11651869 8.19791 0
## x 2.0449090 0.03743358 54.62766 0
##
## Correlation:
## (Intr)
## x 0.004
##
## Standardized residuals:
## Min Q1 Med Q3 Max
## -2.752477543 -0.674750151 -0.007333444 0.679546648 3.225627947
##
## Residual standard error: 1.210605
## Degrees of freedom: 400 total; 398 residual
par(mfrow = c(1,2))
acf(resid(fit_ols_ar1), main = "ACF Residuals (OLS)")
acf(resid(fit_gls_ar1, type = "normalized"), main = "ACF Residuals (GLS)")
par(mfrow = c(1,1))
lmtest::dwtest(fit_ols_ar1) # signals serial correlation in the OLS residuals
##
## Durbin-Watson test
##
## data: fit_ols_ar1
## DW = 0.84993, p-value < 2.2e-16
## alternative hypothesis: true autocorrelation is greater than 0
# Estimate rho by regressing resid_t on resid_{t-1}
res <- resid(fit_ols_ar1)
rho_hat <- coef(lm(res[-1] ~ 0 + res[-length(res)]))[1]
# Quasi-differencing (data transformation)
y_star <- dar1$y[-1] - rho_hat*dar1$y[-T]
x_star <- dar1$x[-1] - rho_hat*dar1$x[-T]
fit_qd <- lm(y_star ~ x_star)
summary(fit_qd)
##
## Call:
## lm(formula = y_star ~ x_star)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.6691 -0.6723 -0.0232 0.6970 3.2363
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.40603 0.04955 8.194 3.52e-15 ***
## x_star 2.04499 0.03757 54.439 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.9898 on 397 degrees of freedom
## Multiple R-squared: 0.8819, Adjusted R-squared: 0.8816
## F-statistic: 2964 on 1 and 397 DF, p-value: < 2.2e-16
Note: gls() estimates \(\rho\) and the variance simultaneously within a likelihood framework, so it is often more stable than the two-step approach.
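To make that comparison concrete, here is a minimal sketch of the iterated two-step (Cochrane–Orcutt style) procedure; cochrane_orcutt is an illustrative helper written for this note, not a function from the packages used above.
cochrane_orcutt <- function(y, x, tol = 1e-8, max_iter = 100) {
  n <- length(y)
  b <- coef(lm(y ~ x)) # start from OLS
  rho <- 0
  for (i in seq_len(max_iter)) {
    e <- y - b[1] - b[2] * x # residuals on the original scale
    rho_new <- sum(e[-1] * e[-n]) / sum(e[-n]^2)
    ys <- y[-1] - rho_new * y[-n] # quasi-differenced data
    xs <- x[-1] - rho_new * x[-n]
    fit <- lm(ys ~ xs)
    b <- c(coef(fit)[1] / (1 - rho_new), # undo the intercept scaling
           coef(fit)[2])
    if (abs(rho_new - rho) < tol) break
    rho <- rho_new
  }
  list(beta = b, rho = rho_new, iterations = i)
}
cochrane_orcutt(dar1$y, dar1$x)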
set.seed(123)
# Required packages
pkgs <- c("dplyr","ggplot2","car","lmtest","sandwich","MASS","boot","broom")
new <- pkgs[!(pkgs %in% installed.packages()[, "Package"])]
if (length(new)) install.packages(new, dependencies = TRUE)
invisible(lapply(pkgs, library, character.only = TRUE)) # attach quietly
1. Data Simulation
Goal: generate data with 3 predictors (X1, X2, X3), injected with mild multicollinearity (X1 & X2 correlated) and heteroskedasticity (error variance grows when |X1| is large), plus a few outliers.
n <- 200
X1 <- rnorm(n, 0, 1)
# X2 correlated about 0.7 with X1
rho <- 0.7
X2 <- rho*X1 + sqrt(1 - rho^2)*rnorm(n)
X3 <- rnorm(n)
# Heteroskedasticity: the error sd depends on |X1|
eps <- rnorm(n, sd = 0.5 + 0.5*abs(X1))
# True model (ground truth): y = 2 + 1.5*X1 - 0.8*X2 + 0.5*X3 + eps
y <- 2 + 1.5*X1 - 0.8*X2 + 0.5*X3 + eps
# Add a few outliers to the response
idx_out <- sample(1:n, 3)
y[idx_out] <- y[idx_out] + rnorm(3, mean = 6, sd = 1)
dat <- data.frame(y, X1, X2, X3)
summary(dat)
## y X1 X2 X3
## Min. :-3.194 Min. :-2.30917 Min. :-2.36565 Min. :-2.80977
## 1st Qu.: 1.157 1st Qu.:-0.62576 1st Qu.:-0.68926 1st Qu.:-0.55753
## Median : 2.040 Median :-0.05874 Median : 0.03268 Median : 0.07583
## Mean : 2.066 Mean :-0.00857 Mean : 0.02408 Mean : 0.03178
## 3rd Qu.: 2.895 3rd Qu.: 0.56840 3rd Qu.: 0.69036 3rd Qu.: 0.68098
## Max. : 9.427 Max. : 3.24104 Max. : 3.00049 Max. : 2.43023
2. Parameter Estimation (OLS)
mod0 <- lm(y ~ X1 + X2 + X3, data = dat)
summary(mod0)
##
## Call:
## lm(formula = y ~ X1 + X2 + X3, data = dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.3070 -0.6446 -0.0414 0.5284 7.1821
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.07742 0.08726 23.806 < 2e-16 ***
## X1 1.31536 0.12466 10.551 < 2e-16 ***
## X2 -0.75833 0.12303 -6.164 3.97e-09 ***
## X3 0.57737 0.09070 6.366 1.35e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.232 on 196 degrees of freedom
## Multiple R-squared: 0.4404, Adjusted R-squared: 0.4318
## F-statistic: 51.41 on 3 and 196 DF, p-value: < 2.2e-16
broom::glance(mod0) # concise summary: R^2, Adj R^2, AIC, etc.
## # A tibble: 1 × 12
## r.squared adj.r.squared sigma statistic p.value df logLik AIC BIC
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.440 0.432 1.23 51.4 1.47e-24 3 -324. 657. 674.
## # ℹ 3 more variables: deviance <dbl>, df.residual <int>, nobs <int>
Quick interpretation note: each coefficient is the average change in y for a one-unit increase in that predictor (holding the other predictors constant). See Pr(>|t|) for significance.
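A quick numerical check of that statement (nd is a hypothetical prediction grid, not from the original code): moving X1 from 0 to 1 while holding X2 and X3 at 0 shifts the prediction by exactly the X1 coefficient.
nd <- data.frame(X1 = c(0, 1), X2 = 0, X3 = 0)
diff(predict(mod0, newdata = nd)) # equals coef(mod0)["X1"], about 1.315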
3. Hypothesis Tests
3.1 Overall (F) test: already provided in the summary(mod0) output (overall model significance).
3.2 Partial t-tests (per coefficient): also in summary(mod0).
3.3 Joint hypothesis test (e.g., that the X2 and X3 coefficients are both zero):
car::linearHypothesis(mod0, c("X2 = 0", "X3 = 0"))
##
## Linear hypothesis test:
## X2 = 0
## X3 = 0
##
## Model 1: restricted model
## Model 2: y ~ X1 + X2 + X3
##
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 198 423.38
## 2 196 297.59 2 125.79 41.423 9.897e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
3.4 Tests with robust SEs (to cope with heteroskedasticity)
# Coefficients + robust SEs (HC3)
coeftest(mod0, vcov. = sandwich::vcovHC(mod0, type = "HC3"))
##
## t test of coefficients:
##
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.077419 0.087811 23.6578 < 2.2e-16 ***
## X1 1.315357 0.163042 8.0676 7.027e-14 ***
## X2 -0.758334 0.123626 -6.1341 4.644e-09 ***
## X3 0.577370 0.107728 5.3595 2.330e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
4. Assumption Checks
4.1 Multicollinearity
# Variance Inflation Factor (VIF)
car::vif(mod0)
## X1 X2 X3
## 1.811923 1.816606 1.003660
# Condition number (optional): kappa > 30 suggests a problem
kappa(model.matrix(mod0))
## [1] 2.263679
Rule of thumb: VIF > 10 (or > 5) indicates growing collinearity.
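To see where VIF comes from, a short sketch computing it by hand from the auxiliary regression of X1 on the remaining predictors:
# VIF_j = 1 / (1 - R_j^2), with R_j^2 from regressing predictor j on the others
r2_aux <- summary(lm(X1 ~ X2 + X3, data = dat))$r.squared
1 / (1 - r2_aux) # should match car::vif(mod0)["X1"], about 1.81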
4.2 Heteroskedasticity
# Breusch-Pagan
lmtest::bptest(mod0)
##
## studentized Breusch-Pagan test
##
## data: mod0
## BP = 1.0283, df = 3, p-value = 0.7944
# White-style test (via bptest with the fitted values and their squares)
lmtest::bptest(mod0, ~ fitted(mod0) + I(fitted(mod0)^2))
##
## studentized Breusch-Pagan test
##
## data: mod0
## BP = 5.3586, df = 2, p-value = 0.06861
If significant, use robust SEs (see 3.4) for more reliable inference.
4.3 Residual Normality
res <- rstandard(mod0)
shapiro.test(res) # sensitive for large n; also inspect the QQ-plot
##
## Shapiro-Wilk normality test
##
## data: res
## W = 0.87436, p-value = 7.729e-12
qqnorm(res); qqline(res)
4.4 Residual Autocorrelation (optional; usually for time-series data)
lmtest::dwtest(mod0)
##
## Durbin-Watson test
##
## data: mod0
## DW = 2.0342, p-value = 0.5998
## alternative hypothesis: true autocorrelation is greater than 0
4.5 Specification Test (linearity): Ramsey RESET
lmtest::resettest(mod0)
##
## RESET test
##
## data: mod0
## RESET = 4.2735, df1 = 2, df2 = 194, p-value = 0.01527
4.6 Standard Diagnostic Plots
par(mfrow=c(2,2))
plot(mod0)
par(mfrow=c(1,1))
5. Model Selection
Strategy: start from candidate models (including simple transformations), then use AIC-based stepwise selection, compare against the initial model, and validate via CV.
dat2 <- dat %>%
  mutate(X1_sq = X1^2, X2_sq = X2^2, X3_sq = X3^2,
         X1X2 = X1*X2, X1X3 = X1*X3, X2X3 = X2*X3)
mod_full <- lm(y ~ X1 + X2 + X3 + X1_sq + X2_sq + X3_sq + X1X2 + X1X3 + X2X3, data = dat2)
AIC(mod0); AIC(mod_full)
## [1] 657.0574
## [1] 660.7653
# AIC-based stepwise selection (both directions)
mod_step <- MASS::stepAIC(mod_full, direction = "both", trace = FALSE)
summary(mod_step)
##
## Call:
## lm(formula = y ~ X1 + X2 + X3 + X2_sq + X1X2, data = dat2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.2286 -0.6853 -0.0298 0.4825 7.0242
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.19439 0.10487 20.925 < 2e-16 ***
## X1 1.31032 0.12513 10.472 < 2e-16 ***
## X2 -0.72799 0.12278 -5.929 1.37e-08 ***
## X3 0.57331 0.08996 6.373 1.32e-09 ***
## X2_sq -0.28501 0.11895 -2.396 0.0175 *
## X1X2 0.23671 0.13546 1.748 0.0821 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.22 on 194 degrees of freedom
## Multiple R-squared: 0.4571, Adjusted R-squared: 0.4431
## F-statistic: 32.67 on 5 and 194 DF, p-value: < 2.2e-16
broom::glance(mod_step)[, c("r.squared","adj.r.squared","AIC","BIC")]
## # A tibble: 1 × 4
## r.squared adj.r.squared AIC BIC
## <dbl> <dbl> <dbl> <dbl>
## 1 0.457 0.443 655. 678.
Cross-validation (10-fold CV) to compare RMSE:
# CV RMSE helper using boot::cv.glm
# Note: cv.glm() expects a glm fit (lm objects do not store $y by default,
# which makes the raw CV estimate come out NaN), so refit with glm().
cv_rmse <- function(model, data, K = 10){
  # cv.glm returns delta: [1] raw CV estimate, [2] adjusted CV estimate
  set.seed(123)
  gfit <- glm(formula(model), data = data)
  res <- boot::cv.glm(data = data, glmfit = gfit, K = K)
  sqrt(res$delta[1])
}
rmse_mod0 <- cv_rmse(mod0, dat)
rmse_modstep <- cv_rmse(mod_step, dat2)
c(RMSE_mod0 = rmse_mod0, RMSE_modStep = rmse_modstep)
Choose the model with the best trade-off between simplicity and performance (RMSE, AIC/BIC, adjusted R-squared).
6. Outliers & Influential Points
Use studentized residuals, leverage (hat values), and Cook's distance. Add a Bonferroni outlier test.
infl <- influence.measures(mod0)
# Influence summary
summary(infl)
## Potentially influential observations of
## lm(formula = y ~ X1 + X2 + X3, data = dat) :
##
## dfb.1_ dfb.X1 dfb.X2 dfb.X3 dffit cov.r cook.d hat
## 16 -0.21 -0.06 -0.32 0.54 -0.78_* 0.94 0.15 0.07_*
## 18 0.30 -0.30 -0.24 0.09 0.74_* 0.75_* 0.13 0.03
## 27 0.00 0.00 0.00 0.01 0.01 1.07_* 0.00 0.04
## 56 -0.05 -0.05 0.01 0.11 -0.14 1.07_* 0.00 0.06
## 65 -0.14 0.36 -0.34 0.17 -0.45_* 0.97 0.05 0.04
## 90 0.31 0.25 0.04 -0.12 0.50_* 0.72_* 0.06 0.01
## 97 0.00 0.00 -0.01 0.00 -0.01 1.08_* 0.00 0.05
## 135 -0.03 0.07 -0.01 -0.08 -0.12 1.07_* 0.00 0.05
## 147 0.19 -0.21 -0.01 -0.25 0.42 0.92_* 0.04 0.03
## 149 0.23 0.54 -0.20 0.60 0.86_* 0.89_* 0.18 0.07_*
## 160 -0.02 0.04 -0.04 0.01 -0.05 1.06_* 0.00 0.04
## 162 0.43 -0.56 0.28 0.26 0.78_* 0.48_* 0.13 0.01
## 164 -0.20 -0.38 -0.25 -0.35 -0.87_* 0.95 0.18 0.08_*
## 191 0.03 0.02 -0.02 -0.08 0.08 1.07_* 0.00 0.05
# Diagnostic statistics
studres <- rstudent(mod0)
lev <- hatvalues(mod0)
cookd <- cooks.distance(mod0)
# Simple thresholds
thr_res <- 3 # |studentized residual| > 3
thr_lev <- 2*length(coef(mod0))/nrow(dat) # high leverage
thr_cook <- 4/nrow(dat) # large Cook's distance
flag <- data.frame(
id = 1:nrow(dat),
studres = studres,
leverage = lev,
cookd = cookd
) %>%
mutate(
flag_res = abs(studres) > thr_res,
flag_lev = leverage > thr_lev,
flag_cook = cookd > thr_cook,
any_flag = flag_res | flag_lev | flag_cook
) %>%
arrange(desc(any_flag), desc(abs(studres)))
head(flag, 10)
## id studres leverage cookd flag_res flag_lev flag_cook any_flag
## 162 162 6.450812 0.01441863 0.12607179 TRUE FALSE TRUE TRUE
## 90 90 4.302679 0.01343365 0.05785187 TRUE FALSE TRUE TRUE
## 18 18 4.188058 0.03056365 0.12748735 TRUE FALSE TRUE TRUE
## 149 149 3.243249 0.06634585 0.17821039 TRUE TRUE TRUE TRUE
## 164 164 -2.855691 0.08454000 0.18164105 FALSE TRUE TRUE TRUE
## 16 16 -2.803489 0.07241620 0.14821120 FALSE TRUE TRUE TRUE
## 147 147 2.584024 0.02573283 0.04284913 FALSE FALSE TRUE TRUE
## 143 143 2.200166 0.03420638 0.04203837 FALSE FALSE TRUE TRUE
## 30 30 2.172800 0.03120105 0.03730329 FALSE FALSE TRUE TRUE
## 65 65 -2.140175 0.04228760 0.04965406 FALSE TRUE TRUE TRUE
Bonferroni outlier test (car):
car::outlierTest(mod0) # tests studentized residuals for outliers with Bonferroni correction
## rstudent unadjusted p-value Bonferroni p
## 162 6.450812 8.5823e-10 1.7165e-07
## 90 4.302679 2.6674e-05 5.3348e-03
## 18 4.188058 4.2583e-05 8.5167e-03
Quick plot of Cook's distance:
plot(cookd, type = "h", main = "Cook's Distance", ylab = "D", xlab = "Observation")
abline(h = thr_cook, lty = 2)
Follow-up (optional):
- Try refitting without the extreme influential observations and compare coefficients/fit.
- Consider robust regression (an M-estimator, MASS::rlm) if there are many outliers; see the sketch after the refit comparison below.
# Refit without the highly influential points (example)
to_drop <- which(cookd > thr_cook | abs(studres) > thr_res)
mod_refit <- lm(y ~ X1 + X2 + X3, data = dat[-to_drop, ])
broom::tidy(mod0)
## # A tibble: 4 × 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 2.08 0.0873 23.8 9.73e-60
## 2 X1 1.32 0.125 10.6 6.73e-21
## 3 X2 -0.758 0.123 -6.16 3.97e- 9
## 4 X3 0.577 0.0907 6.37 1.35e- 9
broom::tidy(mod_refit)
## # A tibble: 4 × 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 1.99 0.0566 35.1 2.69e-83
## 2 X1 1.44 0.0861 16.7 1.11e-38
## 3 X2 -0.676 0.0819 -8.26 2.87e-14
## 4 X3 0.501 0.0606 8.27 2.75e-14
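As referenced in the follow-up list, a robust-regression sketch with MASS::rlm (M-estimation, Huber weights by default); it keeps every observation but downweights the outliers, so its coefficients typically land between those of mod0 and mod_refit.
mod_rob <- MASS::rlm(y ~ X1 + X2 + X3, data = dat)
summary(mod_rob)$coefficients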
Answer:
Import Data
library(car)
str(mtcars)
## 'data.frame': 32 obs. of 11 variables:
## $ mpg : num 21 21 22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 ...
## $ cyl : num 6 6 4 6 8 6 8 4 4 6 ...
## $ disp: num 160 160 108 258 360 ...
## $ hp : num 110 110 93 110 175 105 245 62 95 123 ...
## $ drat: num 3.9 3.9 3.85 3.08 3.15 2.76 3.21 3.69 3.92 3.92 ...
## $ wt : num 2.62 2.88 2.32 3.21 3.44 ...
## $ qsec: num 16.5 17 18.6 19.4 17 ...
## $ vs : num 0 0 1 1 0 1 0 1 1 1 ...
## $ am : num 1 1 1 0 0 0 0 0 0 0 ...
## $ gear: num 4 4 4 3 3 3 3 4 4 4 ...
## $ carb: num 4 4 1 1 2 1 4 2 2 4 ...
Data Exploration
library(dplyr)
library(tidyr)
library(ggplot2)
# Build a long data frame with only the numeric variables
dat_long <- mtcars %>%
  dplyr::select(-vs, -am) %>% # drop the categorical variables
  pivot_longer(-mpg, names_to = "variabel", values_to = "nilai")
# Plot
ggplot(dat_long, aes(x = nilai, y = mpg)) +
  geom_point(alpha = 0.7) +
  geom_smooth(method = "lm", se = FALSE, color = "blue") +
  facet_wrap(~ variabel, scales = "free_x") +
  labs(y = "Miles per Gallon (mpg)",
       title = "mpg versus the numeric variables in mtcars")
Interpretation:
There is a negative relationship between carb and mpg.
There is a negative relationship between cyl and mpg.
There is a negative relationship between disp and mpg.
There is a positive relationship between drat and mpg.
There is a positive relationship between gear and mpg.
There is a negative relationship between hp and mpg.
There is a positive relationship between qsec and mpg.
There is a negative relationship between wt and mpg.
# Categorical variables in mtcars
vars_kat <- c("vs", "am")
# Reshape to long format
dat_kat <- mtcars %>%
  pivot_longer(all_of(vars_kat), names_to = "variabel", values_to = "kategori")
# Boxplots for each categorical variable
ggplot(dat_kat, aes(x = factor(kategori), y = mpg)) +
  geom_boxplot(fill = "skyblue", alpha = 0.6) +
  facet_wrap(~ variabel, scales = "free_x") +
  labs(x = "Category", y = "Miles per Gallon (mpg)",
       title = "Boxplots of mpg by the categorical variables in mtcars")
Interpretation:
Cars with am = 1 tend to have higher mpg than those with am = 0.
carb = 1 and carb = 2 tend to have higher mpg than the other categories.
The larger cyl is, the lower mpg is.
gear = 4 tends to have higher mpg than gear = 3 or gear = 5.
vs = 1 has higher mpg than vs = 0.
OLS Estimation & Summary
m1 <- lm(mpg ~ cyl+disp+hp+drat+wt+qsec+factor(vs)+factor(am)+gear+carb, data = mtcars)
summary(m1)
##
## Call:
## lm(formula = mpg ~ cyl + disp + hp + drat + wt + qsec + factor(vs) +
## factor(am) + gear + carb, data = mtcars)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.4506 -1.6044 -0.1196 1.2193 4.6271
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 12.30337 18.71788 0.657 0.5181
## cyl -0.11144 1.04502 -0.107 0.9161
## disp 0.01334 0.01786 0.747 0.4635
## hp -0.02148 0.02177 -0.987 0.3350
## drat 0.78711 1.63537 0.481 0.6353
## wt -3.71530 1.89441 -1.961 0.0633 .
## qsec 0.82104 0.73084 1.123 0.2739
## factor(vs)1 0.31776 2.10451 0.151 0.8814
## factor(am)1 2.52023 2.05665 1.225 0.2340
## gear 0.65541 1.49326 0.439 0.6652
## carb -0.19942 0.82875 -0.241 0.8122
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.65 on 21 degrees of freedom
## Multiple R-squared: 0.869, Adjusted R-squared: 0.8066
## F-statistic: 13.93 on 10 and 21 DF, p-value: 3.793e-07
tidy(m1, conf.int = TRUE)
## # A tibble: 11 × 7
## term estimate std.error statistic p.value conf.low conf.high
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 12.3 18.7 0.657 0.518 -26.6 51.2
## 2 cyl -0.111 1.05 -0.107 0.916 -2.28 2.06
## 3 disp 0.0133 0.0179 0.747 0.463 -0.0238 0.0505
## 4 hp -0.0215 0.0218 -0.987 0.335 -0.0668 0.0238
## 5 drat 0.787 1.64 0.481 0.635 -2.61 4.19
## 6 wt -3.72 1.89 -1.96 0.0633 -7.65 0.224
## 7 qsec 0.821 0.731 1.12 0.274 -0.699 2.34
## 8 factor(vs)1 0.318 2.10 0.151 0.881 -4.06 4.69
## 9 factor(am)1 2.52 2.06 1.23 0.234 -1.76 6.80
## 10 gear 0.655 1.49 0.439 0.665 -2.45 3.76
## 11 carb -0.199 0.829 -0.241 0.812 -1.92 1.52
1.1. Heteroskedasticity (Breusch–Pagan)
H0: no heteroskedasticity (constant/homoskedastic residual variance)
H1: heteroskedasticity present (non-constant residual variance)
bp <- bptest(m1)
bp
##
## studentized Breusch-Pagan test
##
## data: m1
## BP = 14.914, df = 10, p-value = 0.1352
## Since the p-value (0.135) is greater than or equal to 0.05, H0 is not rejected: there is no heteroskedasticity.
1.2. Multicollinearity (VIF)
vif_val <- vif(m1)
vif_tbl <- tibble(
variable = names(vif_val),
VIF = as.numeric(vif_val)
) %>%
arrange(desc(VIF))
vif_tbl
## # A tibble: 10 × 2
## variable VIF
## <chr> <dbl>
## 1 disp 21.6
## 2 cyl 15.4
## 3 wt 15.2
## 4 hp 9.83
## 5 carb 7.91
## 6 qsec 7.53
## 7 gear 5.36
## 8 factor(vs) 4.97
## 9 factor(am) 4.65
## 10 drat 3.37
Interpretation: large VIF values (≳ 10) indicate strong multicollinearity. The variables disp, cyl, and wt have VIF > 10, indicating strong multicollinearity, so the variable disp will be dropped first.
m2 <- lm(mpg ~ . -disp, data = mtcars)
vif_val2 <- vif(m2)
vif_tbl2 <- tibble(
variable = names(vif_val2),
VIF = as.numeric(vif_val2)
) %>%
arrange(desc(VIF))
vif_tbl2
## # A tibble: 9 × 2
## variable VIF
## <chr> <dbl>
## 1 cyl 14.3
## 2 hp 7.12
## 3 qsec 6.91
## 4 wt 6.19
## 5 gear 5.32
## 6 vs 4.92
## 7 am 4.65
## 8 carb 4.31
## 9 drat 3.33
Interpretation: after dropping disp, the variable cyl still has VIF > 10, again indicating strong multicollinearity, so cyl will be dropped as well.
m3 <- lm(mpg ~ . -disp -cyl, data = mtcars)
vif_val3 <- vif(m3)
vif_tbl3 <- tibble(
variable = names(vif_val3),
VIF = as.numeric(vif_val3)
) %>%
arrange(desc(VIF))
vif_tbl3
## # A tibble: 8 × 2
## variable VIF
## <chr> <dbl>
## 1 wt 6.05
## 2 hp 6.02
## 3 qsec 5.92
## 4 gear 4.69
## 5 carb 4.29
## 6 am 4.29
## 7 vs 4.27
## 8 drat 3.11
Interpretation: after dropping cyl, all variables have VIF < 10, so the model is free of multicollinearity.
1.3. Influence (Cook's D)
cookd <- cooks.distance(m3)
thr_cook <- 4/nrow(mtcars)
flag <- data.frame(
  id = 1:nrow(mtcars),
  car = rownames(mtcars), # car names live in the rownames; mtcars has no "car" column
  cookd = cookd
) %>%
  mutate(flag_cook = cookd > thr_cook) %>%
  arrange(desc(cookd))
head(flag, 10)
##                   id               car      cookd flag_cook
## Ford Pantera L    29    Ford Pantera L 0.35091992      TRUE
## Merc 230           9          Merc 230 0.33030351      TRUE
## Lotus Europa      28      Lotus Europa 0.16327114      TRUE
## Chrysler Imperial 17 Chrysler Imperial 0.14434739      TRUE
## Toyota Corolla    20    Toyota Corolla 0.10934157     FALSE
## Maserati Bora     31     Maserati Bora 0.10783468     FALSE
## Datsun 710         3        Datsun 710 0.07329257     FALSE
## Fiat 128          18          Fiat 128 0.07048882     FALSE
## Volvo 142E        32        Volvo 142E 0.06851912     FALSE
## Valiant            6           Valiant 0.04680887     FALSE
plot(cookd, type = "h", main = "Cook's Distance", ylab = "D", xlab = "Observation")
abline(h = thr_cook, lty = 2, col = "red")
Interpretation: Ford Pantera L, Merc 230, Lotus Europa, and Chrysler Imperial have high Cook's distances, so they influence the regression model. These influential points should be examined further or handled with robust regression if needed.
# Model without the influential points
mtcars_no_out <- mtcars[-c(9, 17, 28, 29), ]
m4 <- lm(mpg ~ . -disp -cyl, data = mtcars_no_out)
summary(m4)
##
## Call:
## lm(formula = mpg ~ . - disp - cyl, data = mtcars_no_out)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.4398 -1.5273 -0.0027 1.1378 4.2436
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -18.25532 21.70599 -0.841 0.4108
## hp 0.01297 0.01969 0.659 0.5179
## drat 2.40851 1.65495 1.455 0.1619
## wt -3.60201 1.28552 -2.802 0.0114 *
## qsec 2.03098 1.03338 1.965 0.0642 .
## vs -3.14638 2.40718 -1.307 0.2068
## am 1.10451 1.85774 0.595 0.5592
## gear 1.74121 1.57344 1.107 0.2823
## carb -0.81296 0.69539 -1.169 0.2568
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.299 on 19 degrees of freedom
## Multiple R-squared: 0.8959, Adjusted R-squared: 0.852
## F-statistic: 20.43 on 8 and 19 DF, p-value: 8.687e-08
# Compare coefficients
coef(m3)
## (Intercept) hp drat wt qsec vs
## 13.80810376 -0.01225158 0.88893522 -2.60967758 0.63983256 0.08786399
## am gear carb
## 2.42417670 0.69389707 -0.61286048
coef(m4)
## (Intercept) hp drat wt qsec vs
## -18.25531997 0.01297237 2.40850571 -3.60200564 2.03098426 -3.14638244
## am gear carb
## 1.10450897 1.74121199 -0.81295692
Interpretation: because the coefficients change drastically (some relationships even flip sign), the model is strongly driven by those points. Check whether those data entries are correct, then consider robust regression.
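A sketch of that robust alternative for this model (M-estimation via MASS::rlm, which keeps all cars but downweights the flagged ones):
m_rob <- MASS::rlm(mpg ~ . - disp - cyl, data = mtcars)
summary(m_rob)$coefficients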
1.4. Normality (QQ-plot)
plot(m3, which=2, id.n = 0)
Interpretation: the QQ plot shows that most residuals follow the diagonal line, so the residual-normality assumption is satisfied.
summary(m3)
##
## Call:
## lm(formula = mpg ~ . - disp - cyl, data = mtcars)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.8187 -1.3903 -0.3045 1.2269 4.5183
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 13.80810 12.88582 1.072 0.2950
## hp -0.01225 0.01649 -0.743 0.4650
## drat 0.88894 1.52061 0.585 0.5645
## wt -2.60968 1.15878 -2.252 0.0342 *
## qsec 0.63983 0.62752 1.020 0.3185
## vs 0.08786 1.88992 0.046 0.9633
## am 2.42418 1.91227 1.268 0.2176
## gear 0.69390 1.35294 0.513 0.6129
## carb -0.61286 0.59109 -1.037 0.3106
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.566 on 23 degrees of freedom
## Multiple R-squared: 0.8655, Adjusted R-squared: 0.8187
## F-statistic: 18.5 on 8 and 23 DF, p-value: 2.627e-08
library(dplyr)
library(sandwich)
library(lmtest)
set.seed(123)
n <- 400
x1 <- runif(n, 0, 10)
x2 <- rnorm(n, 5, 2)
sigma_i <- 0.6 + 0.2 * x1 # variance increases with x1
err <- rnorm(n, 0, sigma_i)
y <- 2 + 1.5*x1 - 1.2*x2 + err
sim <- tibble(y, x1, x2, w = 1/sigma_i^2)
fit_ols <- lm(y ~ x1 + x2, data = sim)
fit_wls <- lm(y ~ x1 + x2, data = sim, weights = w)
# --- compare standard errors & p-values
comp <- tibble(
term = names(coef(fit_ols)),
OLS_SE = coef(summary(fit_ols))[, "Std. Error"],
Robust_SE = sqrt(diag(vcovHC(fit_ols, type = "HC3"))),
WLS_SE = coef(summary(fit_wls))[, "Std. Error"],
OLS_p = coef(summary(fit_ols))[, "Pr(>|t|)"],
Robust_p = coeftest(fit_ols, vcovHC(fit_ols, type = "HC3"))[, 4],
WLS_p = coef(summary(fit_wls))[, "Pr(>|t|)"]
)
comp
## # A tibble: 3 × 7
## term OLS_SE Robust_SE WLS_SE OLS_p Robust_p WLS_p
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 0.281 0.267 0.191 5.07e- 11 5.65e- 12 4.04e- 18
## 2 x1 0.0300 0.0313 0.0267 1.13e-175 1.12e-169 3.75e-194
## 3 x2 0.0435 0.0466 0.0333 7.83e- 94 2.83e- 86 3.64e-124
Interpretation: all variables are significant (p-value < 0.05). The WLS SEs are the smallest, so WLS gives the most efficient estimates.
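For reference, with known error variances WLS solves the weighted least-squares problem below, which is why it is the most efficient of the three here:
\[ \hat{\beta}_{\mathrm{WLS}} = \arg\min_{\beta} \sum_{i=1}^{n} w_i \left( y_i - \beta_0 - \beta_1 x_{1i} - \beta_2 x_{2i} \right)^2, \qquad w_i = \frac{1}{\sigma_i^2} \]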
dat1 <- mtcars %>%
as_tibble(rownames = "car") %>%
mutate(across(c(mpg, disp, hp, wt, qsec), as.double))
AIC
fit_poly <- lm(mpg ~ poly(wt, 2, raw = TRUE) + hp + disp, data = dat1)
fit_spl <- lm(mpg ~ bs(wt, df = 5) + hp + disp, data = dat1)
AIC(fit_poly, fit_spl) %>% arrange(AIC)
## df AIC
## fit_poly 6 150.7305
## fit_spl 9 154.3323
BIC
BIC(fit_poly, fit_spl) %>% arrange(BIC)
## df BIC
## fit_poly 6 159.5250
## fit_spl 9 167.5239
set.seed(123)
k <- 5
cv_res <- crossv_kfold(dat1, k = k)
rmse_fun <- function(train, test, formula){
  m <- lm(formula, data = as.data.frame(train))
  sqrt(mean((as.data.frame(test)$mpg - predict(m, newdata = as.data.frame(test)))^2))
}
forms <- list(
  Poly = mpg ~ poly(wt, 2, raw = TRUE) + hp + disp,
  Spline = mpg ~ bs(wt, df = 5) + hp + disp
)
cv_tbl <- map_dfr(names(forms), function(nm){
  fml <- forms[[nm]]
  tibble(model = nm, fold = seq_len(k),
         rmse = map2_dbl(cv_res$train, cv_res$test, ~ rmse_fun(.x, .y, fml)))
})
cv_tbl %>%
  group_by(model) %>%
  summarise(mean_rmse = mean(rmse), sd_rmse = sd(rmse)) %>%
  arrange(mean_rmse)
## # A tibble: 2 × 3
## model mean_rmse sd_rmse
## <chr> <dbl> <dbl>
## 1 Poly 2.35 0.980
## 2 Spline 2.74 0.807
library(dplyr)
library(tidyr)
# AIC & BIC
ic_tbl <- tibble(
Model = c("Poly", "Spline"),
AIC = c(AIC(fit_poly), AIC(fit_spl)),
BIC = c(BIC(fit_poly), BIC(fit_spl))
)
# CV RMSE
cv_summary <- cv_tbl %>%
group_by(model) %>%
summarise(
mean_RMSE = mean(rmse),
sd_RMSE = sd(rmse),
.groups = "drop"
) %>%
rename(Model = model)
# Combine everything
final_tbl <- ic_tbl %>%
left_join(cv_summary, by = "Model")
final_tbl
## # A tibble: 2 × 5
## Model AIC BIC mean_RMSE sd_RMSE
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 Poly 151. 160. 2.35 0.980
## 2 Spline 154. 168. 2.74 0.807
Interpretation: the polynomial model beats the spline, with smaller AIC and BIC as well as a lower mean cross-validated RMSE.
library(tibble)
set.seed(42)
T <- 400
t <- 1:T
rho <- 0.6
u <- rnorm(T, 0, 1)
err <- numeric(T)
for (i in 2:T) err[i] <- rho * err[i-1] + u[i]
# --- x is now a seasonal signal
x <- sin(2*pi*t/12)
y <- 1 + 2*x + err
dar1 <- tibble(t = t, x = x, y = y)
head(dar1)
## # A tibble: 6 × 3
## t x y
## <int> <dbl> <dbl>
## 1 1 5 e- 1 2
## 2 2 8.66e- 1 2.17
## 3 3 1 e+ 0 3.02
## 4 4 8.66e- 1 3.38
## 5 5 5 e- 1 2.79
## 6 6 1.22e-16 1.37
OLS vs GLS
library(nlme)
# Plain OLS
fit_ols_ar1 <- lm(y ~ x, data = dar1)
# GLS with AR(1) correlation in the residuals
fit_gls_ar1 <- gls(y ~ x, data = dar1,
                   correlation = corAR1(form = ~ t))
summary(fit_ols_ar1)
##
## Call:
## lm(formula = y ~ x, data = dar1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.0510 -0.7985 0.0059 0.7792 3.6370
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.96844 0.05664 17.1 <2e-16 ***
## x 1.83909 0.07995 23.0 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.133 on 398 degrees of freedom
## Multiple R-squared: 0.5707, Adjusted R-squared: 0.5697
## F-statistic: 529.2 on 1 and 398 DF, p-value: < 2.2e-16
summary(fit_gls_ar1)
## Generalized least squares fit by REML
## Model: y ~ x
## Data: dar1
## AIC BIC logLik
## 1109.051 1124.997 -550.5254
##
## Correlation Structure: AR(1)
## Formula: ~t
## Parameter estimate(s):
## Phi
## 0.5440471
##
## Coefficients:
## Value Std.Error t-value p-value
## (Intercept) 0.9724752 0.1043172 9.322293 0
## x 1.8390741 0.1132805 16.234685 0
##
## Correlation:
## (Intr)
## x -0.013
##
## Standardized residuals:
## Min Q1 Med Q3 Max
## -3.566387570 -0.705789893 0.001620549 0.681785092 3.195184639
##
## Residual standard error: 1.137024
## Degrees of freedom: 400 total; 398 residual
se_ols <- coef(summary(fit_ols_ar1))[, "Std. Error"]
se_gls <- summary(fit_gls_ar1)$tTable[, "Std.Error"]
cbind(SE_OLS = se_ols, SE_GLS = se_gls)
## SE_OLS SE_GLS
## (Intercept) 0.05663725 0.1043172
## x 0.07994741 0.1132805
The SEs from the GLS model are larger than those from OLS. The GLS model is the more valid one because it accounts for the autocorrelation structure in the errors, so its SEs reflect the uncertainty that is actually present.
Serial Correlation Diagnostics
par(mfrow = c(1,2))
acf(resid(fit_ols_ar1), main = "ACF Residuals (OLS)")
acf(resid(fit_gls_ar1, type = "normalized"), main = "ACF Residuals (GLS)")
- OLS: the OLS residuals show significant positive autocorrelation at the early lags (spikes beyond the blue bounds).
par(mfrow = c(1,1))
H0: no residual autocorrelation
H1: residual autocorrelation is present
dw <- lmtest::dwtest(fit_ols_ar1)
dw
##
## Durbin-Watson test
##
## data: fit_ols_ar1
## DW = 0.92047, p-value < 2.2e-16
## alternative hypothesis: true autocorrelation is greater than 0
dw$p.value
## [1] 8.988689e-28
# Estimate rho
rho_hat <- coef(fit_gls_ar1$modelStruct$corStruct, unconstrained = FALSE)
cat("Estimated rho =", rho_hat, "\n")
## Estimated rho = 0.5440471
\(\hat{\rho} = 0.544\) means the current residual is still positively correlated (about 54%) with the previous residual.
set.seed(42)
library(ggplot2)
library(dplyr)
library(tidyr)
# Parameters
K <- 1.0 # maximum capacity (e.g., a proportion)
alpha <- -3.0 # logit intercept
beta <- 0.9 # growth rate
sigma <- 0.03 # noise
# Data
t <- seq(0, 8, length.out = 120)
mu <- K/(1 + exp(-(alpha + beta*t)))
y <- mu + rnorm(length(t), sd = sigma)
logistic_df <- tibble(t= t, y= y, mu = mu)
# Plot
ggplot(logistic_df, aes(t, y)) +
geom_point(alpha = 0.6) +
geom_line(aes(y = mu), linewidth = 1) +
labs(title = "Pertumbuhan Bakteri (Kurva Logistik)",
x= "Waktu (t)", y= "Ukuran/Proporsi Populasi") +
theme_minimal()
# Parameters
Vmax <- 2.0
Km <- 1.2
sigma <- 0.06
# Data
X <- seq(0, 6, length.out = 80)
mu <- (Vmax * X) / (Km + X)
Y <- mu + rnorm(length(X), sd = sigma)
mm_df <- tibble(X = X, Y = Y, mu = mu)
ggplot(mm_df, aes(X, Y)) +
geom_point(alpha = 0.6) +
geom_line(aes(y = mu), linewidth = 1) +
labs(title = "Respon Obat (Michaelis–Menten)",
x= "Konsentrasi Obat (X)", y= "Respon (Y)") +
theme_minimal()
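A fitting sketch for this curve with nls(); the starting values are rough guesses chosen for illustration, not taken from the text above.
fit_mm <- nls(Y ~ Vmax * X / (Km + X), data = mm_df,
              start = list(Vmax = 1.5, Km = 1))
coef(fit_mm) # should land near the true Vmax = 2.0 and Km = 1.2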
# Parameters
A <- 5
alpha <- 0.5 # < 1 → diminishing marginal return
sigma <- 0.3
# Data
X <- seq(0, 100, length.out = 120)
mu <- A * X^alpha
Y <- mu + rnorm(length(X), sd = sigma)
dr_df <- tibble(X = X, Y = Y, mu = mu)
ggplot(dr_df, aes(X, Y)) +
geom_point(alpha = 0.6) +
geom_line(aes(y = mu), linewidth = 1) +
labs(title = "Diminishing Return: Y = A * X^{alpha}, 0<alpha<1",
x= "Input/Modal (X)", y= "Output (Y)") +
theme_minimal()
set.seed(123)
library(ggplot2)
library(dplyr)
library(tidyr)
beta0_true <- 2.0
beta1_true <- 0.3
sigma_true <- 0.4
# Small dataset so each step is easy to trace
x <- c(0, 1, 2, 3, 4, 5)
f_true <- beta0_true * exp(beta1_true * x)
y <- f_true + rnorm(length(x), sd = sigma_true)
dat <- tibble(x, y, f_true)
dat
## # A tibble: 6 × 3
## x y f_true
## <dbl> <dbl> <dbl>
## 1 0 1.78 2
## 2 1 2.61 2.70
## 3 2 4.27 3.64
## 4 3 4.95 4.92
## 5 4 6.69 6.64
## 6 5 9.65 8.96
ggplot(dat, aes(x, y)) +
geom_point(size=2) +
geom_line(aes(y = f_true), linewidth = 0.9) +
labs(title="Data Eksponensial (dengan noise) & Kurva Sejati",
x="X", y="Y") +
theme_minimal()
f_exp <- function(x, theta){ # theta = c(beta0, beta1)
beta0 <- theta[1]; beta1 <- theta[2]
beta0 * exp(beta1 * x)
}
resid_vec <- function(x, y, theta){
  y - f_exp(x, theta)
}
J_exp <- function(x, theta){
beta0 <- theta[1]; beta1 <- theta[2]
e <- exp(beta1 * x)
cbind( d_beta0 = e,
d_beta1 = beta0 * x * e)
}
gn_step <- function(x, y, theta){
  r <- resid_vec(x, y, theta)
  J <- J_exp(x, theta)
  # Normal equations of the linearized problem
  lhs <- crossprod(J, J)   # J^T J
  rhs <- crossprod(J, r)   # J^T r
  Delta <- solve(lhs, rhs) # Gauss-Newton step
  list(Delta = as.numeric(Delta),
       theta_new = theta + as.numeric(Delta),
       r = r, J = J, lhs = lhs, rhs = rhs,
       SSE = sum(r^2))
}
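In matrix form, the step computed by gn_step is the Gauss–Newton update obtained from the linearized residuals:
\[ \Delta = (J^\top J)^{-1} J^\top r, \qquad \theta_{\text{new}} = \theta + \Delta \]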
theta0 <- c(beta0 = 1.0, beta1 = 0.0) # deliberately crude initial guess
step1 <- gn_step(x, y, theta0)
step1
## $Delta
## [1] 0.2542352 1.4943019
##
## $theta_new
## beta0 beta1
## 1.254235 1.494302
##
## $r
## [1] 0.7758097 1.6076466 3.2677209 3.9474096 5.6919489 8.6494041
##
## $J
## d_beta0 d_beta1
## [1,] 1 0
## [2,] 1 1
## [3,] 1 2
## [4,] 1 3
## [5,] 1 4
## [6,] 1 5
##
## $lhs
## d_beta0 d_beta1
## d_beta0 6 15
## d_beta1 15 55
##
## $rhs
## [,1]
## d_beta0 23.93994
## d_beta1 86.00013
##
## $SSE
## [1] 136.6569