1 Model Regresi

Model persamaan untuk setiap pengamatan \(i=1,\ldots,n\):

\[ Y_i = \beta_0 + \beta_1 X_{i} + \varepsilon_i, \qquad i=1,\ldots,n \] dengan

  • \(Y_i\) : variabel dependen (respon)
  • \(X_i\) : variabel independen (prediktor)
  • \(\beta_0, \beta_1\) : parameter regresi
  • \(\varepsilon_i\) : error (residu)

2 Latihan 1

Data Gunakan dataset mtcars untuk melihat hubungan mpg (miles per gallon) dengan wt (berat mobil).

data(mtcars)
head(mtcars[, c("mpg", "wt")])
##                    mpg    wt
## Mazda RX4         21.0 2.620
## Mazda RX4 Wag     21.0 2.875
## Datsun 710        22.8 2.320
## Hornet 4 Drive    21.4 3.215
## Hornet Sportabout 18.7 3.440
## Valiant           18.1 3.460

Korelasi

cor.test(mtcars$wt, mtcars$mpg)
## 
##  Pearson's product-moment correlation
## 
## data:  mtcars$wt and mtcars$mpg
## t = -9.559, df = 30, p-value = 1.294e-10
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.9338264 -0.7440872
## sample estimates:
##        cor 
## -0.8676594

Regresi

model <- lm(mpg ~ wt, data = mtcars)
summary(model)
## 
## Call:
## lm(formula = mpg ~ wt, data = mtcars)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.5432 -2.3647 -0.1252  1.4096  6.8727 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  37.2851     1.8776  19.858  < 2e-16 ***
## wt           -5.3445     0.5591  -9.559 1.29e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.046 on 30 degrees of freedom
## Multiple R-squared:  0.7528, Adjusted R-squared:  0.7446 
## F-statistic: 91.38 on 1 and 30 DF,  p-value: 1.294e-10

Visualisasi

library(ggplot2)

ggplot(mtcars, aes(x = wt, y= mpg)) +
 geom_point() +
 geom_smooth(method = "lm", se = TRUE, color = "blue") +
 labs(title = "Hubungan antara Berat Mobil (wt) dan Efisiensi Bahan Bakar (mpg)",
 x= "Berat Mobil (1000 lbs)",
 y= "Miles per Gallon (mpg)")

2.1 Studi Kasus 1: Regresi pada Data Nyata (mtcars)

2.1.1 Persiapan Data dan Eksplorasi

dat1 <- mtcars %>%
 as_tibble(rownames = "car") %>%
 mutate(across(c(mpg, disp, hp, wt, qsec), as.double))

# Plot hubungan awal
GG1 <- ggplot(dat1, aes(wt, mpg)) +
 geom_point(alpha = 0.7) +
 geom_smooth(method = "lm", se = FALSE) +
 labs(x= "Weight (1000 lbs)", y= "Miles per Gallon (mpg)",
 title = "Hubungan mpg vs wt pada mtcars")

GG1

2.1.2 Estimasi OLS & Ringkasan

fit1 <- lm(mpg ~ wt + hp + disp, data = dat1)
summary(fit1)
## 
## Call:
## lm(formula = mpg ~ wt + hp + disp, data = dat1)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -3.891 -1.640 -0.172  1.061  5.861 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 37.105505   2.110815  17.579  < 2e-16 ***
## wt          -3.800891   1.066191  -3.565  0.00133 ** 
## hp          -0.031157   0.011436  -2.724  0.01097 *  
## disp        -0.000937   0.010350  -0.091  0.92851    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.639 on 28 degrees of freedom
## Multiple R-squared:  0.8268, Adjusted R-squared:  0.8083 
## F-statistic: 44.57 on 3 and 28 DF,  p-value: 8.65e-11
#Koefisien dengan CI 95%
 tidy(fit1,conf.int= TRUE)
## # A tibble: 4 × 7
##   term         estimate std.error statistic  p.value conf.low conf.high
##   <chr>           <dbl>     <dbl>     <dbl>    <dbl>    <dbl>     <dbl>
## 1 (Intercept) 37.1         2.11     17.6    1.16e-16  32.8     41.4    
## 2 wt          -3.80        1.07     -3.56   1.33e- 3  -5.98    -1.62   
## 3 hp          -0.0312      0.0114   -2.72   1.10e- 2  -0.0546  -0.00773
## 4 disp        -0.000937    0.0103   -0.0905 9.29e- 1  -0.0221   0.0203

2.1.3 Diagnostik Dasar(Residu, QQ-plot, Cook)

diag_df <-tibble(
 fitted= fitted(fit1), resid= resid(fit1),
 stdres= rstandard(fit1), hat= hatvalues(fit1), cook= cooks.distance(fit1)
)

p1<-ggplot(diag_df, aes(fitted,resid)) +
 geom_point(alpha= 0.7) +
 geom_hline(yintercept= 0,linetype= 2) +
 labs(x= "Fitted", y = "Residuals")

p2<-ggplot(diag_df, aes(sample= stdres))+
 stat_qq()+ stat_qq_line()+ labs(x= "Theoretical", y= "StandardizedResiduals")

p1;p2

qplot(seq_along(diag_df$cook),diag_df$cook, geom= c("line","point"), xlab= "Index", ylab= "Cook'sD")

2.1.4 Multikolinearitas(VIF)

car::vif(fit1)
##       wt       hp     disp 
## 4.844618 2.736633 7.324517

2.1.5 Robust SE (White HC3) & Interpretasi

coeftest(fit1)
## 
## t test of coefficients:
## 
##                Estimate  Std. Error t value  Pr(>|t|)    
## (Intercept) 37.10550527  2.11081525 17.5788 < 2.2e-16 ***
## wt          -3.80089058  1.06619064 -3.5649  0.001331 ** 
## hp          -0.03115655  0.01143579 -2.7245  0.010971 *  
## disp        -0.00093701  0.01034974 -0.0905  0.928507    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
coeftest(fit1, vcov= vcovHC(fit1, type= "HC3")) # SE robust (HC3)
## 
## t test of coefficients:
## 
##                Estimate  Std. Error t value  Pr(>|t|)    
## (Intercept) 37.10550527  2.71026981 13.6907 6.257e-14 ***
## wt          -3.80089058  1.08086207 -3.5165   0.00151 ** 
## hp          -0.03115655  0.01437457 -2.1675   0.03886 *  
## disp        -0.00093701  0.01012138 -0.0926   0.92690    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

2.1.6 Nonlinearitas: Polinomial vs Spline pada wt

fit_poly <-lm(mpg~ poly(wt, 2, raw= TRUE) +hp+ disp, data= dat1)
fit_spl <-lm(mpg~ bs(wt, df= 5) + hp+ disp,data= dat1)
AIC(fit1,fit_poly,fit_spl) %>% arrange(AIC)
##          df      AIC
## fit_poly  6 150.7305
## fit_spl   9 154.3323
## fit1      5 158.6430
BIC(fit1,fit_poly,fit_spl) %>% arrange(BIC)
##          df      BIC
## fit_poly  6 159.5250
## fit1      5 165.9717
## fit_spl   9 167.5239
#Kurva prediksi
grid_wt <-tibble(wt= seq(min(dat1$wt),max(dat1$wt), length.out= 200), hp= mean(dat1$hp), disp= mean(dat1$disp))
pred <-grid_wt %>%
  mutate(OLS= predict(fit1, newdata= grid_wt),
         Poly= predict(fit_poly, newdata= grid_wt),
         Spline= predict(fit_spl, newdata= grid_wt))%>%
  pivot_longer(OLS:Spline, names_to= "model",values_to= "yhat")

GG2 <-ggplot()+
  geom_point(data= dat1,aes(wt,mpg), alpha= 0.3) +
  geom_line(data= pred, aes(wt,yhat, linetype=model), linewidth= 0.9) +
  labs(x= "wt", y= "mpg", title= "Perbandingan OLS vs Polinomial vs Spline")
 GG2

2.1.7 Validasi Silang (K-fold) untuk Perbandingan Model

k <-5
cv_res <-crossv_kfold(dat1,k= k)

rmse_fun <-function(train,test,formula){
  m <-lm(formula,data= as.data.frame(train))
  sqrt(mean((as.data.frame(test)$mpg-predict(m,newdata= as.data.frame(test)))^2))
}

forms<-list(
  OLS = mpg~ wt+ hp+ disp,
  Poly= mpg~ poly(wt, 2, raw= TRUE) + hp+ disp,
  Spline= mpg ~ bs(wt, df= 5)+ hp+ disp
)

cv_tbl <-map_dfr(names(forms),function(nm){
 fml<-forms[[nm]]
 tibble(model= nm,fold = seq_len(k),
 rmse= map2_dbl(cv_res$train,cv_res$test,~ rmse_fun(.x,.y,fml)))
 })
 cv_tbl %>%group_by(model) %>%summarise(mean_rmse= mean(rmse), sd_rmse= sd(rmse)) %>% arrange(mean_rmse)
## # A tibble: 3 × 3
##   model  mean_rmse sd_rmse
##   <chr>      <dbl>   <dbl>
## 1 Poly        2.35   0.980
## 2 Spline      2.74   0.807
## 3 OLS         2.92   1.01

2.2 Studi Kasus 2: Simulasi Heteroskedastisitas (WLS vs OLS vs Robust)

n <-400
x1<-runif(n, 0, 10)
x2<-rnorm(n, 5, 2)
sigma_i <-0.6 + 0.2* x1 #var naik dengan x1
err <-rnorm(n,0,sigma_i)
y <-2 + 1.5*x1-1.2*x2+ err
sim <-tibble(y,x1,x2, w= 1/sigma_i^2)

fit_ols <-lm(y~ x1+ x2, data= sim)
fit_wls <-lm(y~ x1+ x2, data= sim, weights= w)

#Tabel perbandingan SE
comp_se <-tibble(
 term= names(coef(fit_ols)),
 OLS_SE = coef(summary(fit_ols))[, "Std. Error"],
 Robust_SE= sqrt(diag(vcovHC(fit_ols, type= "HC3"))),
 WLS_SE = coef(summary(fit_wls))[, "Std. Error"]
)
comp_se
## # A tibble: 3 × 4
##   term        OLS_SE Robust_SE WLS_SE
##   <chr>        <dbl>     <dbl>  <dbl>
## 1 (Intercept) 0.250     0.212  0.175 
## 2 x1          0.0293    0.0282 0.0261
## 3 x2          0.0415    0.0399 0.0313
#Uji Breusch-Pagan untuk heteroskedastisitas
lmtest::bptest(fit_ols)
## 
##  studentized Breusch-Pagan test
## 
## data:  fit_ols
## BP = 29.415, df = 2, p-value = 4.098e-07
ggplot(data.frame(fitted=fitted(fit_ols),resid=resid(fit_ols)),aes(fitted,resid)) +
  geom_point(alpha=.6) + geom_smooth(se=FALSE,method="loess")+
  geom_hline(yintercept= 0,linetype=2) + labs(x="Fitted",y="Residuals")

2.3 GLS dengan Korelasi AR(1): Contoh dan Simulasi

2.3.1 Simulasi data dengan galat AR(1)

set.seed(42)
T <- 400
x <- runif(T,-2, 2)
rho <- 0.6
u <- rnorm(T, 0, 1)
err <- numeric(T)
for (t in 2:T) err[t] <- rho*err[t-1] + u[t]
y <- 1 + 2*x + err

dar1 <- tibble::tibble(t = 1:T, x, y)
head(dar1)
## # A tibble: 6 × 3
##       t       x     y
##   <int>   <dbl> <dbl>
## 1     1  1.66   4.32 
## 2     2  1.75   4.83 
## 3     3 -0.855  0.661
## 4     4  1.32   6.53 
## 5     5  0.567  2.49 
## 6     6  0.0764 0.213

2.3.2 OLS vs GLS (nlme::gls dengan corAR1)

library(nlme)
# OLS biasa
fit_ols_ar1 <- lm(y ~ x, data = dar1)

# GLS dengan korelasi AR(1) di residual
fit_gls_ar1 <- gls(y ~ x, data = dar1, correlation = corAR1(form = ~ t))
summary(fit_ols_ar1)
## 
## Call:
## lm(formula = y ~ x, data = dar1)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.3360 -0.8231  0.0026  0.8211  3.9013 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.95441    0.06043   15.79   <2e-16 ***
## x            2.06045    0.05162   39.92   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.209 on 398 degrees of freedom
## Multiple R-squared:  0.8002, Adjusted R-squared:  0.7997 
## F-statistic:  1594 on 1 and 398 DF,  p-value: < 2.2e-16
summary(fit_gls_ar1)
## Generalized least squares fit by REML
##   Model: y ~ x 
##   Data: dar1 
##        AIC      BIC    logLik
##   1139.541 1155.487 -565.7706
## 
## Correlation Structure: AR(1)
##  Formula: ~t 
##  Parameter estimate(s):
##       Phi 
## 0.5772332 
## 
## Coefficients:
##                 Value  Std.Error  t-value p-value
## (Intercept) 0.9552097 0.11651869  8.19791       0
## x           2.0449090 0.03743358 54.62766       0
## 
##  Correlation: 
##   (Intr)
## x 0.004 
## 
## Standardized residuals:
##          Min           Q1          Med           Q3          Max 
## -2.752477543 -0.674750151 -0.007333444  0.679546648  3.225627947 
## 
## Residual standard error: 1.210605 
## Degrees of freedom: 400 total; 398 residual

2.3.3 Diagnostik serial correlation

 par(mfrow = c(1,2))
 acf(resid(fit_ols_ar1), main = "ACF Residuals (OLS)")
 acf(resid(fit_gls_ar1, type = "normalized"), main = "ACF Residuals (GLS)")

par(mfrow = c(1,1))
lmtest::dwtest(fit_ols_ar1) # indikasi korelasi serial pada OLS
## 
##  Durbin-Watson test
## 
## data:  fit_ols_ar1
## DW = 0.84993, p-value < 2.2e-16
## alternative hypothesis: true autocorrelation is greater than 0

2.3.4 Alternatif: Quasi-differencing (Cochrane–Orcutt) manual

# Estimasi rho dari regresi resid_t ~ resid_{t-1}
res <- resid(fit_ols_ar1)
rho_hat <- coef(lm(res[-1] ~ 0 + res[-length(res)]))[1]

# Quasi-differencing (transformasi data)
y_star <- dar1$y[-1]- rho_hat*dar1$y[-T]
x_star <- dar1$x[-1]- rho_hat*dar1$x[-T]
fit_qd <- lm(y_star ~ x_star)
summary(fit_qd)
## 
## Call:
## lm(formula = y_star ~ x_star)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.6691 -0.6723 -0.0232  0.6970  3.2363 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.40603    0.04955   8.194 3.52e-15 ***
## x_star       2.04499    0.03757  54.439  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.9898 on 397 degrees of freedom
## Multiple R-squared:  0.8819, Adjusted R-squared:  0.8816 
## F-statistic:  2964 on 1 and 397 DF,  p-value: < 2.2e-16

Catatan: gls() memperkirakan 𝜌 dan ragam secara simultan dalam kerangka likelihood sehingga sering lebih stabil dibanding pendekatan dua tahap.

2.4 Latihan

set.seed(123)

#Paket yang diperlukan
pkgs <-c("dplyr","ggplot2","car","lmtest","sandwich","MASS","boot","broom")
new <-pkgs[!(pkgs%in% installed.packages()[,"Package"])]
if(length(new))install.packages(new, dependencies= TRUE)
lapply(pkgs,library, character.only= TRUE)
## [[1]]
##  [1] "nlme"      "splines"   "modelr"    "car"       "carData"   "lmtest"   
##  [7] "zoo"       "sandwich"  "broom"     "lubridate" "forcats"   "stringr"  
## [13] "dplyr"     "purrr"     "readr"     "tidyr"     "tibble"    "ggplot2"  
## [19] "tidyverse" "stats"     "graphics"  "grDevices" "utils"     "datasets" 
## [25] "methods"   "base"     
## 
## [[2]]
##  [1] "nlme"      "splines"   "modelr"    "car"       "carData"   "lmtest"   
##  [7] "zoo"       "sandwich"  "broom"     "lubridate" "forcats"   "stringr"  
## [13] "dplyr"     "purrr"     "readr"     "tidyr"     "tibble"    "ggplot2"  
## [19] "tidyverse" "stats"     "graphics"  "grDevices" "utils"     "datasets" 
## [25] "methods"   "base"     
## 
## [[3]]
##  [1] "nlme"      "splines"   "modelr"    "car"       "carData"   "lmtest"   
##  [7] "zoo"       "sandwich"  "broom"     "lubridate" "forcats"   "stringr"  
## [13] "dplyr"     "purrr"     "readr"     "tidyr"     "tibble"    "ggplot2"  
## [19] "tidyverse" "stats"     "graphics"  "grDevices" "utils"     "datasets" 
## [25] "methods"   "base"     
## 
## [[4]]
##  [1] "nlme"      "splines"   "modelr"    "car"       "carData"   "lmtest"   
##  [7] "zoo"       "sandwich"  "broom"     "lubridate" "forcats"   "stringr"  
## [13] "dplyr"     "purrr"     "readr"     "tidyr"     "tibble"    "ggplot2"  
## [19] "tidyverse" "stats"     "graphics"  "grDevices" "utils"     "datasets" 
## [25] "methods"   "base"     
## 
## [[5]]
##  [1] "nlme"      "splines"   "modelr"    "car"       "carData"   "lmtest"   
##  [7] "zoo"       "sandwich"  "broom"     "lubridate" "forcats"   "stringr"  
## [13] "dplyr"     "purrr"     "readr"     "tidyr"     "tibble"    "ggplot2"  
## [19] "tidyverse" "stats"     "graphics"  "grDevices" "utils"     "datasets" 
## [25] "methods"   "base"     
## 
## [[6]]
##  [1] "MASS"      "nlme"      "splines"   "modelr"    "car"       "carData"  
##  [7] "lmtest"    "zoo"       "sandwich"  "broom"     "lubridate" "forcats"  
## [13] "stringr"   "dplyr"     "purrr"     "readr"     "tidyr"     "tibble"   
## [19] "ggplot2"   "tidyverse" "stats"     "graphics"  "grDevices" "utils"    
## [25] "datasets"  "methods"   "base"     
## 
## [[7]]
##  [1] "boot"      "MASS"      "nlme"      "splines"   "modelr"    "car"      
##  [7] "carData"   "lmtest"    "zoo"       "sandwich"  "broom"     "lubridate"
## [13] "forcats"   "stringr"   "dplyr"     "purrr"     "readr"     "tidyr"    
## [19] "tibble"    "ggplot2"   "tidyverse" "stats"     "graphics"  "grDevices"
## [25] "utils"     "datasets"  "methods"   "base"     
## 
## [[8]]
##  [1] "boot"      "MASS"      "nlme"      "splines"   "modelr"    "car"      
##  [7] "carData"   "lmtest"    "zoo"       "sandwich"  "broom"     "lubridate"
## [13] "forcats"   "stringr"   "dplyr"     "purrr"     "readr"     "tidyr"    
## [19] "tibble"    "ggplot2"   "tidyverse" "stats"     "graphics"  "grDevices"
## [25] "utils"     "datasets"  "methods"   "base"

1. Simulasi Data

Tujuan: membuat data dengan 3 prediktor (X1, X2, X3), disuntik multikolinearitas ringan (X1 & X2 berkorelasi) dan heteroskedastisitas (ragam error meningkat saat |X1| besar), serta sedikit outlier.

n <- 200
X1 <- rnorm(n, 0, 1)
# X2 berkorelasi ~0.7 dengan X1
rho <- 0.7
X2 <- rho*X1 + sqrt(1-rho^2)*rnorm(n)
X3 <- rnorm(n)

# Heteroskedastisitas: sd error tergantung |X1|
eps <- rnorm(n, sd = 0.5 + 0.5*abs(X1))
# Model benar (ground truth): y = 2 + 1.5*X1- 0.8*X2 + 0.5*X3 + eps
y <- 2 + 1.5*X1- 0.8*X2 + 0.5*X3 + eps

# Tambah beberapa outlier pada respon
idx_out <- sample(1:n, 3)
y[idx_out] <- y[idx_out] + rnorm(3, mean = 6, sd = 1)

dat <- data.frame(y, X1, X2, X3)
summary(dat)
##        y                X1                 X2                 X3          
##  Min.   :-3.194   Min.   :-2.30917   Min.   :-2.36565   Min.   :-2.80977  
##  1st Qu.: 1.157   1st Qu.:-0.62576   1st Qu.:-0.68926   1st Qu.:-0.55753  
##  Median : 2.040   Median :-0.05874   Median : 0.03268   Median : 0.07583  
##  Mean   : 2.066   Mean   :-0.00857   Mean   : 0.02408   Mean   : 0.03178  
##  3rd Qu.: 2.895   3rd Qu.: 0.56840   3rd Qu.: 0.69036   3rd Qu.: 0.68098  
##  Max.   : 9.427   Max.   : 3.24104   Max.   : 3.00049   Max.   : 2.43023

2. Estimasi Parameter (OLS)

mod0 <- lm(y ~ X1 + X2 + X3, data = dat)
summary(mod0)
## 
## Call:
## lm(formula = y ~ X1 + X2 + X3, data = dat)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.3070 -0.6446 -0.0414  0.5284  7.1821 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  2.07742    0.08726  23.806  < 2e-16 ***
## X1           1.31536    0.12466  10.551  < 2e-16 ***
## X2          -0.75833    0.12303  -6.164 3.97e-09 ***
## X3           0.57737    0.09070   6.366 1.35e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.232 on 196 degrees of freedom
## Multiple R-squared:  0.4404, Adjusted R-squared:  0.4318 
## F-statistic: 51.41 on 3 and 196 DF,  p-value: < 2.2e-16
broom::glance(mod0) # ringkas: R^2, Adj R^2, AIC, dll.
## # A tibble: 1 × 12
##   r.squared adj.r.squared sigma statistic  p.value    df logLik   AIC   BIC
##       <dbl>         <dbl> <dbl>     <dbl>    <dbl> <dbl>  <dbl> <dbl> <dbl>
## 1     0.440         0.432  1.23      51.4 1.47e-24     3  -324.  657.  674.
## # ℹ 3 more variables: deviance <dbl>, df.residual <int>, nobs <int>

Catatan interpretasi cepat: Koefisien menunjukkan perubahan rata-rata y untuk kenaikan 1 unit pada prediktor (dengan prediktor lain konstan). Lihat Pr(>|t|) untuk signifikansi.

3. Uji Hipotesis 3.1 Uji serentak (overall F-test) Sudah tersedia pada output summary(mod0) (Signifikansi model secara keseluruhan).

3.2 Uji parsial (t-test per koefisien) Juga ada di summary(mod0).

3.3 Uji hipotesis gabungan (contoh:

car::linearHypothesis(mod0, c("X2 = 0", "X3 = 0"))
## 
## Linear hypothesis test:
## X2 = 0
## X3 = 0
## 
## Model 1: restricted model
## Model 2: y ~ X1 + X2 + X3
## 
##   Res.Df    RSS Df Sum of Sq      F    Pr(>F)    
## 1    198 423.38                                  
## 2    196 297.59  2    125.79 41.423 9.897e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

3.4 Uji dengan SE robust (menghadapi heteroskedastisitas)

# Koefisien + SE robust (HC3)
coeftest(mod0, vcov. = sandwich::vcovHC(mod0, type = "HC3"))
## 
## t test of coefficients:
## 
##              Estimate Std. Error t value  Pr(>|t|)    
## (Intercept)  2.077419   0.087811 23.6578 < 2.2e-16 ***
## X1           1.315357   0.163042  8.0676 7.027e-14 ***
## X2          -0.758334   0.123626 -6.1341 4.644e-09 ***
## X3           0.577370   0.107728  5.3595 2.330e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Uji Asumsi

4.1 Multikolinearitas

# Variance Inflation Factor (VIF)
car::vif(mod0)
##       X1       X2       X3 
## 1.811923 1.816606 1.003660
# Kondisi numerik (opsional): kappa > 30 indikasi masalah
kappa(model.matrix(mod0))
## [1] 2.263679

Rule of thumb: VIF > 10 (atau > 5) → indikasi kolinearitas meningkat.

4.2 Heteroskedastisitas

# Breusch-Pagan
lmtest::bptest(mod0)
## 
##  studentized Breusch-Pagan test
## 
## data:  mod0
## BP = 1.0283, df = 3, p-value = 0.7944
# White test (via bptest dengan kuadrat fitted)
lmtest::bptest(mod0, ~ fitted(mod0) + I(fitted(mod0)^2))
## 
##  studentized Breusch-Pagan test
## 
## data:  mod0
## BP = 5.3586, df = 2, p-value = 0.06861

Jika signifikan, gunakan SE robust (lihat 3.4) untuk inferensi yang lebih andal.

4.3 Normalitas residual

res <- rstandard(mod0)
shapiro.test(res) # sensitif untuk n besar; gunakan juga QQ-plot
## 
##  Shapiro-Wilk normality test
## 
## data:  res
## W = 0.87436, p-value = 7.729e-12
qqnorm(res); qqline(res)

4.4 Autokorelasi residual (opsional) (Biasanya untuk data runtun waktu)

lmtest::dwtest(mod0)
## 
##  Durbin-Watson test
## 
## data:  mod0
## DW = 2.0342, p-value = 0.5998
## alternative hypothesis: true autocorrelation is greater than 0

4.5 Uji spesifikasi (linearitas) – RamseyRESET

lmtest::resettest(mod0)
## 
##  RESET test
## 
## data:  mod0
## RESET = 4.2735, df1 = 2, df2 = 194, p-value = 0.01527

4.6 Plot Diagnostik Standar

par(mfrow=c(2,2))
plot(mod0)

par(mfrow=c(1,1))

5. Seleksi Model

Strategi: mulai dari model kandidat (termasuk transformasi sederhana) lalu gunakan AIC (stepwise), bandingkan dengan model awal,dan validasi via CV.

dat2 <-dat %>%
 mutate(X1_sq= X1^2, X2_sq= X2^2,X3_sq= X3^2,
 X1X2= X1*X2, X1X3= X1*X3,X2X3= X2*X3)

mod_full <-lm(y~X1 +X2 +X3+ X1_sq +X2_sq +X3_sq + X1X2+ X1X3 + X2X3,data= dat2)
AIC(mod0); AIC(mod_full)
## [1] 657.0574
## [1] 660.7653
#Stepwise berbasis AIC (dua arah)
mod_step <-MASS::stepAIC(mod_full,direction= "both", trace= FALSE)
summary(mod_step)
## 
## Call:
## lm(formula = y ~ X1 + X2 + X3 + X2_sq + X1X2, data = dat2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.2286 -0.6853 -0.0298  0.4825  7.0242 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  2.19439    0.10487  20.925  < 2e-16 ***
## X1           1.31032    0.12513  10.472  < 2e-16 ***
## X2          -0.72799    0.12278  -5.929 1.37e-08 ***
## X3           0.57331    0.08996   6.373 1.32e-09 ***
## X2_sq       -0.28501    0.11895  -2.396   0.0175 *  
## X1X2         0.23671    0.13546   1.748   0.0821 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.22 on 194 degrees of freedom
## Multiple R-squared:  0.4571, Adjusted R-squared:  0.4431 
## F-statistic: 32.67 on 5 and 194 DF,  p-value: < 2.2e-16
broom::glance(mod_step)[,c("r.squared","adj.r.squared","AIC","BIC")]
## # A tibble: 1 × 4
##   r.squared adj.r.squared   AIC   BIC
##       <dbl>         <dbl> <dbl> <dbl>
## 1     0.457         0.443  655.  678.

Validasi silang (10-fold CV) untuk membandingkan RMSE:

# Fungsi CV RMSE menggunakan boot::cv.glm
cv_rmse <- function(model, data, K=10){
  # cv.glm mengembalikan delta: [1] raw CV, [2] adjusted CV
  set.seed(123)
  res <- boot::cv.glm(data = data, glmfit = model, K = K)
  sqrt(res$delta[1])
}

rmse_mod0   <- cv_rmse(mod0, dat)
rmse_modstep<- cv_rmse(mod_step, dat2)
c(RMSE_mod0 = rmse_mod0, RMSE_modStep = rmse_modstep)
##    RMSE_mod0 RMSE_modStep 
##          NaN          NaN

Pilih model dengan trade-off terbaik antara kesederhanaan dan kinerja (RMSE, AIC/BIC, Adj, R squared).

6. Evaluasi Outlier & Titik Berpengaruh

Gunakan studentized residuals, leverage (hat), dan Cook’s distance. Tambahkan Bonferroni outlier test.

infl <- influence.measures(mod0)
# Ringkasan pengaruh
summary(infl)
## Potentially influential observations of
##   lm(formula = y ~ X1 + X2 + X3, data = dat) :
## 
##     dfb.1_ dfb.X1 dfb.X2 dfb.X3 dffit   cov.r   cook.d hat    
## 16  -0.21  -0.06  -0.32   0.54  -0.78_*  0.94    0.15   0.07_*
## 18   0.30  -0.30  -0.24   0.09   0.74_*  0.75_*  0.13   0.03  
## 27   0.00   0.00   0.00   0.01   0.01    1.07_*  0.00   0.04  
## 56  -0.05  -0.05   0.01   0.11  -0.14    1.07_*  0.00   0.06  
## 65  -0.14   0.36  -0.34   0.17  -0.45_*  0.97    0.05   0.04  
## 90   0.31   0.25   0.04  -0.12   0.50_*  0.72_*  0.06   0.01  
## 97   0.00   0.00  -0.01   0.00  -0.01    1.08_*  0.00   0.05  
## 135 -0.03   0.07  -0.01  -0.08  -0.12    1.07_*  0.00   0.05  
## 147  0.19  -0.21  -0.01  -0.25   0.42    0.92_*  0.04   0.03  
## 149  0.23   0.54  -0.20   0.60   0.86_*  0.89_*  0.18   0.07_*
## 160 -0.02   0.04  -0.04   0.01  -0.05    1.06_*  0.00   0.04  
## 162  0.43  -0.56   0.28   0.26   0.78_*  0.48_*  0.13   0.01  
## 164 -0.20  -0.38  -0.25  -0.35  -0.87_*  0.95    0.18   0.08_*
## 191  0.03   0.02  -0.02  -0.08   0.08    1.07_*  0.00   0.05
# Statistik diagnostik
studres <- rstudent(mod0)
lev <- hatvalues(mod0)
cookd <- cooks.distance(mod0)

# Ambang sederhana
thr_res <- 3
# |studentized residual| > 3
thr_lev <- 2*length(coef(mod0))/nrow(dat) # leverage tinggi
thr_cook <- 4/nrow(dat) # Cook's distance besar

flag <- data.frame(
 id = 1:nrow(dat),
 studres = studres,
 leverage = lev,
 cookd = cookd
) %>%
 mutate(
 flag_res = abs(studres) > thr_res,
 flag_lev = leverage > thr_lev,
 flag_cook = cookd > thr_cook,
 any_flag = flag_res | flag_lev | flag_cook
) %>%
 arrange(desc(any_flag), desc(abs(studres)))

head(flag, 10)
##      id   studres   leverage      cookd flag_res flag_lev flag_cook any_flag
## 162 162  6.450812 0.01441863 0.12607179     TRUE    FALSE      TRUE     TRUE
## 90   90  4.302679 0.01343365 0.05785187     TRUE    FALSE      TRUE     TRUE
## 18   18  4.188058 0.03056365 0.12748735     TRUE    FALSE      TRUE     TRUE
## 149 149  3.243249 0.06634585 0.17821039     TRUE     TRUE      TRUE     TRUE
## 164 164 -2.855691 0.08454000 0.18164105    FALSE     TRUE      TRUE     TRUE
## 16   16 -2.803489 0.07241620 0.14821120    FALSE     TRUE      TRUE     TRUE
## 147 147  2.584024 0.02573283 0.04284913    FALSE    FALSE      TRUE     TRUE
## 143 143  2.200166 0.03420638 0.04203837    FALSE    FALSE      TRUE     TRUE
## 30   30  2.172800 0.03120105 0.03730329    FALSE    FALSE      TRUE     TRUE
## 65   65 -2.140175 0.04228760 0.04965406    FALSE     TRUE      TRUE     TRUE

Bonferroni outlier test (car):

car::outlierTest(mod0) #menguji outlier pada residual studentized dengan koreksi Bonferroni
##     rstudent unadjusted p-value Bonferroni p
## 162 6.450812         8.5823e-10   1.7165e-07
## 90  4.302679         2.6674e-05   5.3348e-03
## 18  4.188058         4.2583e-05   8.5167e-03

Plot cepat untuk Cook’s distance:

plot(cookd, type="h", main="Cook's Distance", ylab="D", xlab="Observasi")
abline(h= thr_cook, lty=2)

Tindak lanjut (opsional):

Coba refit tanpa observasi berpengaruh ekstrem dan bandingkan koefisien/fit.

Pertimbangkan robust regression (M-estimator) jika banyak outlier: MASS::rlm.

# Refit tanpa titik yang sangat berpengaruh (contoh)
to_drop <- which(cookd > thr_cook | abs(studres) > thr_res)
mod_refit <- lm(y ~ X1 + X2 + X3, data = dat[-to_drop, ])
broom::tidy(mod0)
## # A tibble: 4 × 5
##   term        estimate std.error statistic  p.value
##   <chr>          <dbl>     <dbl>     <dbl>    <dbl>
## 1 (Intercept)    2.08     0.0873     23.8  9.73e-60
## 2 X1             1.32     0.125      10.6  6.73e-21
## 3 X2            -0.758    0.123      -6.16 3.97e- 9
## 4 X3             0.577    0.0907      6.37 1.35e- 9
broom::tidy(mod_refit)
## # A tibble: 4 × 5
##   term        estimate std.error statistic  p.value
##   <chr>          <dbl>     <dbl>     <dbl>    <dbl>
## 1 (Intercept)    1.99     0.0566     35.1  2.69e-83
## 2 X1             1.44     0.0861     16.7  1.11e-38
## 3 X2            -0.676    0.0819     -8.26 2.87e-14
## 4 X3             0.501    0.0606      8.27 2.75e-14
  1. Ringkasan & Rekomendasi
  • Cek signifikansi koefisien (t-test) dan kecocokan model (F-test, R²).
  • Perhatikan multikolinearitas (VIF); bila tinggi, pertimbangkan transformasi, pengurangan variabel, atau ridge/lasso.
  • Jika heteroskedastisitas terdeteksi, gunakan SE robust untuk inferensi atau model varian (mis. WLS).
  • Lakukan seleksi model (AIC/BIC, CV) untuk keseimbangan bias–varian.
  • Evaluasi outlier/pengaruh; pertimbangkan robust regression bila perlu.
  • Validasi hasil secara substantif (masuk akal secara domain) dan replikasi (seed, pipeline).

3 Tugas 1

3.1 Praktikum (hands-on)

  1. Diagnostik lengkap OLS pada data mtcars: heteroskedastisitas (Breusch–Pagan), multikolinearitas (VIF), pengaruh (Cook’s D), normalitas (QQ-plot). Laporkan temuan & rekomendasi.

Jawab:

Import Data

library(car)
str(mtcars)
## 'data.frame':    32 obs. of  11 variables:
##  $ mpg : num  21 21 22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 ...
##  $ cyl : num  6 6 4 6 8 6 8 4 4 6 ...
##  $ disp: num  160 160 108 258 360 ...
##  $ hp  : num  110 110 93 110 175 105 245 62 95 123 ...
##  $ drat: num  3.9 3.9 3.85 3.08 3.15 2.76 3.21 3.69 3.92 3.92 ...
##  $ wt  : num  2.62 2.88 2.32 3.21 3.44 ...
##  $ qsec: num  16.5 17 18.6 19.4 17 ...
##  $ vs  : num  0 0 1 1 0 1 0 1 1 1 ...
##  $ am  : num  1 1 1 0 0 0 0 0 0 0 ...
##  $ gear: num  4 4 4 3 3 3 3 4 4 4 ...
##  $ carb: num  4 4 1 1 2 1 4 2 2 4 ...

Eksplorasi Data

library(dplyr)
library(tidyr)
library(ggplot2)

# Buat data long hanya dengan variabel numerik
dat_long <- mtcars %>%
  dplyr::select(-vs, -am) %>%   # buang variabel kategorik
  pivot_longer(-mpg, names_to = "variabel", values_to = "nilai")

# Plot
ggplot(dat_long, aes(x = nilai, y = mpg)) +
  geom_point(alpha = 0.7) +
  geom_smooth(method = "lm", se = FALSE, color = "blue") +
  facet_wrap(~ variabel, scales = "free_x") +
  labs(y = "Miles per Gallon (mpg)", 
       title = "Hubungan mpg dengan variabel numerik pada mtcars")

Interpretasi:

  • Terdapat hubungan negatif antara carb dengan mpg

  • Terdapat hubungan negatif antara cyl dengan mpg

  • Terdapat hubungan negatif antara disp dengan mpg

  • Terdapat hubungan positif antara drat dengan mpg

  • Terdapat hubungan positif antara gear dengan mpg

  • Terdapat hubungan negatif antara hp dengan mpg

  • Terdapat hubungan positif antara qsec dengan mpg

  • Terdapat hubungan negatif antara wt dengan mpg

# Variabel kategorik di mtcars
vars_kat <- c("vs", "am")

# Ubah ke long format
dat_kat <- mtcars %>%
  pivot_longer(all_of(vars_kat), names_to = "variabel", values_to = "kategori")

# Boxplot untuk semua variabel kategorik
ggplot(dat_kat, aes(x = factor(kategori), y = mpg)) +
  geom_boxplot(fill = "skyblue", alpha = 0.6) +
  facet_wrap(~ variabel, scales = "free_x") +
  labs(x = "Kategori", y = "Miles per Gallon (mpg)",
       title = "Boxplot mpg terhadap variabel kategorik pada mtcars")

Interpretasi:

  • am = 1 cenderung memiliki mpg lebih tinggi dibandingkan am = 0.

  • carb = 1 dan carb = 2 cenderung memiliki mpg lebih tinggi dibandingkan kategori lainnya.

  • Semakin besar cyl, maka mpg semakin rendah.

  • gear = 4 cenderung memiliki mpg lebih tinggi dibanding dengan gear = 3 atau gear = 5.

  • vs = 1 memiliki mpg lebih tinggi dibandingkan vs = 0.

Estimasi OLS & Ringkasan

m1 <- lm(mpg ~ cyl+disp+hp+drat+wt+qsec+factor(vs)+factor(am)+gear+carb, data = mtcars)
summary(m1)
## 
## Call:
## lm(formula = mpg ~ cyl + disp + hp + drat + wt + qsec + factor(vs) + 
##     factor(am) + gear + carb, data = mtcars)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.4506 -1.6044 -0.1196  1.2193  4.6271 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)  
## (Intercept) 12.30337   18.71788   0.657   0.5181  
## cyl         -0.11144    1.04502  -0.107   0.9161  
## disp         0.01334    0.01786   0.747   0.4635  
## hp          -0.02148    0.02177  -0.987   0.3350  
## drat         0.78711    1.63537   0.481   0.6353  
## wt          -3.71530    1.89441  -1.961   0.0633 .
## qsec         0.82104    0.73084   1.123   0.2739  
## factor(vs)1  0.31776    2.10451   0.151   0.8814  
## factor(am)1  2.52023    2.05665   1.225   0.2340  
## gear         0.65541    1.49326   0.439   0.6652  
## carb        -0.19942    0.82875  -0.241   0.8122  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.65 on 21 degrees of freedom
## Multiple R-squared:  0.869,  Adjusted R-squared:  0.8066 
## F-statistic: 13.93 on 10 and 21 DF,  p-value: 3.793e-07
tidy(m1,conf.int= TRUE)
## # A tibble: 11 × 7
##    term        estimate std.error statistic p.value conf.low conf.high
##    <chr>          <dbl>     <dbl>     <dbl>   <dbl>    <dbl>     <dbl>
##  1 (Intercept)  12.3      18.7        0.657  0.518  -26.6      51.2   
##  2 cyl          -0.111     1.05      -0.107  0.916   -2.28      2.06  
##  3 disp          0.0133    0.0179     0.747  0.463   -0.0238    0.0505
##  4 hp           -0.0215    0.0218    -0.987  0.335   -0.0668    0.0238
##  5 drat          0.787     1.64       0.481  0.635   -2.61      4.19  
##  6 wt           -3.72      1.89      -1.96   0.0633  -7.65      0.224 
##  7 qsec          0.821     0.731      1.12   0.274   -0.699     2.34  
##  8 factor(vs)1   0.318     2.10       0.151  0.881   -4.06      4.69  
##  9 factor(am)1   2.52      2.06       1.23   0.234   -1.76      6.80  
## 10 gear          0.655     1.49       0.439  0.665   -2.45      3.76  
## 11 carb         -0.199     0.829     -0.241  0.812   -1.92      1.52

1.1. heteroskedastisitas (Breusch–Pagan)

  1. Hipotesis

H0 : tidak ada heteroskedastisitas (varian residual konstan/homoskedastis)

H1 : ada heteroskedastisitas (varian residual tidak konstan)

  1. Statistik Uji
bp <- bptest(m1)
bp
## 
##  studentized Breusch-Pagan test
## 
## data:  m1
## BP = 14.914, df = 10, p-value = 0.1352
  1. Kesimpulan
## Karena p-value: 0.135 lebih dari atau sama dengan 0.05. Maka H0 tidak ditolak, sehingga tidak ada heteroskedastisitas

1.2. Multikolinearitas (VIF)

vif_val <- vif(m1)

vif_tbl <- tibble(
  variable = names(vif_val),
  VIF = as.numeric(vif_val)
) %>%
  arrange(desc(VIF))

vif_tbl
## # A tibble: 10 × 2
##    variable     VIF
##    <chr>      <dbl>
##  1 disp       21.6 
##  2 cyl        15.4 
##  3 wt         15.2 
##  4 hp          9.83
##  5 carb        7.91
##  6 qsec        7.53
##  7 gear        5.36
##  8 factor(vs)  4.97
##  9 factor(am)  4.65
## 10 drat        3.37

Interpretasi: Nilai VIF besar (≳ 10) mengindikasikan multikolinearitas kuat. Variabel disp, cyl, wt mempunyai VIF > 10. Sehingga mengindikasikan multikolinearitas yang kuat. Variabel disp akan dihapus.

m2 <- lm(mpg ~ . -disp, data = mtcars)

vif_val2 <- vif(m2)

vif_tbl2 <- tibble(
  variable = names(vif_val2),
  VIF = as.numeric(vif_val2)
) %>%
  arrange(desc(VIF))

vif_tbl2
## # A tibble: 9 × 2
##   variable   VIF
##   <chr>    <dbl>
## 1 cyl      14.3 
## 2 hp        7.12
## 3 qsec      6.91
## 4 wt        6.19
## 5 gear      5.32
## 6 vs        4.92
## 7 am        4.65
## 8 carb      4.31
## 9 drat      3.33

Interpretasi: Setelah disp dihapus, variabel cyl mempunyai VIF > 10. Sehingga mengindikasikan multikolinearitas yang kuat. Variabel cyl akan dihapus.

m3 <- lm(mpg ~ . -disp -cyl, data = mtcars)

vif_val3 <- vif(m3)

vif_tbl3 <- tibble(
  variable = names(vif_val3),
  VIF = as.numeric(vif_val3)
) %>%
  arrange(desc(VIF))

vif_tbl3
## # A tibble: 8 × 2
##   variable   VIF
##   <chr>    <dbl>
## 1 wt        6.05
## 2 hp        6.02
## 3 qsec      5.92
## 4 gear      4.69
## 5 carb      4.29
## 6 am        4.29
## 7 vs        4.27
## 8 drat      3.11

Interpretasi: Setelah cyl dihapus, semua variabel memiliki VIF < 10. Sehingga model terbebas dari multikolinearitas

1.3. Pengaruh (Cook’s D)

cookd <- cooks.distance(m3)
thr_cook <- 4/nrow(mtcars)

flag <- data.frame(
  id = 1:nrow(mtcars),
  car=mtcars$car,
  cookd = cookd
  ) %>%
  mutate(
    flag_cook = cookd > thr_cook,
    ) %>%
 arrange(desc(cookd))

head(flag, 10)
##                   id car      cookd flag_cook
## Ford Pantera L    29   4 0.35091992      TRUE
## Merc 230           9   2 0.33030351      TRUE
## Lotus Europa      28   2 0.16327114      TRUE
## Chrysler Imperial 17   4 0.14434739      TRUE
## Toyota Corolla    20   1 0.10934157     FALSE
## Maserati Bora     31   8 0.10783468     FALSE
## Datsun 710         3   1 0.07329257     FALSE
## Fiat 128          18   1 0.07048882     FALSE
## Volvo 142E        32   2 0.06851912     FALSE
## Valiant            6   1 0.04680887     FALSE
plot(cookd, type="h", main="Cook's Distance", ylab="D", xlab="Observasi")
abline(h= thr_cook, lty=2, col="red")

Interpretasi: Ford Pantera L, Merc 230, Lotus Europa, Chrysler Imperial memiliki Cook’s Distance tinggi, sehingga berpengaruh terhadap model regresi. Titik berpengaruh ini perlu diperiksa lebih lanjut atau ditangani dengan pendekatan robust regression bila diperlukan.

# Model tanpa titik berpengaruh
mtcars_no_out <- mtcars[-c(9, 17, 28, 29), ]  
m4 <- lm(mpg ~ . -disp -cyl, data = mtcars_no_out)
summary(m4)
## 
## Call:
## lm(formula = mpg ~ . - disp - cyl, data = mtcars_no_out)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.4398 -1.5273 -0.0027  1.1378  4.2436 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)  
## (Intercept) -18.25532   21.70599  -0.841   0.4108  
## hp            0.01297    0.01969   0.659   0.5179  
## drat          2.40851    1.65495   1.455   0.1619  
## wt           -3.60201    1.28552  -2.802   0.0114 *
## qsec          2.03098    1.03338   1.965   0.0642 .
## vs           -3.14638    2.40718  -1.307   0.2068  
## am            1.10451    1.85774   0.595   0.5592  
## gear          1.74121    1.57344   1.107   0.2823  
## carb         -0.81296    0.69539  -1.169   0.2568  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.299 on 19 degrees of freedom
## Multiple R-squared:  0.8959, Adjusted R-squared:  0.852 
## F-statistic: 20.43 on 8 and 19 DF,  p-value: 8.687e-08
# Bandingkan koefisien
coef(m3)
## (Intercept)          hp        drat          wt        qsec          vs 
## 13.80810376 -0.01225158  0.88893522 -2.60967758  0.63983256  0.08786399 
##          am        gear        carb 
##  2.42417670  0.69389707 -0.61286048
coef(m4)
##  (Intercept)           hp         drat           wt         qsec           vs 
## -18.25531997   0.01297237   2.40850571  -3.60200564   2.03098426  -3.14638244 
##           am         gear         carb 
##   1.10450897   1.74121199  -0.81295692

Interpretasi: Karena koefisien berubah drastis (arah hubungan berubah dari positif ke negatif atau sebaliknya), sehingga model sangat dipengaruhi titik tersebut. Dapat diperiksa apakah input data tersebut benar/tidak lalu dapat dipertimbangkan menggunakan robust regression.

1.4. Normalitas (QQ-plot)

plot(m3, which=2, id.n = 0)

Interpretasi: QQ plot menunjukkan bahwa sebagian besar residual mengikuti garis diagonal, sehingga asumsi normalitas residual terpenuhi.

summary(m3)
## 
## Call:
## lm(formula = mpg ~ . - disp - cyl, data = mtcars)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.8187 -1.3903 -0.3045  1.2269  4.5183 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)  
## (Intercept) 13.80810   12.88582   1.072   0.2950  
## hp          -0.01225    0.01649  -0.743   0.4650  
## drat         0.88894    1.52061   0.585   0.5645  
## wt          -2.60968    1.15878  -2.252   0.0342 *
## qsec         0.63983    0.62752   1.020   0.3185  
## vs           0.08786    1.88992   0.046   0.9633  
## am           2.42418    1.91227   1.268   0.2176  
## gear         0.69390    1.35294   0.513   0.6129  
## carb        -0.61286    0.59109  -1.037   0.3106  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.566 on 23 degrees of freedom
## Multiple R-squared:  0.8655, Adjusted R-squared:  0.8187 
## F-statistic:  18.5 on 8 and 23 DF,  p-value: 2.627e-08
  1. Robust SE (HC3) vs WLS pada data simulasi heteroskedastik; bandingkan standar error dan p-value.
library(dplyr)
library(sandwich)
library(lmtest)

set.seed(123)
n <- 400
x1 <- runif(n, 0, 10)
x2 <- rnorm(n, 5, 2)
sigma_i <- 0.6 + 0.2 * x1  # var naik dengan x1
err <- rnorm(n, 0, sigma_i)
y <- 2 + 1.5*x1 - 1.2*x2 + err

sim <- tibble(y, x1, x2, w = 1/sigma_i^2)

fit_ols <- lm(y ~ x1 + x2, data = sim)
fit_wls <- lm(y ~ x1 + x2, data = sim, weights = w)

# --- perbandingan standar error & p-value
comp <- tibble(
  term = names(coef(fit_ols)),
  OLS_SE = coef(summary(fit_ols))[, "Std. Error"],
  Robust_SE = sqrt(diag(vcovHC(fit_ols, type = "HC3"))),
  WLS_SE = coef(summary(fit_wls))[, "Std. Error"],
  OLS_p = coef(summary(fit_ols))[, "Pr(>|t|)"],
  Robust_p = coeftest(fit_ols, vcovHC(fit_ols, type = "HC3"))[, 4],

  WLS_p = coef(summary(fit_wls))[, "Pr(>|t|)"]
)

comp
## # A tibble: 3 × 7
##   term        OLS_SE Robust_SE WLS_SE     OLS_p  Robust_p     WLS_p
##   <chr>        <dbl>     <dbl>  <dbl>     <dbl>     <dbl>     <dbl>
## 1 (Intercept) 0.281     0.267  0.191  5.07e- 11 5.65e- 12 4.04e- 18
## 2 x1          0.0300    0.0313 0.0267 1.13e-175 1.12e-169 3.75e-194
## 3 x2          0.0435    0.0466 0.0333 7.83e- 94 2.83e- 86 3.64e-124

Interpretasi: Semua variabel signifikan (p-value <0.05). SE WLS paling kecil sehingga WLS memberikan estimasi terbaik.

  1. Spline vs polinomial: modelkan mpg ~ wt dan bandingkan AIC/BIC + 5-fold CV.
dat1 <- mtcars %>%
  as_tibble(rownames = "car") %>%
  mutate(across(c(mpg, disp, hp, wt, qsec), as.double))

AIC

fit_poly <-lm(mpg~ poly(wt, 2, raw= TRUE) +hp+ disp, data= dat1)
fit_spl <-lm(mpg~ bs(wt, df= 5) + hp+ disp,data= dat1)
AIC(fit_poly,fit_spl) %>% arrange(AIC)
##          df      AIC
## fit_poly  6 150.7305
## fit_spl   9 154.3323

BIC

BIC(fit_poly,fit_spl) %>% arrange(BIC)
##          df      BIC
## fit_poly  6 159.5250
## fit_spl   9 167.5239
set.seed(123)
k <-5
cv_res <-crossv_kfold(dat1,k= k)
 
rmse_fun <-function(train,test,formula){
  m <-lm(formula,data= as.data.frame(train))
  sqrt(mean((as.data.frame(test)$mpg-predict(m,newdata=as.data.frame(test)))^2))
}

forms<-list(
  Poly= mpg~ poly(wt, 2, raw= TRUE) + hp+ disp,
  Spline= mpg ~ bs(wt, df= 5)+ hp+ disp
  )

cv_tbl <-map_dfr(names(forms),function(nm){
  fml<-forms[[nm]]
  tibble(model= nm,fold = seq_len(k),
         rmse= map2_dbl(cv_res$train,cv_res$test,~ rmse_fun(.x,.y,fml)))
  })

cv_tbl %>%
  group_by(model) %>%
  summarise(mean_rmse= mean(rmse), sd_rmse= sd(rmse)) %>%
  arrange(mean_rmse)
## # A tibble: 2 × 3
##   model  mean_rmse sd_rmse
##   <chr>      <dbl>   <dbl>
## 1 Poly        2.35   0.980
## 2 Spline      2.74   0.807
library(dplyr)
library(tidyr)

# AIC & BIC
ic_tbl <- tibble(
  Model = c("Poly", "Spline"),
  AIC = c(AIC(fit_poly), AIC(fit_spl)),
  BIC = c(BIC(fit_poly), BIC(fit_spl))
)

# CV RMSE
cv_summary <- cv_tbl %>%
  group_by(model) %>%
  summarise(
    mean_RMSE = mean(rmse),
    sd_RMSE = sd(rmse),
    .groups = "drop"
  ) %>%
  rename(Model = model)

# Gabungkan semua
final_tbl <- ic_tbl %>%
  left_join(cv_summary, by = "Model")

final_tbl
## # A tibble: 2 × 5
##   Model    AIC   BIC mean_RMSE sd_RMSE
##   <chr>  <dbl> <dbl>     <dbl>   <dbl>
## 1 Poly    151.  160.      2.35   0.980
## 2 Spline  154.  168.      2.74   0.807

Interpretasi: Model Polynomial lebih baik daripada Spline, karena memiliki AIC dan BIC lebih kecil serta mean RMSE cross-validation lebih rendah.

  1. AR(1) GLS: gunakan kode GLS di atas, ganti x dengan sinyal musiman (mis. sin(2pit/12)), interpretasikan ρ; bandingkan SE OLS vs GLS.
library(tibble)

set.seed(42)
T <- 400
t <- 1:T

rho <- 0.6
u <- rnorm(T, 0, 1)
err <- numeric(T)
for (i in 2:T) err[i] <- rho * err[i-1] + u[i]

# --- x jadi sinyal musiman
x <- sin(2*pi*t/12)
y <- 1 + 2*x + err

dar1 <- tibble(t = t, x = x, y = y)
head(dar1)
## # A tibble: 6 × 3
##       t        x     y
##   <int>    <dbl> <dbl>
## 1     1 5   e- 1  2   
## 2     2 8.66e- 1  2.17
## 3     3 1   e+ 0  3.02
## 4     4 8.66e- 1  3.38
## 5     5 5   e- 1  2.79
## 6     6 1.22e-16  1.37

OLS vs GLS

library(nlme)

# OLS biasa
fit_ols_ar1 <- lm(y ~ x, data = dar1)

# GLS dengan korelasi AR(1) di residual
fit_gls_ar1 <- gls(y ~ x, data = dar1,
                   correlation = corAR1(form = ~ t))
summary(fit_ols_ar1)
## 
## Call:
## lm(formula = y ~ x, data = dar1)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.0510 -0.7985  0.0059  0.7792  3.6370 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.96844    0.05664    17.1   <2e-16 ***
## x            1.83909    0.07995    23.0   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.133 on 398 degrees of freedom
## Multiple R-squared:  0.5707, Adjusted R-squared:  0.5697 
## F-statistic: 529.2 on 1 and 398 DF,  p-value: < 2.2e-16
summary(fit_gls_ar1)
## Generalized least squares fit by REML
##   Model: y ~ x 
##   Data: dar1 
##        AIC      BIC    logLik
##   1109.051 1124.997 -550.5254
## 
## Correlation Structure: AR(1)
##  Formula: ~t 
##  Parameter estimate(s):
##       Phi 
## 0.5440471 
## 
## Coefficients:
##                 Value Std.Error   t-value p-value
## (Intercept) 0.9724752 0.1043172  9.322293       0
## x           1.8390741 0.1132805 16.234685       0
## 
##  Correlation: 
##   (Intr)
## x -0.013
## 
## Standardized residuals:
##          Min           Q1          Med           Q3          Max 
## -3.566387570 -0.705789893  0.001620549  0.681785092  3.195184639 
## 
## Residual standard error: 1.137024 
## Degrees of freedom: 400 total; 398 residual
se_ols <- coef(summary(fit_ols_ar1))[, "Std. Error"]
se_gls <- summary(fit_gls_ar1)$tTable[, "Std.Error"]
cbind(SE_OLS = se_ols, SE_GLS = se_gls)
##                 SE_OLS    SE_GLS
## (Intercept) 0.05663725 0.1043172
## x           0.07994741 0.1132805

Nilai SE pada model GLS lebih besar dibandingkan dengan model OLS. Artinya model GLS lebih valid karena memperhitungkan struktur autokorelasi di error. Sehingga, SE mencerminkan ketidakpastian yang sebenarnya ada.

Diagnostik Serial Correlation

par(mfrow = c(1,2))
acf(resid(fit_ols_ar1), main = "ACF Residuals (OLS)")
acf(resid(fit_gls_ar1, type = "normalized"), main = "ACF Residuals (GLS)")

- OLS : Residual OLS menunjukkan autokorelasi positif yang signifikan pada lag-lag awal (melewati garis biru).

  • GLS : Sebagian besar batang dalam batas signifikansi, menunjukkan GLS berhasil mengoreksi autokorelasi residual.
par(mfrow = c(1,1))
  1. Hipotesis

H0 : Tidak ada autokorelasi residual

H1 : Ada autokorelasi residual

  1. Statistik Uji
dw=lmtest::dwtest(fit_ols_ar1)
dw
## 
##  Durbin-Watson test
## 
## data:  fit_ols_ar1
## DW = 0.92047, p-value < 2.2e-16
## alternative hypothesis: true autocorrelation is greater than 0
dw$p.value
## [1] 8.988689e-28
  1. Kesimpulan Karena p-value = 2.2e-16 < 0.05 maka H0 ditolak, artinya ada autokorelasi residual. DW < 2 mengindikasikan autokorelasi positif.
# Estimasi rho
rho_hat <- coef(fit_gls_ar1$modelStruct$corStruct, unconstrained = FALSE)
cat("Estimated rho =", rho_hat, "\n")
## Estimated rho = 0.5440471

ρ^ = 0.544 artinya residual saat ini masih berkorelasi positif ~54% dengan residual sebelumnya.

4 Latihan 2

4.1 Pertumbuhan bakteri mengikuti kurva logistik

set.seed(42)
 library(ggplot2)
 library(dplyr)
 library(tidyr)
# Parameter
K <- 1.0  # kapasitas maksimum (misal proporsi)
alpha <- -3.0  # intercept logit
beta <- 0.9  # laju pertumbuhan
sigma <- 0.03  # noise

# Data
t <- seq(0, 8, length.out = 120)
mu <- K/(1 + exp(-(alpha + beta*t)))
y <- mu + rnorm(length(t), sd = sigma)
 
logistic_df <- tibble(t= t, y= y, mu = mu)
 
# Plot
ggplot(logistic_df, aes(t, y)) +
 geom_point(alpha = 0.6) +
 geom_line(aes(y = mu), linewidth = 1) +
 labs(title = "Pertumbuhan Bakteri (Kurva Logistik)",
 x= "Waktu (t)", y= "Ukuran/Proporsi Populasi") +
 theme_minimal()

4.2 Respon obat mengikuti fungsi Michaelis-Menten

# Parameter
Vmax <- 2.0
Km <- 1.2
sigma <- 0.06
 
# Data
X <- seq(0, 6, length.out = 80)
mu <- (Vmax * X) / (Km + X)
Y <- mu + rnorm(length(X), sd = sigma)

mm_df <- tibble(X = X, Y = Y, mu = mu)

ggplot(mm_df, aes(X, Y)) +
 geom_point(alpha = 0.6) +
 geom_line(aes(y = mu), linewidth = 1) +
 labs(title = "Respon Obat (Michaelis–Menten)",
 x= "Konsentrasi Obat (X)", y= "Respon (Y)") +
 theme_minimal()

4.3 Ekonomi: diminishing return mengikuti bentuk kurva nonlinear

# Parameter
A <- 5
alpha <- 0.5 # < 1 → diminishing marginal return
sigma <- 0.3

# Data
X <- seq(0, 100, length.out = 120)
mu <- A * X^alpha
Y <- mu + rnorm(length(X), sd = sigma)

dr_df <- tibble(X = X, Y = Y, mu = mu)

ggplot(dr_df, aes(X, Y)) +
 geom_point(alpha = 0.6) +
 geom_line(aes(y = mu), linewidth = 1) +
 labs(title = "Diminishing Return: Y = A * X^{alpha}, 0<alpha<1",
 x= "Input/Modal (X)", y= "Output (Y)") +
 theme_minimal()

4.4 Estimasi parameter menggunakan metode iteratif (Gauss-Newton, Levenberg-Marquardt).

set.seed(123)
library(ggplot2)
library(dplyr)
library(tidyr)

beta0_true <- 2.0
beta1_true <- 0.3
sigma_true <- 0.4

# Small dataset agar mudah ditelusuri langkah demi langkah
x <- c(0, 1, 2, 3, 4, 5)
f_true <- beta0_true * exp(beta1_true * x)
y <- f_true + rnorm(length(x), sd = sigma_true)

dat <- tibble(x, y, f_true)
dat
## # A tibble: 6 × 3
##       x     y f_true
##   <dbl> <dbl>  <dbl>
## 1     0  1.78   2   
## 2     1  2.61   2.70
## 3     2  4.27   3.64
## 4     3  4.95   4.92
## 5     4  6.69   6.64
## 6     5  9.65   8.96
ggplot(dat, aes(x, y)) +
 geom_point(size=2) +
 geom_line(aes(y = f_true), linewidth = 0.9) +
 labs(title="Data Eksponensial (dengan noise) & Kurva Sejati",
 x="X", y="Y") +
 theme_minimal()

f_exp <- function(x, theta){ # theta = c(beta0, beta1)
 beta0 <- theta[1]; beta1 <- theta[2]
 beta0 * exp(beta1 * x)
}

resid_vec <- function(x, y, theta){
 y- f_exp(x, theta)
}

J_exp <- function(x, theta){
 beta0 <- theta[1]; beta1 <- theta[2]
 e <- exp(beta1 * x)
 cbind( d_beta0 = e,
 d_beta1 = beta0 * x * e)
}
gn_step <- function(x, y, theta){
 r <- resid_vec(x, y, theta)
 J <- J_exp(x, theta)
 # Normal equations
 lhs <- crossprod(J, J)
 rhs <- crossprod(J, r)
 # J^T J
 # J^T r
 Delta <- solve(lhs, rhs) # langkah GN
list(Delta = as.numeric(Delta),
 theta_new = theta + as.numeric(Delta),
 r= r, J= J, lhs = lhs, rhs = rhs,
 SSE = sum(r^2))
}

theta0 <- c(beta0 = 1.0, beta1 = 0.0) # tebakan awal sengaja "kasar"
 step1 <- gn_step(x, y, theta0)
 step1
## $Delta
## [1] 0.2542352 1.4943019
## 
## $theta_new
##    beta0    beta1 
## 1.254235 1.494302 
## 
## $r
## [1] 0.7758097 1.6076466 3.2677209 3.9474096 5.6919489 8.6494041
## 
## $J
##      d_beta0 d_beta1
## [1,]       1       0
## [2,]       1       1
## [3,]       1       2
## [4,]       1       3
## [5,]       1       4
## [6,]       1       5
## 
## $lhs
##         d_beta0 d_beta1
## d_beta0       6      15
## d_beta1      15      55
## 
## $rhs
##             [,1]
## d_beta0 23.93994
## d_beta1 86.00013
## 
## $SSE
## [1] 136.6569