library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.2
## ✔ ggplot2   4.0.0     ✔ tibble    3.3.0
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.1.0     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(broom)
library(knitr)
library(gridExtra)
## 
## Attaching package: 'gridExtra'
## 
## The following object is masked from 'package:dplyr':
## 
##     combine

Pendahuluan

Pada analisis ini kita akan membandingkan dua model regresi linear sederhana yang menghubungkan konsentrasi Ozone (variabel dependen) dengan dua variabel prediktor terpisah: Temp (suhu) dan Wind (kecepatan angin). Tujuan: menilai model mana yang lebih baik menurut metrik MSE, AIC, dan Adjusted R-squared, serta memberikan insight.

Data

Kita pakai dataset airquality (dataset built-in R). Dataset ini berisi pengukuran kualitas udara di New York (bulan Mei–September 1973) termasuk Ozone, Temp, Wind, Solar.R, dll.

# Load the built-in airquality data set; it is already a data.frame,
# so it is used directly under a shorter working name.
data(airquality, package = "datasets")
aq <- airquality

# Quick per-column overview (quartiles, mean and NA counts)
summary(aq)
##      Ozone           Solar.R           Wind             Temp      
##  Min.   :  1.00   Min.   :  7.0   Min.   : 1.700   Min.   :56.00  
##  1st Qu.: 18.00   1st Qu.:115.8   1st Qu.: 7.400   1st Qu.:72.00  
##  Median : 31.50   Median :205.0   Median : 9.700   Median :79.00  
##  Mean   : 42.13   Mean   :185.9   Mean   : 9.958   Mean   :77.88  
##  3rd Qu.: 63.25   3rd Qu.:258.8   3rd Qu.:11.500   3rd Qu.:85.00  
##  Max.   :168.00   Max.   :334.0   Max.   :20.700   Max.   :97.00  
##  NA's   :37       NA's   :7                                       
##      Month            Day      
##  Min.   :5.000   Min.   : 1.0  
##  1st Qu.:6.000   1st Qu.: 8.0  
##  Median :7.000   Median :16.0  
##  Mean   :6.993   Mean   :15.8  
##  3rd Qu.:8.000   3rd Qu.:23.0  
##  Max.   :9.000   Max.   :31.0  
## 
# Clean the data: keep only the response and the two predictors, then
# discard every row that has a missing value in any of those columns.
aq_clean <- tidyr::drop_na(dplyr::select(aq, Ozone, Temp, Wind))

# Preview the first ten complete rows as a formatted table
knitr::kable(head(aq_clean, n = 10))
Ozone Temp Wind
41 67 7.4
36 72 8.0
12 74 12.6
18 62 11.5
28 66 14.9
23 65 8.6
19 59 13.8
8 61 20.1
7 74 6.9
16 69 9.7

Model 1 — Ozone ~ Temp

## 1. Model Ozone ~ Temp

# Simple linear regression of ozone on temperature.
# lm() drops the rows with missing Ozone on its own (37 rows, as the summary
# reports), so fitting on the raw data uses the same rows as aq_clean.
model_temp <- lm(formula = Ozone ~ Temp, data = airquality)
summary(model_temp)
## 
## Call:
## lm(formula = Ozone ~ Temp, data = airquality)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -40.729 -17.409  -0.587  11.306 118.271 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -146.9955    18.2872  -8.038 9.37e-13 ***
## Temp           2.4287     0.2331  10.418  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 23.71 on 114 degrees of freedom
##   (37 observations deleted due to missingness)
## Multiple R-squared:  0.4877, Adjusted R-squared:  0.4832 
## F-statistic: 108.5 on 1 and 114 DF,  p-value: < 2.2e-16
# Base-graphics scatter plot of ozone against temperature,
# with the fitted regression line drawn on top in red.
with(airquality, plot(
  Temp, Ozone,
  pch = 19, col = "skyblue",
  xlab = "Temperature (Temp)",
  ylab = "Ozone",
  main = "Scatter Plot Ozone ~ Temp"
))
abline(model_temp, lwd = 2, col = "red")

# Fit metrics for the temperature model: in-sample MSE (mean of the squared
# residuals), AIC, and adjusted R-squared taken from the model summary.
mse_temp <- mean(residuals(model_temp)^2)
aic_temp <- AIC(model_temp)
adjr2_temp <- summary(model_temp)$adj.r.squared

# Print the three metrics in the order MSE, AIC, adjusted R-squared
mse_temp
aic_temp
adjr2_temp
## [1] 552.6715
## [1] 1067.706
## [1] 0.4832134

Plot: Ozone vs Temp + garis regresi

# ggplot2 version of the same relationship on the NA-free data:
# raw points plus a linear-model smoother with its standard-error band.
ggplot(data = aq_clean, mapping = aes(Temp, Ozone)) +
  geom_point() +
  geom_smooth(se = TRUE, method = "lm") +
  labs(
    x = "Temperature (F)",
    y = "Ozone (ppb)",
    title = "Ozone vs Temp",
    subtitle = "Model: Ozone ~ Temp"
  ) +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

Model 2 — Ozone ~ Wind

# Simple linear regression of ozone on wind speed.
# lm() drops the rows with missing Ozone on its own (37 rows, as the summary
# reports), so fitting on the raw data uses the same rows as aq_clean.
model_wind <- lm(formula = Ozone ~ Wind, data = airquality)
summary(model_wind)
## 
## Call:
## lm(formula = Ozone ~ Wind, data = airquality)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -51.572 -18.854  -4.868  15.234  90.000 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  96.8729     7.2387   13.38  < 2e-16 ***
## Wind         -5.5509     0.6904   -8.04 9.27e-13 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 26.47 on 114 degrees of freedom
##   (37 observations deleted due to missingness)
## Multiple R-squared:  0.3619, Adjusted R-squared:  0.3563 
## F-statistic: 64.64 on 1 and 114 DF,  p-value: 9.272e-13
# Base-graphics scatter plot of ozone against wind speed,
# with the fitted regression line drawn on top in red.
with(airquality, plot(
  Wind, Ozone,
  pch = 19, col = "lightgreen",
  xlab = "Wind",
  ylab = "Ozone",
  main = "Scatter Plot Ozone ~ Wind"
))
abline(model_wind, lwd = 2, col = "red")

# Fit metrics for the wind model: in-sample MSE (mean of the squared
# residuals), AIC, and adjusted R-squared taken from the model summary.
mse_wind <- mean(residuals(model_wind)^2)
aic_wind <- AIC(model_wind)
adjr2_wind <- summary(model_wind)$adj.r.squared

# Print the three metrics in the order MSE, AIC, adjusted R-squared
mse_wind
aic_wind
adjr2_wind
## [1] 688.4398
## [1] 1093.187
## [1] 0.3562605

Plot: Ozone vs Wind + garis regresi

# ggplot2 version of the same relationship on the NA-free data:
# raw points plus a linear-model smoother with its standard-error band.
ggplot(data = aq_clean, mapping = aes(Wind, Ozone)) +
  geom_point() +
  geom_smooth(se = TRUE, method = "lm") +
  labs(
    x = "Wind (mph)",
    y = "Ozone (ppb)",
    title = "Ozone vs Wind",
    subtitle = "Model: Ozone ~ Wind"
  ) +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

Perbandingan Model

# Comparison table: one row per model, one column per metric.
# The columns are assembled as named vectors first so that the row order
# (Temp model, then Wind model) is stated once per metric.
model_labels <- c("Ozone ~ Temp", "Ozone ~ Wind")
mse_values <- c(mse_temp, mse_wind)
aic_values <- c(aic_temp, aic_wind)
adjr2_values <- c(adjr2_temp, adjr2_wind)

comparison <- data.frame(
  Model = model_labels,
  MSE = mse_values,
  AIC = aic_values,
  Adjusted_R2 = adjr2_values
)

knitr::kable(comparison, caption = "Perbandingan Kinerja Model Regresi")
Perbandingan Kinerja Model Regresi
Model MSE AIC Adjusted_R2
Ozone ~ Temp 552.6715 1067.706 0.4832134
Ozone ~ Wind 688.4398 1093.187 0.3562605

Insight dan Pendapat saya

  1. MSE (Mean Squared Error) Model dengan nilai MSE lebih kecil dianggap memberikan prediksi yang lebih akurat. Model Ozone ~ Temp memiliki MSE 552.67, lebih kecil daripada model Ozone ~ Wind (688.44), sehingga model Temp lebih akurat menurut metrik ini.

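  2. AIC (Akaike Information Criterion) Semakin kecil nilai AIC, semakin baik model dalam hal keseimbangan antara goodness-of-fit dan kompleksitas model. AIC model Ozone ~ Temp (1067.71) lebih kecil daripada AIC model Ozone ~ Wind (1093.19), sehingga model Temp lebih baik menurut AIC.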

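  3. Adjusted R-squared Nilai Adjusted R-squared menunjukkan proporsi variasi Ozone yang dapat dijelaskan oleh variabel prediktor. Nilai yang lebih tinggi menandakan model lebih baik menjelaskan data. Model Ozone ~ Temp menjelaskan sekitar 48% variasi Ozone (Adjusted R-squared 0.483), sedangkan model Ozone ~ Wind hanya sekitar 36% (0.356).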

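  4. Interpretasi Koefisien Pada model Ozone ~ Temp, koefisien Temp sebesar 2.43 berarti setiap kenaikan suhu 1 °F berkaitan dengan kenaikan rata-rata Ozone sekitar 2.43 ppb. Pada model Ozone ~ Wind, koefisien Wind sebesar -5.55 berarti setiap kenaikan kecepatan angin 1 mph berkaitan dengan penurunan rata-rata Ozone sekitar 5.55 ppb. Kedua koefisien signifikan secara statistik (p < 0.001).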

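  5. Kesimpulan Praktis Model Ozone ~ Temp lebih baik daripada model Ozone ~ Wind pada ketiga metrik (MSE dan AIC lebih kecil, Adjusted R-squared lebih besar), sehingga suhu adalah prediktor tunggal yang lebih baik untuk Ozone pada data ini.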