library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.2
## ✔ ggplot2 4.0.0 ✔ tibble 3.3.0
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.1.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(broom)
library(knitr)
library(gridExtra)
##
## Attaching package: 'gridExtra'
##
## The following object is masked from 'package:dplyr':
##
## combine
Pada analisis ini kita akan membandingkan dua model regresi linear
sederhana yang menghubungkan konsentrasi Ozone (variabel
dependen) dengan dua variabel prediktor terpisah: Temp
(suhu) dan Wind (kecepatan angin). Tujuan: menilai model
mana yang lebih baik menurut metrik MSE, AIC, dan Adjusted R-squared,
serta memberikan insight.
Kita pakai dataset airquality (dataset built-in R).
Dataset ini berisi pengukuran kualitas udara di New York (bulan
Mei–September 1973) termasuk Ozone, Temp,
Wind, Solar.R, dll.
# Load the built-in dataset and keep a working copy under a shorter name.
data("airquality")
aq <- airquality # it is already a data.frame, so no conversion is needed
# Quick per-column overview.
summary(object = aq)
## Ozone Solar.R Wind Temp
## Min. : 1.00 Min. : 7.0 Min. : 1.700 Min. :56.00
## 1st Qu.: 18.00 1st Qu.:115.8 1st Qu.: 7.400 1st Qu.:72.00
## Median : 31.50 Median :205.0 Median : 9.700 Median :79.00
## Mean : 42.13 Mean :185.9 Mean : 9.958 Mean :77.88
## 3rd Qu.: 63.25 3rd Qu.:258.8 3rd Qu.:11.500 3rd Qu.:85.00
## Max. :168.00 Max. :334.0 Max. :20.700 Max. :97.00
## NA's :37 NA's :7
## Month Day
## Min. :5.000 Min. : 1.0
## 1st Qu.:6.000 1st Qu.: 8.0
## Median :7.000 Median :16.0
## Mean :6.993 Mean :15.8
## 3rd Qu.:8.000 3rd Qu.:23.0
## Max. :9.000 Max. :31.0
##
# Clean the data: keep only the three modelling columns, then drop every
# row that has a missing value in any of them.
aq_clean <- tidyr::drop_na(dplyr::select(aq, Ozone, Temp, Wind))
# Show the first 10 cleaned rows as a table.
aq_clean %>%
  head(10) %>%
  knitr::kable()
| Ozone | Temp | Wind |
|---|---|---|
| 41 | 67 | 7.4 |
| 36 | 72 | 8.0 |
| 12 | 74 | 12.6 |
| 18 | 62 | 11.5 |
| 28 | 66 | 14.9 |
| 23 | 65 | 8.6 |
| 19 | 59 | 13.8 |
| 8 | 61 | 20.1 |
| 7 | 74 | 6.9 |
| 16 | 69 | 9.7 |
## 1. Model Ozone ~ Temp
# Simple linear regression of Ozone on Temp.
# Fitted on the raw airquality data; lm() leaves out rows with missing values.
model_temp <- lm(formula = Ozone ~ Temp, data = airquality)
summary(model_temp)
##
## Call:
## lm(formula = Ozone ~ Temp, data = airquality)
##
## Residuals:
## Min 1Q Median 3Q Max
## -40.729 -17.409 -0.587 11.306 118.271
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -146.9955 18.2872 -8.038 9.37e-13 ***
## Temp 2.4287 0.2331 10.418 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 23.71 on 114 degrees of freedom
## (37 observations deleted due to missingness)
## Multiple R-squared: 0.4877, Adjusted R-squared: 0.4832
## F-statistic: 108.5 on 1 and 114 DF, p-value: < 2.2e-16
# Scatter plot of the raw observations with the fitted regression line on top.
plot(
  x = airquality$Temp,
  y = airquality$Ozone,
  pch = 19,
  col = "skyblue",
  xlab = "Temperature (Temp)",
  ylab = "Ozone",
  main = "Scatter Plot Ozone ~ Temp"
)
abline(model_temp, lwd = 2, col = "red")
# Performance metrics for the Temp model: MSE, AIC and adjusted R-squared.
mse_temp <- mean(model_temp$residuals^2) # mean of the squared residuals
aic_temp <- AIC(model_temp)
adjr2_temp <- summary(model_temp)[["adj.r.squared"]]
# Print the three values one after another.
mse_temp
aic_temp
adjr2_temp
## [1] 552.6715
## [1] 1067.706
## [1] 0.4832134
# ggplot2 version of the Temp scatter plot, drawn from the cleaned data,
# with a linear fit and its confidence band.
ggplot(data = aq_clean, mapping = aes(x = Temp, y = Ozone)) +
  geom_point() +
  geom_smooth(method = "lm", se = TRUE) +
  theme_minimal() +
  labs(
    title = "Ozone vs Temp",
    subtitle = "Model: Ozone ~ Temp",
    x = "Temperature (F)",
    y = "Ozone (ppb)"
  )
## `geom_smooth()` using formula = 'y ~ x'
# Simple linear regression of Ozone on Wind.
# Fitted on the raw airquality data; lm() leaves out rows with missing values.
model_wind <- lm(formula = Ozone ~ Wind, data = airquality)
summary(model_wind)
##
## Call:
## lm(formula = Ozone ~ Wind, data = airquality)
##
## Residuals:
## Min 1Q Median 3Q Max
## -51.572 -18.854 -4.868 15.234 90.000
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 96.8729 7.2387 13.38 < 2e-16 ***
## Wind -5.5509 0.6904 -8.04 9.27e-13 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 26.47 on 114 degrees of freedom
## (37 observations deleted due to missingness)
## Multiple R-squared: 0.3619, Adjusted R-squared: 0.3563
## F-statistic: 64.64 on 1 and 114 DF, p-value: 9.272e-13
# Scatter plot of the raw observations with the fitted regression line on top.
plot(
  x = airquality$Wind,
  y = airquality$Ozone,
  pch = 19,
  col = "lightgreen",
  xlab = "Wind",
  ylab = "Ozone",
  main = "Scatter Plot Ozone ~ Wind"
)
abline(model_wind, lwd = 2, col = "red")
# Performance metrics for the Wind model: MSE, AIC and adjusted R-squared.
mse_wind <- mean(model_wind$residuals^2) # mean of the squared residuals
aic_wind <- AIC(model_wind)
adjr2_wind <- summary(model_wind)[["adj.r.squared"]]
# Print the three values one after another.
mse_wind
aic_wind
adjr2_wind
## [1] 688.4398
## [1] 1093.187
## [1] 0.3562605
# ggplot2 version of the Wind scatter plot, drawn from the cleaned data,
# with a linear fit and its confidence band.
ggplot(data = aq_clean, mapping = aes(x = Wind, y = Ozone)) +
  geom_point() +
  geom_smooth(method = "lm", se = TRUE) +
  theme_minimal() +
  labs(
    title = "Ozone vs Wind",
    subtitle = "Model: Ozone ~ Wind",
    x = "Wind (mph)",
    y = "Ozone (ppb)"
  )
## `geom_smooth()` using formula = 'y ~ x'
# Comparison table: one row per model, one column per metric.
comparison <- data.frame(
  Model = c("Ozone ~ Temp", "Ozone ~ Wind"),
  MSE = c(mse_temp, mse_wind),
  AIC = c(aic_temp, aic_wind),
  Adjusted_R2 = c(adjr2_temp, adjr2_wind)
)
# Render the table with a caption.
comparison %>%
  knitr::kable(caption = "Perbandingan Kinerja Model Regresi")
| Model | MSE | AIC | Adjusted_R2 |
|---|---|---|---|
| Ozone ~ Temp | 552.6715 | 1067.706 | 0.4832134 |
| Ozone ~ Wind | 688.4398 | 1093.187 | 0.3562605 |
MSE (Mean Squared Error): model dengan nilai MSE lebih kecil dianggap memberikan prediksi yang lebih akurat. Pada tabel di atas, MSE model Ozone ~ Temp (552.67) lebih kecil daripada MSE model Ozone ~ Wind (688.44), sehingga model Temp lebih unggul pada metrik ini.
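AIC (Akaike Information Criterion): semakin kecil nilai AIC, semakin baik model dalam hal keseimbangan antara goodness-of-fit dan kompleksitas model. AIC model Ozone ~ Temp (1067.71) lebih kecil daripada AIC model Ozone ~ Wind (1093.19), sehingga model Temp kembali lebih unggul.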
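Adjusted R-squared: nilai ini menunjukkan proporsi variasi Ozone yang dapat dijelaskan oleh variabel prediktor, setelah disesuaikan dengan jumlah prediktor. Nilai yang lebih tinggi menandakan model lebih baik menjelaskan data. Model Ozone ~ Temp menjelaskan sekitar 48.3% variasi Ozone (0.483), sedangkan model Ozone ~ Wind hanya sekitar 35.6% (0.356).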
Interpretasi Koefisien
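Pada model Ozone ~ Temp, koefisien Temp sebesar 2.43 berarti setiap kenaikan suhu 1 °F diikuti kenaikan Ozone rata-rata sekitar 2.43 ppb; pada model Ozone ~ Wind, koefisien Wind sebesar -5.55 berarti setiap kenaikan kecepatan angin 1 mph diikuti penurunan Ozone rata-rata sekitar 5.55 ppb. Kedua koefisien signifikan secara statistik.
Temp adalah prediktor yang lebih baik dibanding Wind: model Ozone ~ Temp memiliki MSE dan AIC
yang lebih kecil serta Adjusted R-squared yang lebih tinggi. Seandainya Wind yang lebih unggul pada metrik
tersebut, maka angin yang lebih berpengaruh; pada data ini hal itu tidak terjadi.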