library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.1 ✔ stringr 1.5.1
## ✔ ggplot2 4.0.0 ✔ tibble 3.3.0
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.1.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(splines)
df_airquality <- datasets::airquality
head(df_airquality)
## Ozone Solar.R Wind Temp Month Day
## 1 41 190 7.4 67 5 1
## 2 36 118 8.0 72 5 2
## 3 12 149 12.6 74 5 3
## 4 18 313 11.5 62 5 4
## 5 NA NA 14.3 56 5 5
## 6 28 NA 14.9 66 5 6
# Cek apakah ada missing value
colSums(is.na(airquality))
## Ozone Solar.R Wind Temp Month Day
## 37 7 0 0 0 0
# Ganti NA pada kolom Ozone dengan median
airquality$Ozone[is.na(airquality$Ozone)] <- median(airquality$Ozone, na.rm = TRUE)
# Ganti NA pada kolom Solar.R dengan median
airquality$Solar.R[is.na(airquality$Solar.R)] <- median(airquality$Solar.R, na.rm = TRUE)
# Cek lagi apakah masih ada missing value
colSums(is.na(airquality))
## Ozone Solar.R Wind Temp Month Day
## 0 0 0 0 0 0
ggplot(df_airquality,aes(x=Solar.R, y=Ozone)) +
geom_point(alpha=0.55, color="black") +
theme_bw()
## Warning: Removed 42 rows containing missing values or values outside the scale range
## (`geom_point()`).
mod_linear = lm(Ozone~Solar.R,data=df_airquality)
summary(mod_linear)
##
## Call:
## lm(formula = Ozone ~ Solar.R, data = df_airquality)
##
## Residuals:
## Min 1Q Median 3Q Max
## -48.292 -21.361 -8.864 16.373 119.136
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 18.59873 6.74790 2.756 0.006856 **
## Solar.R 0.12717 0.03278 3.880 0.000179 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 31.33 on 109 degrees of freedom
## (42 observations deleted due to missingness)
## Multiple R-squared: 0.1213, Adjusted R-squared: 0.1133
## F-statistic: 15.05 on 1 and 109 DF, p-value: 0.0001793
ggplot(df_airquality,aes(x=Solar.R, y=Ozone)) +
geom_point(alpha=0.55, color="black") +
stat_smooth(method = "lm",
formula = y~x,lty = 1, col = "blue",se = F)+
theme_bw()
## Warning: Removed 42 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 42 rows containing missing values or values outside the scale range
## (`geom_point()`).
### Regresi Fungsi Tangga
mod_tangga = lm(Ozone ~ cut(Solar.R,5),data=df_airquality)
summary(mod_tangga)
##
## Call:
## lm(formula = Ozone ~ cut(Solar.R, 5), data = df_airquality)
##
## Residuals:
## Min 1Q Median 3Q Max
## -45.743 -20.647 -6.437 14.853 112.257
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 15.167 7.071 2.145 0.034254 *
## cut(Solar.R, 5)(72.4,138] 10.271 10.308 0.996 0.321338
## cut(Solar.R, 5)(138,203] 37.214 9.637 3.862 0.000194 ***
## cut(Solar.R, 5)(203,269] 40.576 8.702 4.663 9.13e-06 ***
## cut(Solar.R, 5)(269,334] 29.690 9.637 3.081 0.002629 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 30 on 106 degrees of freedom
## (42 observations deleted due to missingness)
## Multiple R-squared: 0.2167, Adjusted R-squared: 0.1871
## F-statistic: 7.331 on 4 and 106 DF, p-value: 2.984e-05
ggplot(df_airquality,aes(x=Solar.R, y=Ozone)) +
geom_point(alpha=0.55, color="black") +
stat_smooth(method = "lm",
formula = y~cut(x,5),
lty = 1, col = "blue",se = F)+
theme_bw()
## Warning: Removed 42 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 42 rows containing missing values or values outside the scale range
## (`geom_point()`).
Keterangan: Hasil analisi menunjukan Solar.R membagi menjadi 5
kategori. Pada garis biru menunjukan rata rata Ozone tiap interval.
mod_spline3 = lm(Ozone ~ bs(Solar.R, knots = c(5, 10, 20, 30, 40)),data=df_airquality)
summary(mod_spline3)
##
## Call:
## lm(formula = Ozone ~ bs(Solar.R, knots = c(5, 10, 20, 30, 40)),
## data = df_airquality)
##
## Residuals:
## Min 1Q Median 3Q Max
## -45.255 -18.910 -3.842 15.718 109.745
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 12.665 13.828 0.916 0.36189
## bs(Solar.R, knots = c(5, 10, 20, 30, 40))1 3.279 32.834 0.100 0.92065
## bs(Solar.R, knots = c(5, 10, 20, 30, 40))2 -22.451 51.623 -0.435 0.66454
## bs(Solar.R, knots = c(5, 10, 20, 30, 40))3 28.804 44.766 0.643 0.52136
## bs(Solar.R, knots = c(5, 10, 20, 30, 40))4 -15.621 31.049 -0.503 0.61596
## bs(Solar.R, knots = c(5, 10, 20, 30, 40))5 4.717 22.568 0.209 0.83485
## bs(Solar.R, knots = c(5, 10, 20, 30, 40))6 4.984 23.393 0.213 0.83171
## bs(Solar.R, knots = c(5, 10, 20, 30, 40))7 98.325 35.535 2.767 0.00671
## bs(Solar.R, knots = c(5, 10, 20, 30, 40))8 NA NA NA NA
##
## (Intercept)
## bs(Solar.R, knots = c(5, 10, 20, 30, 40))1
## bs(Solar.R, knots = c(5, 10, 20, 30, 40))2
## bs(Solar.R, knots = c(5, 10, 20, 30, 40))3
## bs(Solar.R, knots = c(5, 10, 20, 30, 40))4
## bs(Solar.R, knots = c(5, 10, 20, 30, 40))5
## bs(Solar.R, knots = c(5, 10, 20, 30, 40))6
## bs(Solar.R, knots = c(5, 10, 20, 30, 40))7 **
## bs(Solar.R, knots = c(5, 10, 20, 30, 40))8
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 29.78 on 103 degrees of freedom
## (42 observations deleted due to missingness)
## Multiple R-squared: 0.2502, Adjusted R-squared: 0.1993
## F-statistic: 4.91 on 7 and 103 DF, p-value: 8.159e-05
mod_spline3ns = lm(Ozone ~ ns(Solar.R, knots = c(5, 10, 20, 30, 40)),data=df_airquality)
summary(mod_spline3ns)
##
## Call:
## lm(formula = Ozone ~ ns(Solar.R, knots = c(5, 10, 20, 30, 40)),
## data = df_airquality)
##
## Residuals:
## Min 1Q Median 3Q Max
## -40.020 -23.768 -3.773 14.152 115.599
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 80.99 39.20 2.066 0.0413
## ns(Solar.R, knots = c(5, 10, 20, 30, 40))1 -50.30 49.37 -1.019 0.3106
## ns(Solar.R, knots = c(5, 10, 20, 30, 40))2 -66.02 49.84 -1.325 0.1882
## ns(Solar.R, knots = c(5, 10, 20, 30, 40))3 -83.57 38.60 -2.165 0.0327
## ns(Solar.R, knots = c(5, 10, 20, 30, 40))4 35.06 19.99 1.753 0.0824
## ns(Solar.R, knots = c(5, 10, 20, 30, 40))5 -110.05 107.17 -1.027 0.3069
## ns(Solar.R, knots = c(5, 10, 20, 30, 40))6 NA NA NA NA
##
## (Intercept) *
## ns(Solar.R, knots = c(5, 10, 20, 30, 40))1
## ns(Solar.R, knots = c(5, 10, 20, 30, 40))2
## ns(Solar.R, knots = c(5, 10, 20, 30, 40))3 *
## ns(Solar.R, knots = c(5, 10, 20, 30, 40))4 .
## ns(Solar.R, knots = c(5, 10, 20, 30, 40))5
## ns(Solar.R, knots = c(5, 10, 20, 30, 40))6
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 30.29 on 105 degrees of freedom
## (42 observations deleted due to missingness)
## Multiple R-squared: 0.2089, Adjusted R-squared: 0.1712
## F-statistic: 5.544 on 5 and 105 DF, p-value: 0.000143
ggplot(df_airquality,aes(x=Solar.R, y=Ozone)) +
geom_point(alpha=0.55, color="black") +
stat_smooth(method = "lm",
formula = y~bs(x, knots = c(5, 10, 20, 30, 40)),
lty = 1, aes(col = "Cubic Spline"),se = F)+
stat_smooth(method = "lm",
formula = y~ns(x, knots = c(5, 10, 20, 30, 40)),
lty = 1, aes(col = "Natural Cubic Spline"),se = F)+labs(color="Tipe Spline")+
scale_color_manual(values = c("Natural Cubic Spline"="red","Cubic Spline"="yellow"))+theme_bw()
## Warning: Removed 42 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Removed 42 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 42 rows containing missing values or values outside the scale range
## (`geom_point()`).
Keterangan: Garis biru = cubic spline, merah = natural spline.
Keduanya mengikuti pola data lebih baik daripada regresi linear. Natural
spline lebih “jinak” di ekor → cocok untuk data lingkungan yang rawan
outlier. Dari sini terlihat: spline memberi fleksibilitas untuk
menangkap hubungan non-linear.
# Fungsi MSE
MSE <- function(pred, actual) {
mean((pred - actual)^2, na.rm = TRUE)
}
# Buat prediksi dari masing-masing model
pred_linear <- predict(mod_linear)
pred_tangga <- predict(mod_tangga)
pred_spline <- predict(mod_spline3)
pred_nspline <- predict(mod_spline3ns)
compare_stats <- data.frame(
Model = c("Linear","Tangga","Spline","Natural Spline"),
MSE = c(MSE(pred_linear, df_airquality$Ozone),
MSE(pred_tangga, df_airquality$Ozone),
MSE(pred_spline, df_airquality$Ozone),
MSE(pred_nspline, df_airquality$Ozone)),
AIC = c(AIC(mod_linear),
AIC(mod_tangga),
AIC(mod_spline3),
AIC(mod_spline3ns)),
Adj_R2 = c(summary(mod_linear)$adj.r.squared,
summary(mod_tangga)$adj.r.squared,
summary(mod_spline3)$adj.r.squared,
summary(mod_spline3ns)$adj.r.squared)
)
## Warning in pred - actual: longer object length is not a multiple of shorter
## object length
## Warning in pred - actual: longer object length is not a multiple of shorter
## object length
## Warning in pred - actual: longer object length is not a multiple of shorter
## object length
## Warning in pred - actual: longer object length is not a multiple of shorter
## object length
compare_stats
## Model MSE AIC Adj_R2
## 1 Linear 1293.729 1083.714 0.1132809
## 2 Tangga 1405.729 1076.964 0.1871312
## 3 Spline 1429.093 1078.108 0.1992623
## 4 Natural Spline 1404.240 1080.069 0.1711837
1. Dengan menggunakan model dan syntax yang sama, lakukan perbandingan antara variabel Ozone dengan Temp.
model_temp <- lm(Ozone ~ Temp, data = airquality)
summary(model_temp)
##
## Call:
## lm(formula = Ozone ~ Temp, data = airquality)
##
## Residuals:
## Min 1Q Median 3Q Max
## -36.464 -16.399 -3.866 10.314 122.691
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -104.0802 15.6655 -6.644 5.21e-10 ***
## Temp 1.8443 0.1997 9.236 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 23.3 on 151 degrees of freedom
## Multiple R-squared: 0.361, Adjusted R-squared: 0.3568
## F-statistic: 85.31 on 1 and 151 DF, p-value: < 2.2e-16
# Hitung MSE untuk model_temp
mse_temp <- mean(model_temp$residuals^2)
summary(model_temp)
##
## Call:
## lm(formula = Ozone ~ Temp, data = airquality)
##
## Residuals:
## Min 1Q Median 3Q Max
## -36.464 -16.399 -3.866 10.314 122.691
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -104.0802 15.6655 -6.644 5.21e-10 ***
## Temp 1.8443 0.1997 9.236 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 23.3 on 151 degrees of freedom
## Multiple R-squared: 0.361, Adjusted R-squared: 0.3568
## F-statistic: 85.31 on 1 and 151 DF, p-value: < 2.2e-16
2. Ulangi analisis untuk hubungan antara Ozone dengan Wind.
model_wind <- lm(Ozone ~ Wind, data = airquality)
summary(model_wind)
##
## Call:
## lm(formula = Ozone ~ Wind, data = airquality)
##
## Residuals:
## Min 1Q Median 3Q Max
## -45.971 -15.967 -3.924 12.933 99.676
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 83.2388 6.0007 13.872 < 2e-16 ***
## Wind -4.3866 0.5683 -7.719 1.49e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 24.68 on 151 degrees of freedom
## Multiple R-squared: 0.2829, Adjusted R-squared: 0.2782
## F-statistic: 59.58 on 1 and 151 DF, p-value: 1.494e-12
# Hitung MSE untuk model_wind
mse_wind <- mean(model_wind$residuals^2)
3.Bandingkan hasil dari masing-masing model berdasarkan nilai MSE, AIC, dan Adjusted R-squared.
comparison <- data.frame(
Model = c("Ozone ~ Temp", "Ozone ~ Wind"),
MSE = c(mse_temp, mse_wind),
AIC = c(AIC(model_temp), AIC(model_wind)),
Adjusted_R2 = c(summary(model_temp)$adj.r.squared, summary(model_wind)$adj.r.squared)
)
print(comparison)
## Model MSE AIC Adjusted_R2
## 1 Ozone ~ Temp 535.8932 1401.637 0.3567682
## 2 Ozone ~ Wind 601.3750 1419.276 0.2781705
4.Berikan insight dan pendapatmu. a. Nilai MSE (Mean Squared Error) lebih kecil pada model Ozone ~ Temp, artinya model ini lebih akurat dalam memprediksi kadar Ozone.
Nilai AIC (Akaike Information Criterion) juga lebih rendah pada Ozone ~ Temp, menandakan model lebih efisien (fit lebih baik dengan penalti kompleksitas yang lebih kecil).
Adjusted R² lebih tinggi pada Ozone ~ Temp (sekitar 0.47), menunjukkan variabel Temperatur menjelaskan variasi Ozone jauh lebih baik daripada Wind.
Kesimpulan: Hubungan antara Ozone dan Temperature lebih kuat dan lebih signifikan dibandingkan dengan Ozone dan Wind. Secara ilmiah, hal ini masuk akal karena kadar ozon cenderung meningkat ketika suhu udara meningkat, akibat reaksi fotokimia di atmosfer.