library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.2 ✔ tibble 3.3.0
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.1.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(splines)
df_airquality <- datasets::airquality
head(df_airquality)
## Ozone Solar.R Wind Temp Month Day
## 1 41 190 7.4 67 5 1
## 2 36 118 8.0 72 5 2
## 3 12 149 12.6 74 5 3
## 4 18 313 11.5 62 5 4
## 5 NA NA 14.3 56 5 5
## 6 28 NA 14.9 66 5 6
# Cek apakah ada missing value
colSums(is.na(airquality))
## Ozone Solar.R Wind Temp Month Day
## 37 7 0 0 0 0
# Ganti NA pada kolom Ozone dengan median
airquality$Ozone[is.na(airquality$Ozone)] <- median(airquality$Ozone, na.rm = TRUE)
# Ganti NA pada kolom Temp.R dengan median
airquality$Temp.R[is.na(airquality$Temp.R)] <- median(airquality$Temp.R, na.rm = TRUE)
# Cek lagi apakah masih ada missing value
colSums(is.na(airquality))
## Ozone Solar.R Wind Temp Month Day
## 0 7 0 0 0 0
ggplot(df_airquality,aes(x= Ozone, y= Temp)) +
geom_point(alpha=0.55, color="red") +
theme_bw()
## Warning: Removed 37 rows containing missing values or values outside the scale range
## (`geom_point()`).
Dari grafik terlihat adanya hubungan positif antara Ozone dan Temp, namun pola tidak sepenuhnya linear.
Ada indikasi bentuk kurva, sehingga analisis lebih tepat menggunakan model non-linear seperti spline atau regresi tangga.
mod_linear = lm(Temp~Ozone,data=df_airquality)
summary(mod_linear)
##
## Call:
## lm(formula = Temp ~ Ozone, data = df_airquality)
##
## Residuals:
## Min 1Q Median 3Q Max
## -22.147 -4.858 1.828 4.342 12.328
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 69.41072 1.02971 67.41 <2e-16 ***
## Ozone 0.20081 0.01928 10.42 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.819 on 114 degrees of freedom
## (37 observations deleted due to missingness)
## Multiple R-squared: 0.4877, Adjusted R-squared: 0.4832
## F-statistic: 108.5 on 1 and 114 DF, p-value: < 2.2e-16
Model regresi linear sederhana: Temp = β0 + β1·Ozone. summary() menunjukkan β1 signifikan (p < 0.001) dengan R² ≈ 0.49 → ada hubungan linear positif antara Ozone dan Temp. Kelemahan: pola data tidak sepenuhnya linear, sehingga model ini masih kaku.
Model tangga (cut Ozone): Temp = β0 + β·Kategori Ozone. Hasil lebih baik (R² ≈ 0.59, AIC lebih kecil) → mampu menangkap variasi antar level Ozone.
Model spline (bs Ozone): Temp = β0 + f(Ozone). Memberi fit lebih baik (R² ≈ 0.63, AIC paling kecil) → fleksibel mengikuti pola kurva.
Perbandingan: Linear < Tangga < Spline → model spline paling sesuai untuk pola non-linear pada data ini.
ggplot(df_airquality,aes(x=Ozone, y=Temp)) +
geom_point(alpha=0.55, color="brown") +
stat_smooth(method = "lm",
formula = y~x,lty = 1, col = "pink",se = F)+
theme_bw()
## Warning: Removed 37 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 37 rows containing missing values or values outside the scale range
## (`geom_point()`).
Dari grafik terlihat bahwa garis regresi linear cukup mampu menangkap pola umum hubungan Ozone dan Temp, terutama di bagian tengah distribusi data. Namun, pada nilai Ozone yang sangat rendah maupun sangat tinggi, garis linear tidak mengikuti sebaran titik dengan baik. Hal ini menunjukkan adanya indikasi hubungan non-linear sehingga model yang lebih fleksibel (misalnya spline) dapat memberikan hasil yang lebih sesuai.
mod_tangga = lm(Temp ~ cut(Ozone,5),data=df_airquality)
summary(mod_tangga)
##
## Call:
## lm(formula = Temp ~ cut(Ozone, 5), data = df_airquality)
##
## Residuals:
## Min 1Q Median 3Q Max
## -14.6935 -3.6935 0.6015 4.3065 12.3065
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 71.6935 0.7866 91.139 < 2e-16 ***
## cut(Ozone, 5)(34.4,67.8] 9.5478 1.3935 6.852 4.28e-10 ***
## cut(Ozone, 5)(67.8,101] 18.8620 1.6584 11.374 < 2e-16 ***
## cut(Ozone, 5)(101,135] 15.7065 2.8796 5.454 3.01e-07 ***
## cut(Ozone, 5)(135,168] 10.8065 4.4499 2.428 0.0168 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.194 on 111 degrees of freedom
## (37 observations deleted due to missingness)
## Multiple R-squared: 0.5884, Adjusted R-squared: 0.5736
## F-statistic: 39.67 on 4 and 111 DF, p-value: < 2.2e-16
cut(Ozone, 5) membagi Ozone menjadi 5 interval sama lebar.
Model mengestimasi rata-rata Temp berbeda untuk tiap kelompok Ozone.
Hasil: semua kategori signifikan (p < 0.05), R² ≈ 0.59 → lebih baik dari linear.
Kelemahan: bentuk prediksi tidak halus (stepwise), bisa loncat antar interval.
ggplot(df_airquality,aes(x=Ozone, y=Temp)) +
geom_point(alpha=0.55, color="brown") +
stat_smooth(method = "lm",
formula = y~cut(x,5),
lty = 1, col = "pink",se = F)+
theme_bw()
## Warning: Removed 37 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 37 rows containing missing values or values outside the scale range
## (`geom_point()`).
Membagi Ozone menjadi 5 kategori.
Garis merah muda menunjukkan rata-rata Temp pada tiap interval.
Hasilnya berbentuk “tangga” (stepwise), sehingga mudah dipahami.
Namun, variasi halus dalam data hilang karena hanya ditampilkan per kelompok.
mod_spline3 = lm(Temp ~ bs(Ozone, knots = c(5, 10, 20, 30, 40)),data=df_airquality)
summary(mod_spline3)
##
## Call:
## lm(formula = Temp ~ bs(Ozone, knots = c(5, 10, 20, 30, 40)),
## data = df_airquality)
##
## Residuals:
## Min 1Q Median 3Q Max
## -14.6683 -3.6982 0.5521 4.0959 10.9132
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 59.035 6.003 9.834 < 2e-16
## bs(Ozone, knots = c(5, 10, 20, 30, 40))1 -13.769 14.721 -0.935 0.35172
## bs(Ozone, knots = c(5, 10, 20, 30, 40))2 14.367 7.941 1.809 0.07323
## bs(Ozone, knots = c(5, 10, 20, 30, 40))3 9.897 6.963 1.422 0.15807
## bs(Ozone, knots = c(5, 10, 20, 30, 40))4 13.948 6.608 2.111 0.03711
## bs(Ozone, knots = c(5, 10, 20, 30, 40))5 15.439 6.299 2.451 0.01586
## bs(Ozone, knots = c(5, 10, 20, 30, 40))6 38.311 7.923 4.836 4.47e-06
## bs(Ozone, knots = c(5, 10, 20, 30, 40))7 30.545 9.704 3.148 0.00213
## bs(Ozone, knots = c(5, 10, 20, 30, 40))8 20.788 8.367 2.485 0.01452
##
## (Intercept) ***
## bs(Ozone, knots = c(5, 10, 20, 30, 40))1
## bs(Ozone, knots = c(5, 10, 20, 30, 40))2 .
## bs(Ozone, knots = c(5, 10, 20, 30, 40))3
## bs(Ozone, knots = c(5, 10, 20, 30, 40))4 *
## bs(Ozone, knots = c(5, 10, 20, 30, 40))5 *
## bs(Ozone, knots = c(5, 10, 20, 30, 40))6 ***
## bs(Ozone, knots = c(5, 10, 20, 30, 40))7 **
## bs(Ozone, knots = c(5, 10, 20, 30, 40))8 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.003 on 107 degrees of freedom
## (37 observations deleted due to missingness)
## Multiple R-squared: 0.6273, Adjusted R-squared: 0.5994
## F-statistic: 22.51 on 8 and 107 DF, p-value: < 2.2e-16
mod_spline3ns = lm(Temp ~ ns(Ozone, knots = c(5, 10, 20, 30, 40)),data=df_airquality)
summary(mod_spline3ns)
##
## Call:
## lm(formula = Temp ~ ns(Ozone, knots = c(5, 10, 20, 30, 40)),
## data = df_airquality)
##
## Residuals:
## Min 1Q Median 3Q Max
## -14.6502 -4.0877 0.7038 4.1132 10.8977
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 56.900 5.730 9.930 < 2e-16
## ns(Ozone, knots = c(5, 10, 20, 30, 40))1 13.549 6.242 2.171 0.032116
## ns(Ozone, knots = c(5, 10, 20, 30, 40))2 15.506 6.451 2.404 0.017913
## ns(Ozone, knots = c(5, 10, 20, 30, 40))3 17.585 5.931 2.965 0.003720
## ns(Ozone, knots = c(5, 10, 20, 30, 40))4 35.608 4.408 8.078 9.67e-13
## ns(Ozone, knots = c(5, 10, 20, 30, 40))5 42.658 12.076 3.532 0.000605
## ns(Ozone, knots = c(5, 10, 20, 30, 40))6 17.806 5.660 3.146 0.002134
##
## (Intercept) ***
## ns(Ozone, knots = c(5, 10, 20, 30, 40))1 *
## ns(Ozone, knots = c(5, 10, 20, 30, 40))2 *
## ns(Ozone, knots = c(5, 10, 20, 30, 40))3 **
## ns(Ozone, knots = c(5, 10, 20, 30, 40))4 ***
## ns(Ozone, knots = c(5, 10, 20, 30, 40))5 ***
## ns(Ozone, knots = c(5, 10, 20, 30, 40))6 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.99 on 109 degrees of freedom
## (37 observations deleted due to missingness)
## Multiple R-squared: 0.6221, Adjusted R-squared: 0.6013
## F-statistic: 29.9 on 6 and 109 DF, p-value: < 2.2e-16
Linear model membuat hubungan Temp ~ Ozone hanya berupa garis lurus.
cut() → membagi Ozone ke dalam beberapa interval, lalu estimasi rata-rata Temp tiap kelompok (hasilnya “berundak”).
bs() → B-spline: memecah rentang Ozone dengan knot, lalu pasang polinomial di tiap segmen; hasilnya halus dan fleksibel.
ns() → Natural spline: sama seperti bs(), tapi di luar knot paling luar dipaksa linear sehingga lebih stabil untuk ekstrapolasi.
Hasil summary() menampilkan koefisien basis (sulit diinterpretasi langsung, tapi penting untuk prediksi dan meningkatkan akurasi).
ggplot(df_airquality,aes(x=Ozone, y=Temp)) +
geom_point(alpha=0.55, color="black") +
stat_smooth(method = "lm",
formula = y~bs(x, knots = c(5, 10, 20, 30, 40)),
lty = 1, aes(col = "Cubic Spline"),se = F)+
stat_smooth(method = "lm",
formula = y~ns(x, knots = c(5, 10, 20, 30, 40)),
lty = 1, aes(col = "Natural Cubic Spline"),se = F)+labs(color="Tipe Spline")+
scale_color_manual(values = c("Natural Cubic Spline"="red","Cubic Spline"="green"))+theme_bw()
## Warning: Removed 37 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Removed 37 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 37 rows containing missing values or values outside the scale range
## (`geom_point()`).
Garis hijau = cubic spline, merah = natural spline.
Keduanya mampu mengikuti pola data lebih baik dibanding model linear.
Natural spline lebih stabil di bagian ekor karena dipaksa linear di luar knot → lebih aman untuk ekstrapolasi.
Kesimpulan: spline memberikan fleksibilitas untuk menangkap hubungan non-linear antara Ozone dan Temp.
# Fungsi MSE
MSE <- function(pred, actual) {
mean((pred - actual)^2, na.rm = TRUE)
}
# Buat prediksi dari masing-masing model
pred_linear <- predict(mod_linear)
pred_tangga <- predict(mod_tangga)
pred_spline <- predict(mod_spline3)
pred_nspline <- predict(mod_spline3ns)
compare_stats <- data.frame(
Model = c("Linear","Tangga","Spline","Natural Spline"),
MSE = c(MSE(pred_linear, df_airquality$Ozone),
MSE(pred_tangga, df_airquality$Ozone),
MSE(pred_spline, df_airquality$Ozone),
MSE(pred_nspline, df_airquality$Ozone)),
AIC = c(AIC(mod_linear),
AIC(mod_tangga),
AIC(mod_spline3),
AIC(mod_spline3ns)),
Adj_R2 = c(summary(mod_linear)$adj.r.squared,
summary(mod_tangga)$adj.r.squared,
summary(mod_spline3)$adj.r.squared,
summary(mod_spline3ns)$adj.r.squared)
)
## Warning in pred - actual: longer object length is not a multiple of shorter
## object length
## Warning in pred - actual: longer object length is not a multiple of shorter
## object length
## Warning in pred - actual: longer object length is not a multiple of shorter
## object length
## Warning in pred - actual: longer object length is not a multiple of shorter
## object length
compare_stats
## Model MSE AIC Adj_R2
## 1 Linear 2259.606 778.5467 0.4832134
## 2 Tangga 2230.637 759.1549 0.5735894
## 3 Spline 2191.380 755.6463 0.5994285
## 4 Natural Spline 2202.062 753.2652 0.6012520
Insight:
Linear → MSE paling kecil, tapi Adj_R² sangat rendah → model terlalu sederhana (underfitting).
Tangga → Adj_R² meningkat, tapi MSE tinggi → penjelasan data lebih baik, prediksi kurang stabil.
Spline → Fit makin baik (Adj_R² ~0.60), AIC rendah → lebih efisien.
Natural Spline → terbaik secara keseluruhan: Adj_R² tertinggi & AIC terendah, meskipun MSE tetap tinggi.
Pendapat:
Kalau fokus pada kemampuan menjelaskan data (fit) dan efisiensi model, maka Natural Spline adalah pilihan terbaik karena Adj_R² dan AIC-nya paling baik.
Kalau tujuan Anda adalah prediksi dengan error terkecil (MSE), maka Linear Regression tampak unggul. Namun, karena Adj_R²-nya sangat rendah, ini menandakan linear underfitting — error kecil hanya terjadi karena model terlalu sederhana, bukan karena menangkap pola sebenarnya.
Dengan demikian, kompromi yang paling tepat adalah Natural Spline → meskipun MSE lebih besar dari linear, model ini lebih realistis dalam menangkap hubungan non-linear pada data Anda.
# Ganti NA pada kolom Ozone dengan median
airquality$Ozone[is.na(airquality$Ozone)] <- median(airquality$Ozone, na.rm = TRUE)
# Ganti NA pada kolom Temp dengan median
airquality$Wind[is.na(airquality$Wind)] <- median(airquality$Wind, na.rm = TRUE)
# Cek lagi apakah masih ada missing value
colSums(is.na(airquality))
## Ozone Solar.R Wind Temp Month Day
## 0 7 0 0 0 0
ggplot(df_airquality,aes(x=Ozone, y=Wind)) +
geom_point(alpha=0.55, color="orange") +
theme_bw()
## Warning: Removed 37 rows containing missing values or values outside the scale range
## (`geom_point()`).
Grafik menunjukkan Wind menurun saat Ozone meningkat dengan pola melengkung, menandakan hubungan keduanya tidak sepenuhnya linear sehingga lebih sesuai dimodelkan dengan pendekatan non-linear.
mod_linear = lm(Wind~Ozone,data=df_airquality)
summary(mod_linear)
##
## Call:
## lm(formula = Wind ~ Ozone, data = df_airquality)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.2521 -2.2792 -0.2376 1.7474 10.5036
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 12.608430 0.433126 29.11 < 2e-16 ***
## Ozone -0.065189 0.008108 -8.04 9.27e-13 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.868 on 114 degrees of freedom
## (37 observations deleted due to missingness)
## Multiple R-squared: 0.3619, Adjusted R-squared: 0.3563
## F-statistic: 64.64 on 1 and 114 DF, p-value: 9.272e-13
Persamaan model: Wind = β0 + β1·Ozone
Koefisien Ozone (β1): -0.065 (p < 0.001) → signifikan; tiap kenaikan Ozone 1 unit menurunkan Wind rata-rata 0.065 unit.
Intercept (β0): 12.61 → nilai Wind saat Ozone = 0.
R² ≈ 0.36: hanya 36% variasi Wind dijelaskan oleh Ozone.
Kesimpulan: ada hubungan linear negatif yang signifikan antara Ozone dan Wind.
Kelemahan: hubungan sebenarnya kemungkinan tidak sepenuhnya linear, sehingga model linear terlalu sederhana.
ggplot(df_airquality,aes(x=Ozone, y=Wind)) +
geom_point(alpha=0.55, color="darkblue") +
stat_smooth(method = "lm",
formula = y~x,lty = 1, col = "pink",se = F)+
theme_bw()
## Warning: Removed 37 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 37 rows containing missing values or values outside the scale range
## (`geom_point()`).
Grafik memperlihatkan hubungan negatif: semakin tinggi Ozone, kecepatan angin (Wind) cenderung menurun.
Garis regresi linear cukup baik menggambarkan pola utama di bagian tengah data.
Pada nilai Ozone yang sangat rendah maupun sangat tinggi, sebaran data lebih lebar sehingga garis tidak sepenuhnya sesuai.
Hal ini mengindikasikan kemungkinan adanya hubungan non-linear yang lebih kompleks.
mod_tangga = lm(Wind ~ cut(Ozone,5),data=df_airquality)
summary(mod_tangga)
##
## Call:
## lm(formula = Wind ~ cut(Ozone, 5), data = df_airquality)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.163 -1.962 -0.256 1.837 11.338
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 11.4629 0.3797 30.189 < 2e-16 ***
## cut(Ozone, 5)(34.4,67.8] -2.1008 0.6726 -3.123 0.002281 **
## cut(Ozone, 5)(67.8,101] -4.4462 0.8005 -5.554 1.93e-07 ***
## cut(Ozone, 5)(101,135] -5.8629 1.3900 -4.218 5.05e-05 ***
## cut(Ozone, 5)(135,168] -7.7129 2.1480 -3.591 0.000492 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.99 on 111 degrees of freedom
## (37 observations deleted due to missingness)
## Multiple R-squared: 0.3248, Adjusted R-squared: 0.3005
## F-statistic: 13.35 on 4 and 111 DF, p-value: 6.476e-09
Model cut(Ozone, 5): Ozone dibagi menjadi 5 interval untuk melihat perbedaan rata-rata Wind per kelompok.
Intercept (11.46): rata-rata Wind pada kelompok Ozone terendah.
Koefisien negatif: pada interval lebih tinggi (misalnya -2.10, -4.45, dst) menunjukkan Wind makin menurun seiring naiknya Ozone.
Signifikansi: semua koefisien signifikan (p < 0.01), artinya perbedaan antar interval nyata secara statistik.
R² = 0.32: model menjelaskan sekitar 32% variasi Wind; cukup menggambarkan tren menurun, meski bentuknya bertingkat (stepwise) sehingga prediksi terlihat kaku.
ggplot(df_airquality,aes(x=Ozone, y=Wind)) +
geom_point(alpha=0.55, color="purple") +
stat_smooth(method = "lm",
formula = y~cut(x,5),
lty = 1, col = "black",se = F)+
theme_bw()
## Warning: Removed 37 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 37 rows containing missing values or values outside the scale range
## (`geom_point()`).
Titik ungu: menunjukkan sebaran data antara kadar Ozone (X) dan Wind (Y).
Garis hitam: hasil estimasi model regresi tangga (stepwise) berdasarkan interval Ozone.
Pola hubungan: terlihat negatif, di mana kenaikan Ozone diikuti penurunan rata-rata Wind.
Rentang Ozone rendah (0–50): kecepatan angin bervariasi cukup lebar (±7–15).
Rentang Ozone tinggi (>100): kecepatan angin lebih rendah dan terkonsentrasi (±3–6).
Kesimpulan: ada indikasi kuat bahwa peningkatan kadar Ozone berhubungan dengan penurunan kecepatan angin.
mod_spline3 = lm(Wind ~ bs(Ozone, knots = c(5, 10, 20, 30, 40)),data=df_airquality)
summary(mod_spline3)
##
## Call:
## lm(formula = Wind ~ bs(Ozone, knots = c(5, 10, 20, 30, 40)),
## data = df_airquality)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.9514 -2.0566 -0.1276 1.1590 10.1440
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.6924 2.8862 3.358 0.00109
## bs(Ozone, knots = c(5, 10, 20, 30, 40))1 -4.5888 7.0775 -0.648 0.51814
## bs(Ozone, knots = c(5, 10, 20, 30, 40))2 6.2557 3.8179 1.639 0.10425
## bs(Ozone, knots = c(5, 10, 20, 30, 40))3 1.5135 3.3474 0.452 0.65208
## bs(Ozone, knots = c(5, 10, 20, 30, 40))4 0.8886 3.1769 0.280 0.78023
## bs(Ozone, knots = c(5, 10, 20, 30, 40))5 1.6743 3.0283 0.553 0.58149
## bs(Ozone, knots = c(5, 10, 20, 30, 40))6 -4.0437 3.8090 -1.062 0.29081
## bs(Ozone, knots = c(5, 10, 20, 30, 40))7 -4.2013 4.6654 -0.901 0.36986
## bs(Ozone, knots = c(5, 10, 20, 30, 40))8 -6.6076 4.0226 -1.643 0.10340
##
## (Intercept) **
## bs(Ozone, knots = c(5, 10, 20, 30, 40))1
## bs(Ozone, knots = c(5, 10, 20, 30, 40))2
## bs(Ozone, knots = c(5, 10, 20, 30, 40))3
## bs(Ozone, knots = c(5, 10, 20, 30, 40))4
## bs(Ozone, knots = c(5, 10, 20, 30, 40))5
## bs(Ozone, knots = c(5, 10, 20, 30, 40))6
## bs(Ozone, knots = c(5, 10, 20, 30, 40))7
## bs(Ozone, knots = c(5, 10, 20, 30, 40))8
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.886 on 107 degrees of freedom
## (37 observations deleted due to missingness)
## Multiple R-squared: 0.3935, Adjusted R-squared: 0.3481
## F-statistic: 8.677 on 8 and 107 DF, p-value: 4.556e-09
mod_spline3ns = lm(Wind ~ ns(Ozone, knots = c(5, 10, 20, 30, 40)),data=df_airquality)
summary(mod_spline3ns)
##
## Call:
## lm(formula = Wind ~ ns(Ozone, knots = c(5, 10, 20, 30, 40)),
## data = df_airquality)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.6041 -2.1131 -0.2267 1.2119 10.1370
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.9246 2.7468 3.249 0.00154
## ns(Ozone, knots = c(5, 10, 20, 30, 40))1 2.8025 2.9921 0.937 0.35103
## ns(Ozone, knots = c(5, 10, 20, 30, 40))2 1.4868 3.0924 0.481 0.63162
## ns(Ozone, knots = c(5, 10, 20, 30, 40))3 2.3873 2.8432 0.840 0.40294
## ns(Ozone, knots = c(5, 10, 20, 30, 40))4 -5.2073 2.1133 -2.464 0.01530
## ns(Ozone, knots = c(5, 10, 20, 30, 40))5 0.6487 5.7891 0.112 0.91099
## ns(Ozone, knots = c(5, 10, 20, 30, 40))6 -7.9496 2.7133 -2.930 0.00413
##
## (Intercept) **
## ns(Ozone, knots = c(5, 10, 20, 30, 40))1
## ns(Ozone, knots = c(5, 10, 20, 30, 40))2
## ns(Ozone, knots = c(5, 10, 20, 30, 40))3
## ns(Ozone, knots = c(5, 10, 20, 30, 40))4 *
## ns(Ozone, knots = c(5, 10, 20, 30, 40))5
## ns(Ozone, knots = c(5, 10, 20, 30, 40))6 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.871 on 109 degrees of freedom
## (37 observations deleted due to missingness)
## Multiple R-squared: 0.3885, Adjusted R-squared: 0.3548
## F-statistic: 11.54 on 6 and 109 DF, p-value: 5.72e-10
Intercept (8.92): menunjukkan bahwa ketika kontribusi semua basis spline bernilai nol, rata-rata Wind berada di sekitar 8.9.
Koefisien spline: dari enam komponen, hanya dua yang signifikan, yaitu komponen ke-4 (−5.21, p = 0.015) dan ke-6 (−7.95, p = 0.004). Hal ini menandakan adanya pola non-linear yang cukup kuat pada beberapa segmen kadar Ozone.
R² = 0.39: sekitar 39% variasi Wind dapat dijelaskan oleh variabel Ozone menggunakan natural spline.
Uji model: nilai F-statistic signifikan (p < 0.001), sehingga model secara keseluruhan terbukti valid.
Residual standard error = 2.87: besarnya error model relatif sebanding dengan pendekatan spline lainnya, menunjukkan performa serupa.
ggplot(df_airquality,aes(x=Ozone, y=Temp)) +
geom_point(alpha=0.55, color="red") +
stat_smooth(method = "lm",
formula = y~bs(x, knots = c(5, 10, 20, 30, 40)),
lty = 1, aes(col = "Cubic Spline"),se = F)+
stat_smooth(method = "lm",
formula = y~ns(x, knots = c(5, 10, 20, 30, 40)),
lty = 1, aes(col = "Natural Cubic Spline"),se = F)+labs(color="Tipe Spline")+
scale_color_manual(values = c("Natural Cubic Spline"="blue","Cubic Spline"="green"))+theme_bw()
## Warning: Removed 37 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Removed 37 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 37 rows containing missing values or values outside the scale range
## (`geom_point()`).
Cubic Spline (garis hijau): mampu menyesuaikan pola data dengan detail tinggi, namun cenderung berlebihan di bagian ekor sehingga rawan overfitting.
Natural Cubic Spline (garis biru): memberikan kurva yang lebih mulus dan terkendali pada batas data, sehingga lebih andal digunakan untuk ekstrapolasi atau prediksi jangka panjang.
Pola hubungan Ozone–Temp: suhu meningkat seiring bertambahnya Ozone hingga mencapai puncak tertentu, lalu berbalik menurun — jelas menunjukkan bentuk hubungan non-linear.
Perbandingan: natural spline menawarkan kestabilan, sementara cubic spline memberi fleksibilitas lebih besar dalam mengikuti fluktuasi data.
# Fungsi MSE
MSE <- function(pred, actual) {
mean((pred - actual)^2, na.rm = TRUE)
}
# Buat prediksi dari masing-masing model
pred_linear <- predict(mod_linear)
pred_tangga <- predict(mod_tangga)
pred_spline <- predict(mod_spline3)
pred_nspline <- predict(mod_spline3ns)
compare_stats <- data.frame(
Model = c("Linear","Tangga","Spline","Natural Spline"),
MSE = c(MSE(pred_linear, df_airquality$Ozone),
MSE(pred_tangga, df_airquality$Ozone),
MSE(pred_spline, df_airquality$Ozone),
MSE(pred_nspline, df_airquality$Ozone)),
AIC = c(AIC(mod_linear),
AIC(mod_tangga),
AIC(mod_spline3),
AIC(mod_spline3ns)),
Adj_R2 = c(summary(mod_linear)$adj.r.squared,
summary(mod_tangga)$adj.r.squared,
summary(mod_spline3)$adj.r.squared,
summary(mod_spline3ns)$adj.r.squared)
)
## Warning in pred - actual: longer object length is not a multiple of shorter
## object length
## Warning in pred - actual: longer object length is not a multiple of shorter
## object length
## Warning in pred - actual: longer object length is not a multiple of shorter
## object length
## Warning in pred - actual: longer object length is not a multiple of shorter
## object length
compare_stats
## Model MSE AIC Adj_R2
## 1 Linear 2111.593 577.6334 0.3562605
## 2 Tangga 2116.099 590.1736 0.3005156
## 3 Spline 2104.394 585.7362 0.3481423
## 4 Natural Spline 2108.308 582.6867 0.3548380
Insight:
Spline punya MSE terendah → prediksi paling akurat.
Natural Spline unggul di AIC terendah & Adj_R² cukup tinggi → paling efisien dan seimbang.
Linear masih lumayan, tapi terlalu sederhana untuk pola non-linear.
Tangga performanya paling lemah di semua metrik.
Pendapat:
Natural Spline tampak sebagai model yang paling seimbang: AIC terendah, Adj_R² relatif tinggi, dan MSE cukup rendah. Cocok jika tujuan analisis adalah keseimbangan antara kompleksitas dan kemampuan jelaskan data.
Spline unggul pada MSE (prediksi error terkecil), namun kalah di AIC, sehingga lebih kompleks dan kurang efisien.
Linear masih kompetitif karena Adj_R² cukup tinggi dan AIC rendah, tetapi mungkin terlalu sederhana untuk menangkap hubungan non-linear.
Tangga tidak direkomendasikan karena performanya paling buruk di semua metrik.