library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.2
## ✔ ggplot2 3.5.2 ✔ tibble 3.3.0
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.1.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(splines)
# Dataset
df_airquality <- datasets::airquality
# Cek apakah ada missing value
colSums(is.na(airquality))
## Ozone Solar.R Wind Temp Month Day
## 37 7 0 0 0 0
# Ganti NA pada kolom Ozone dengan median
airquality$Ozone[is.na(airquality$Ozone)] <- median(airquality$Ozone, na.rm = TRUE)
# Ganti NA pada kolom Temp dengan median
airquality$Temp[is.na(airquality$Temp)] <- median(airquality$Temp, na.rm = TRUE)
# Cek lagi apakah masih ada missing value
colSums(is.na(airquality))
## Ozone Solar.R Wind Temp Month Day
## 0 7 0 0 0 0
ggplot(df_airquality,aes(x=Ozone, y=Temp)) +
geom_point(alpha=0.55, color="purple") +
theme_bw()
## Warning: Removed 37 rows containing missing values or values outside the scale range
## (`geom_point()`).
Secara umum, makin tinggi kadar Ozone, suhu (Temp) juga cenderung naik. Jadi ada hubungan positif.
Tapi hubungannya tidak lurus/linear sempurna. Ada bagian data yang terlihat melengkung atau menyebar dengan pola yang agak acak.
mod_linear = lm(Temp~Ozone,data=df_airquality)
summary(mod_linear)
##
## Call:
## lm(formula = Temp ~ Ozone, data = df_airquality)
##
## Residuals:
## Min 1Q Median 3Q Max
## -22.147 -4.858 1.828 4.342 12.328
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 69.41072 1.02971 67.41 <2e-16 ***
## Ozone 0.20081 0.01928 10.42 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.819 on 114 degrees of freedom
## (37 observations deleted due to missingness)
## Multiple R-squared: 0.4877, Adjusted R-squared: 0.4832
## F-statistic: 108.5 on 1 and 114 DF, p-value: < 2.2e-16
Model regresi linear sederhana: Temp = β0 + β1*Ozone
summary() menampilkan koefisien, nilai p, R², dll.
Interpretasi:
Intercept = 70.14 → saat Ozone = 0, suhu rata-rata sekitar 70°F
Koefisien Ozone = 0.196 (p < 0.001) → tiap kenaikan 1 unit Ozone
suhu naik sekitar 0.2°F
R² = 0.361 → sekitar 36% variasi Temp dapat dijelaskan oleh Ozone
Kesimpulan: ada hubungan linear positif yang signifikan antara Ozone dan Temp
Kelemahan: pola tidak sepenuhnya linear → model linear bisa terlalu kaku.
ggplot(df_airquality,aes(x=Ozone, y=Temp)) +
geom_point(alpha=0.55, color="red") +
stat_smooth(method = "lm",
formula = y~x,lty = 1, col = "lightgreen",se = F)+
theme_bw()
## Warning: Removed 37 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 37 rows containing missing values or values outside the scale range
## (`geom_point()`).
Dari grafik terlihat bahwa garis regresi linear cukup mewakili pola di tengah data. Namun, pada bagian ekor (nilai Ozone sangat rendah atau sangat tinggi), garis tidak pas mengikuti titik-titik data → ini menunjukkan adanya kemungkinan hubungan non-linear yang lebih kompleks.
mod_tangga = lm(Temp ~ cut(Ozone,5),data=df_airquality)
summary(mod_tangga)
##
## Call:
## lm(formula = Temp ~ cut(Ozone, 5), data = df_airquality)
##
## Residuals:
## Min 1Q Median 3Q Max
## -14.6935 -3.6935 0.6015 4.3065 12.3065
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 71.6935 0.7866 91.139 < 2e-16 ***
## cut(Ozone, 5)(34.4,67.8] 9.5478 1.3935 6.852 4.28e-10 ***
## cut(Ozone, 5)(67.8,101] 18.8620 1.6584 11.374 < 2e-16 ***
## cut(Ozone, 5)(101,135] 15.7065 2.8796 5.454 3.01e-07 ***
## cut(Ozone, 5)(135,168] 10.8065 4.4499 2.428 0.0168 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.194 on 111 degrees of freedom
## (37 observations deleted due to missingness)
## Multiple R-squared: 0.5884, Adjusted R-squared: 0.5736
## F-statistic: 39.67 on 4 and 111 DF, p-value: < 2.2e-16
Model step function (cut(Ozone, 5)) membagi kadar Ozone menjadi 5 interval
Intercept (71.7) adalah rata-rata Temp saat Ozone di interval terendah.
Koefisien tiap interval menunjukkan kenaikan suhu dibanding baseline. Misalnya, Ozone (67.8–101] → suhu naik ~18.9 derajat.
Hampir semua interval signifikan (p < 0.05), artinya Ozone punya pengaruh nyata terhadap Temp.
R² = 0.59 → model menjelaskan sekitar 59% variasi suhu.
Kelemahan: model bersifat stepwise (tangga), prediksi tidak mulus dan bisa loncat antar interval.
ggplot(df_airquality,aes(x=Ozone, y=Temp)) +
geom_point(alpha=0.55, color="brown") +
stat_smooth(method = "lm",
formula = y~cut(x,5),
lty = 1, col = "lightblue",se = F)+
theme_bw()
## Warning: Removed 37 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 37 rows containing missing values or values outside the scale range
## (`geom_point()`).
Membagi Ozone menjadi 5 kategori (interval).
Garis biru menunjukkan rata-rata Temp di tiap interval Ozone.
Bentuk garis jadi “berundak” (stepwise/tangga).
Mudah dipahami untuk perbandingan antar kelompok,tapi kehilangan detail variasi yang lebih halus.
mod_spline3 = lm(Temp ~ bs(Ozone, knots = c(5, 10, 20, 30, 40)),data=df_airquality)
summary(mod_spline3)
##
## Call:
## lm(formula = Temp ~ bs(Ozone, knots = c(5, 10, 20, 30, 40)),
## data = df_airquality)
##
## Residuals:
## Min 1Q Median 3Q Max
## -14.6683 -3.6982 0.5521 4.0959 10.9132
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 59.035 6.003 9.834 < 2e-16
## bs(Ozone, knots = c(5, 10, 20, 30, 40))1 -13.769 14.721 -0.935 0.35172
## bs(Ozone, knots = c(5, 10, 20, 30, 40))2 14.367 7.941 1.809 0.07323
## bs(Ozone, knots = c(5, 10, 20, 30, 40))3 9.897 6.963 1.422 0.15807
## bs(Ozone, knots = c(5, 10, 20, 30, 40))4 13.948 6.608 2.111 0.03711
## bs(Ozone, knots = c(5, 10, 20, 30, 40))5 15.439 6.299 2.451 0.01586
## bs(Ozone, knots = c(5, 10, 20, 30, 40))6 38.311 7.923 4.836 4.47e-06
## bs(Ozone, knots = c(5, 10, 20, 30, 40))7 30.545 9.704 3.148 0.00213
## bs(Ozone, knots = c(5, 10, 20, 30, 40))8 20.788 8.367 2.485 0.01452
##
## (Intercept) ***
## bs(Ozone, knots = c(5, 10, 20, 30, 40))1
## bs(Ozone, knots = c(5, 10, 20, 30, 40))2 .
## bs(Ozone, knots = c(5, 10, 20, 30, 40))3
## bs(Ozone, knots = c(5, 10, 20, 30, 40))4 *
## bs(Ozone, knots = c(5, 10, 20, 30, 40))5 *
## bs(Ozone, knots = c(5, 10, 20, 30, 40))6 ***
## bs(Ozone, knots = c(5, 10, 20, 30, 40))7 **
## bs(Ozone, knots = c(5, 10, 20, 30, 40))8 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.003 on 107 degrees of freedom
## (37 observations deleted due to missingness)
## Multiple R-squared: 0.6273, Adjusted R-squared: 0.5994
## F-statistic: 22.51 on 8 and 107 DF, p-value: < 2.2e-16
mod_spline3ns = lm(Temp ~ ns(Ozone, knots = c(5, 10, 20, 30, 40)),data=df_airquality)
summary(mod_spline3ns)
##
## Call:
## lm(formula = Temp ~ ns(Ozone, knots = c(5, 10, 20, 30, 40)),
## data = df_airquality)
##
## Residuals:
## Min 1Q Median 3Q Max
## -14.6502 -4.0877 0.7038 4.1132 10.8977
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 56.900 5.730 9.930 < 2e-16
## ns(Ozone, knots = c(5, 10, 20, 30, 40))1 13.549 6.242 2.171 0.032116
## ns(Ozone, knots = c(5, 10, 20, 30, 40))2 15.506 6.451 2.404 0.017913
## ns(Ozone, knots = c(5, 10, 20, 30, 40))3 17.585 5.931 2.965 0.003720
## ns(Ozone, knots = c(5, 10, 20, 30, 40))4 35.608 4.408 8.078 9.67e-13
## ns(Ozone, knots = c(5, 10, 20, 30, 40))5 42.658 12.076 3.532 0.000605
## ns(Ozone, knots = c(5, 10, 20, 30, 40))6 17.806 5.660 3.146 0.002134
##
## (Intercept) ***
## ns(Ozone, knots = c(5, 10, 20, 30, 40))1 *
## ns(Ozone, knots = c(5, 10, 20, 30, 40))2 *
## ns(Ozone, knots = c(5, 10, 20, 30, 40))3 **
## ns(Ozone, knots = c(5, 10, 20, 30, 40))4 ***
## ns(Ozone, knots = c(5, 10, 20, 30, 40))5 ***
## ns(Ozone, knots = c(5, 10, 20, 30, 40))6 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.99 on 109 degrees of freedom
## (37 observations deleted due to missingness)
## Multiple R-squared: 0.6221, Adjusted R-squared: 0.6013
## F-statistic: 29.9 on 6 and 109 DF, p-value: < 2.2e-16
ns(Ozone, knots = c(5,10,20,30,40)) → membagi rentang Ozone ke beberapa segmen dengan knot di titik 5, 10, 20, 30, dan 40.
Di tiap segmen, spline memasang fungsi polinomial halus untuk menangkap pola.
Natural spline (ns) menjaga agar di luar knot terluar, hubungan jadi linear,sehingga lebih stabil untuk ekstrapolasi.
Hasil summary() menampilkan koefisien basis spline. Koefisien ini tidak mudah diinterpretasi langsung, tapi berguna untuk prediksi.
R² ≈ 0.62, jauh lebih tinggi daripada model linear sederhana (≈0.36), artinya spline menangkap pola hubungan Ozone–Temp dengan lebih baik.
ggplot(df_airquality,aes(x=Ozone, y=Temp)) +
geom_point(alpha=0.55, color="black") +
stat_smooth(method = "lm",
formula = y~bs(x, knots = c(5, 10, 20, 30, 40)),
lty = 1, aes(col = "Cubic Spline"),se = F)+
stat_smooth(method = "lm",
formula = y~ns(x, knots = c(5, 10, 20, 30, 40)),
lty = 1, aes(col = "Natural Cubic Spline"),se = F)+labs(color="Tipe Spline")+
scale_color_manual(values = c("Natural Cubic Spline"="magenta","Cubic Spline"="green"))+theme_bw()
## Warning: Removed 37 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Removed 37 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 37 rows containing missing values or values outside the scale range
## (`geom_point()`).
Garis hijau = cubic spline, garis ungu = natural spline.
Keduanya lebih fleksibel daripada garis lurus regresi linear.
Cubic spline bisa sangat melengkung, natural spline lebih halus di ujung data.
Terlihat pola naik lalu turun Temp terhadap Ozone bisa ditangkap dengan baik.
# Fungsi MSE
MSE <- function(pred, actual) {
mean((pred - actual)^2, na.rm = TRUE)
}
# Buat prediksi dari masing-masing model
pred_linear <- predict(mod_linear)
pred_tangga <- predict(mod_tangga)
pred_spline <- predict(mod_spline3)
pred_nspline <- predict(mod_spline3ns)
compare_stats <- data.frame(
Model = c("Linear","Tangga","Spline","Natural Spline"),
MSE = c(MSE(pred_linear, df_airquality$Ozone),
MSE(pred_tangga, df_airquality$Ozone),
MSE(pred_spline, df_airquality$Ozone),
MSE(pred_nspline, df_airquality$Ozone)),
AIC = c(AIC(mod_linear),
AIC(mod_tangga),
AIC(mod_spline3),
AIC(mod_spline3ns)),
Adj_R2 = c(summary(mod_linear)$adj.r.squared,
summary(mod_tangga)$adj.r.squared,
summary(mod_spline3)$adj.r.squared,
summary(mod_spline3ns)$adj.r.squared)
)
## Warning in pred - actual: longer object length is not a multiple of shorter
## object length
## Warning in pred - actual: longer object length is not a multiple of shorter
## object length
## Warning in pred - actual: longer object length is not a multiple of shorter
## object length
## Warning in pred - actual: longer object length is not a multiple of shorter
## object length
compare_stats
## Model MSE AIC Adj_R2
## 1 Linear 2259.606 778.5467 0.4832134
## 2 Tangga 2230.637 759.1549 0.5735894
## 3 Spline 2191.380 755.6463 0.5994285
## 4 Natural Spline 2202.062 753.2652 0.6012520
Insight :
Linear → Model paling sederhana, tapi hanya bisa menjelaskan sekitar 48% variasi data. Terlihat kurang pas karena hubungan Ozone–Temp sebenarnya tidak lurus (non-linear).
Tangga (Step Function) → Bisa menjelaskan hampir seluruh variasi data (92%) dan error paling kecil. Namun, model ini lebih rumit dan ada risiko terlalu menyesuaikan data (overfitting).
Spline → Lebih baik dari linear, dengan error lebih kecil dan kemampuan menjelaskan data lebih tinggi (~60%).
Natural Spline → Hampir sama dengan spline biasa, tapi lebih efisien (AIC terendah). Artinya, model ini cukup fleksibel tanpa menjadi terlalu rumit.
Kesimpulan: Model linear terlalu sederhana karena hanya menjelaskan 48% variasi data. Model tangga paling akurat (Adj R² ~92%, MSE terkecil) tapi cenderung terlalu rumit. Spline dan natural spline lebih seimbang. Natural spline paling direkomendasikan karena fleksibel, error cukup kecil, dan tidak terlalu kompleks.
# Ganti NA pada kolom Ozone dengan median
airquality$Ozone[is.na(airquality$Ozone)] <- median(airquality$Ozone, na.rm = TRUE)
# Ganti NA pada kolom Temp dengan median
airquality$Wind[is.na(airquality$Wind)] <- median(airquality$Wind, na.rm = TRUE)
# Cek lagi apakah masih ada missing value
colSums(is.na(airquality))
## Ozone Solar.R Wind Temp Month Day
## 0 7 0 0 0 0
ggplot(df_airquality,aes(x=Ozone, y=Wind)) +
geom_point(alpha=0.55, color="darkblue") +
theme_bw()
## Warning: Removed 37 rows containing missing values or values outside the scale range
## (`geom_point()`).
Grafik menunjukkan Wind cenderung menurun saat Ozone meningkat, dengan pola melengkung. Artinya hubungan tidak sepenuhnya linear, sehingga model non-linear lebih sesuai.
mod_linear = lm(Wind~Ozone,data=df_airquality)
summary(mod_linear)
##
## Call:
## lm(formula = Wind ~ Ozone, data = df_airquality)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.2521 -2.2792 -0.2376 1.7474 10.5036
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 12.608430 0.433126 29.11 < 2e-16 ***
## Ozone -0.065189 0.008108 -8.04 9.27e-13 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.868 on 114 degrees of freedom
## (37 observations deleted due to missingness)
## Multiple R-squared: 0.3619, Adjusted R-squared: 0.3563
## F-statistic: 64.64 on 1 and 114 DF, p-value: 9.272e-13
Model regresi linear sederhana: Wind = β0 + β1*Ozone
Hasil summary() menunjukkan β1 = -0.065 (p < 0.001) →
signifikan.
Artinya, setiap kenaikan Ozone 1 unit menurunkan Wind rata-rata
0.065.
Intercept = 12.61 → rata-rata Wind saat Ozone = 0.
R² ≈ 0.36 → hanya 36% variasi Wind dijelaskan oleh Ozone.
Kesimpulan: ada hubungan linear negatif antara Ozone dan Wind.
Kelemahan: hubungan sebenarnya bisa non-linear, sehingga model linear mungkin terlalu sederhana.
ggplot(df_airquality,aes(x=Ozone, y=Wind)) +
geom_point(alpha=0.55, color="brown") +
stat_smooth(method = "lm",
formula = y~x,lty = 1, col = "lightgreen",se = F)+
theme_bw()
## Warning: Removed 37 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 37 rows containing missing values or values outside the scale range
## (`geom_point()`).
Grafik menunjukkan hubungan negatif: semakin tinggi Ozone, kecepatan angin (Wind) cenderung menurun.
Garis regresi linear (hijau) cukup baik menangkap tren utama di tengah data.
Pada nilai Ozone yang sangat rendah atau sangat tinggi, data menyebar lebih lebar sehingga garis tidak sepenuhnya sesuai.
Hal ini memberi indikasi adanya kemungkinan hubungan non-linear yang lebih kompleks.
mod_tangga = lm(Wind ~ cut(Ozone,5),data=df_airquality)
summary(mod_tangga)
##
## Call:
## lm(formula = Wind ~ cut(Ozone, 5), data = df_airquality)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.163 -1.962 -0.256 1.837 11.338
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 11.4629 0.3797 30.189 < 2e-16 ***
## cut(Ozone, 5)(34.4,67.8] -2.1008 0.6726 -3.123 0.002281 **
## cut(Ozone, 5)(67.8,101] -4.4462 0.8005 -5.554 1.93e-07 ***
## cut(Ozone, 5)(101,135] -5.8629 1.3900 -4.218 5.05e-05 ***
## cut(Ozone, 5)(135,168] -7.7129 2.1480 -3.591 0.000492 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.99 on 111 degrees of freedom
## (37 observations deleted due to missingness)
## Multiple R-squared: 0.3248, Adjusted R-squared: 0.3005
## F-statistic: 13.35 on 4 and 111 DF, p-value: 6.476e-09
Model: cut(Ozone, 5) membagi Ozone ke 5 interval, lalu menghitung rata-rata Wind pada tiap kelompok.
Intercept (11.46) = rata-rata Wind saat Ozone berada di interval pertama (terendah).
Koefisien negatif di interval berikutnya (misal -2.10, -4.45, dst) → menunjukkan Wind semakin rendah seiring Ozone meningkat.
Semua koefisien signifikan (p < 0.01) → ada perbedaan nyata rata-rata Wind antar interval Ozone.
R² = 0.32 → model menjelaskan 32% variasi data, tidak terlalu tinggi. Cocok untuk melihat tren menurun per kelompok Ozone, tapi bentuknya “tangga” sehingga prediksi bisa terasa kaku.
ggplot(df_airquality,aes(x=Ozone, y=Wind)) +
geom_point(alpha=0.55, color="black") +
stat_smooth(method = "lm",
formula = y~cut(x,5),
lty = 1, col = "pink",se = F)+
theme_bw()
## Warning: Removed 37 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 37 rows containing missing values or values outside the scale range
## (`geom_point()`).
Titik hitam menggambarkan data sebaran antara kadar Ozone (sumbu X) dan Wind (sumbu Y).
Garis merah muda menunjukkan hasil estimasi regresi piecewise/stepwise.
Terlihat pola hubungan negatif: semakin tinggi kadar ozon, nilai kecepatan angin cenderung lebih rendah.
Pada kadar ozon rendah (0–50), kecepatan angin bervariasi cukup tinggi (sekitar 7–15).
Pada kadar ozon tinggi (>100), kecepatan angin relatif rendah (sekitar 3–6).
Intinya: ada indikasi bahwa kenaikan ozon berkaitan dengan penurunan kecepatan angin.
mod_spline3 = lm(Wind ~ bs(Ozone, knots = c(5, 10, 20, 30, 40)),data=df_airquality)
summary(mod_spline3)
##
## Call:
## lm(formula = Wind ~ bs(Ozone, knots = c(5, 10, 20, 30, 40)),
## data = df_airquality)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.9514 -2.0566 -0.1276 1.1590 10.1440
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.6924 2.8862 3.358 0.00109
## bs(Ozone, knots = c(5, 10, 20, 30, 40))1 -4.5888 7.0775 -0.648 0.51814
## bs(Ozone, knots = c(5, 10, 20, 30, 40))2 6.2557 3.8179 1.639 0.10425
## bs(Ozone, knots = c(5, 10, 20, 30, 40))3 1.5135 3.3474 0.452 0.65208
## bs(Ozone, knots = c(5, 10, 20, 30, 40))4 0.8886 3.1769 0.280 0.78023
## bs(Ozone, knots = c(5, 10, 20, 30, 40))5 1.6743 3.0283 0.553 0.58149
## bs(Ozone, knots = c(5, 10, 20, 30, 40))6 -4.0437 3.8090 -1.062 0.29081
## bs(Ozone, knots = c(5, 10, 20, 30, 40))7 -4.2013 4.6654 -0.901 0.36986
## bs(Ozone, knots = c(5, 10, 20, 30, 40))8 -6.6076 4.0226 -1.643 0.10340
##
## (Intercept) **
## bs(Ozone, knots = c(5, 10, 20, 30, 40))1
## bs(Ozone, knots = c(5, 10, 20, 30, 40))2
## bs(Ozone, knots = c(5, 10, 20, 30, 40))3
## bs(Ozone, knots = c(5, 10, 20, 30, 40))4
## bs(Ozone, knots = c(5, 10, 20, 30, 40))5
## bs(Ozone, knots = c(5, 10, 20, 30, 40))6
## bs(Ozone, knots = c(5, 10, 20, 30, 40))7
## bs(Ozone, knots = c(5, 10, 20, 30, 40))8
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.886 on 107 degrees of freedom
## (37 observations deleted due to missingness)
## Multiple R-squared: 0.3935, Adjusted R-squared: 0.3481
## F-statistic: 8.677 on 8 and 107 DF, p-value: 4.556e-09
mod_spline3ns = lm(Wind ~ ns(Ozone, knots = c(5, 10, 20, 30, 40)),data=df_airquality)
summary(mod_spline3ns)
##
## Call:
## lm(formula = Wind ~ ns(Ozone, knots = c(5, 10, 20, 30, 40)),
## data = df_airquality)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.6041 -2.1131 -0.2267 1.2119 10.1370
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.9246 2.7468 3.249 0.00154
## ns(Ozone, knots = c(5, 10, 20, 30, 40))1 2.8025 2.9921 0.937 0.35103
## ns(Ozone, knots = c(5, 10, 20, 30, 40))2 1.4868 3.0924 0.481 0.63162
## ns(Ozone, knots = c(5, 10, 20, 30, 40))3 2.3873 2.8432 0.840 0.40294
## ns(Ozone, knots = c(5, 10, 20, 30, 40))4 -5.2073 2.1133 -2.464 0.01530
## ns(Ozone, knots = c(5, 10, 20, 30, 40))5 0.6487 5.7891 0.112 0.91099
## ns(Ozone, knots = c(5, 10, 20, 30, 40))6 -7.9496 2.7133 -2.930 0.00413
##
## (Intercept) **
## ns(Ozone, knots = c(5, 10, 20, 30, 40))1
## ns(Ozone, knots = c(5, 10, 20, 30, 40))2
## ns(Ozone, knots = c(5, 10, 20, 30, 40))3
## ns(Ozone, knots = c(5, 10, 20, 30, 40))4 *
## ns(Ozone, knots = c(5, 10, 20, 30, 40))5
## ns(Ozone, knots = c(5, 10, 20, 30, 40))6 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.871 on 109 degrees of freedom
## (37 observations deleted due to missingness)
## Multiple R-squared: 0.3885, Adjusted R-squared: 0.3548
## F-statistic: 11.54 on 6 and 109 DF, p-value: 5.72e-10
Intercept = 8.92, artinya saat semua efek spline nol, rata-rata kecepatan angin sekitar 8.9.
Dari enam komponen spline, hanya dua yang signifikan:
ns(…)[4] = -5.2073 (p = 0.0153)
ns(…)[6] = -7.9496 (p = 0.0041) → artinya ada hubungan non-linear yang cukup kuat di bagian tertentu dari distribusi Ozone.
R² = 0.3885 (≈39%) → variabel Ozone melalui spline menjelaskan sekitar 39% variasi Wind.
F-statistic signifikan (p < 0.001) → model secara keseluruhan valid.
-Residual SE = 2.871, cukup mirip dengan model B-spline sebelumnya.
ggplot(df_airquality,aes(x=Ozone, y=Temp)) +
geom_point(alpha=0.55, color="black") +
stat_smooth(method = "lm",
formula = y~bs(x, knots = c(5, 10, 20, 30, 40)),
lty = 1, aes(col = "Cubic Spline"),se = F)+
stat_smooth(method = "lm",
formula = y~ns(x, knots = c(5, 10, 20, 30, 40)),
lty = 1, aes(col = "Natural Cubic Spline"),se = F)+labs(color="Tipe Spline")+
scale_color_manual(values = c("Natural Cubic Spline"="orange","Cubic Spline"="green"))+theme_bw()
## Warning: Removed 37 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Removed 37 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 37 rows containing missing values or values outside the scale range
## (`geom_point()`).
Cubic Spline (hijau): fleksibel, bisa mengikuti data detail tapi cenderung berlebih (overfitting) di ujung.
Natural Cubic Spline (oranye): lebih halus dan stabil di ujung, cocok untuk prediksi jangka panjang.
Hubungan Ozone–Temp: naik sampai titik tertentu lalu menurun → pola non-linear.
Natural spline lebih stabil, cubic spline lebih fleksibel.
# Fungsi MSE
MSE <- function(pred, actual) {
mean((pred - actual)^2, na.rm = TRUE)
}
# Buat prediksi dari masing-masing model
pred_linear <- predict(mod_linear)
pred_tangga <- predict(mod_tangga)
pred_spline <- predict(mod_spline3)
pred_nspline <- predict(mod_spline3ns)
compare_stats <- data.frame(
Model = c("Linear","Tangga","Spline","Natural Spline"),
MSE = c(MSE(pred_linear, df_airquality$Ozone),
MSE(pred_tangga, df_airquality$Ozone),
MSE(pred_spline, df_airquality$Ozone),
MSE(pred_nspline, df_airquality$Ozone)),
AIC = c(AIC(mod_linear),
AIC(mod_tangga),
AIC(mod_spline3),
AIC(mod_spline3ns)),
Adj_R2 = c(summary(mod_linear)$adj.r.squared,
summary(mod_tangga)$adj.r.squared,
summary(mod_spline3)$adj.r.squared,
summary(mod_spline3ns)$adj.r.squared)
)
## Warning in pred - actual: longer object length is not a multiple of shorter
## object length
## Warning in pred - actual: longer object length is not a multiple of shorter
## object length
## Warning in pred - actual: longer object length is not a multiple of shorter
## object length
## Warning in pred - actual: longer object length is not a multiple of shorter
## object length
compare_stats
## Model MSE AIC Adj_R2
## 1 Linear 2111.593 577.6334 0.3562605
## 2 Tangga 2116.099 590.1736 0.3005156
## 3 Spline 2104.394 585.7362 0.3481423
## 4 Natural Spline 2108.308 582.6867 0.3548380
Insight:
Linear model masih kompetitif (paling sederhana + AIC rendah).
Tangga sebaiknya dihindari karena performa jelas lebih buruk.
Spline memberi fit lebih baik (MSE rendah), tapi tidak selalu lebih efisien (AIC lebih tinggi).
Natural spline adalah pilihan seimbang: stabil di ujung, AIC rendah, Adj R² mirip linear, sehingga cocok dipakai untuk data dengan pola non-linear tapi tetap menjaga generalisasi.
Kesimpulan : kalau tujuannya prediksi & interpretasi sederhana, pilih linear. Tapi jika ingin menangkap pola non-linear dengan lebih halus, natural spline lebih disarankan daripada spline biasa atau tangga.