import library

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.2     ✔ tibble    3.3.0
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.1.0     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(splines)
df_airquality <- datasets::airquality
head(df_airquality)
##   Ozone Solar.R Wind Temp Month Day
## 1    41     190  7.4   67     5   1
## 2    36     118  8.0   72     5   2
## 3    12     149 12.6   74     5   3
## 4    18     313 11.5   62     5   4
## 5    NA      NA 14.3   56     5   5
## 6    28      NA 14.9   66     5   6

Cek missing value

colSums(is.na(airquality))
##   Ozone Solar.R    Wind    Temp   Month     Day 
##      37       7       0       0       0       0

berdasarkan hasil pemeriksaan, dataset airquality memiliki missing value pada variabel Ozone dan Solar R

# Mengganti NA pada kolom Ozone dengan median
airquality$Ozone[is.na(airquality$Ozone)] <- median(airquality$Ozone, na.rm = TRUE)

# Mengganti NA pada kolom Solar.R dengan median
airquality$Solar.R[is.na(airquality$Solar.R)] <- median(airquality$Solar.R, na.rm = TRUE)

# Cek lagi apakah masih ada missing value
colSums(is.na(airquality))
##   Ozone Solar.R    Wind    Temp   Month     Day 
##       0       0       0       0       0       0

Data Air Quality

ggplot(df_airquality,aes(x=Solar.R, y=Ozone)) +
                 geom_point(alpha=0.55, color="red") + 
                 theme_bw()
## Warning: Removed 42 rows containing missing values or values outside the scale range
## (`geom_point()`).

Berdasarkan dari grafik terlihat hubungan tidak sepenuhnya linear → ada indikasi kurva. Hal ini menjadi dasar kita perlu coba model non-linear (tangga, spline). # Regresi linear

mod_linear = lm(Ozone~Solar.R,data=df_airquality)
summary(mod_linear)
## 
## Call:
## lm(formula = Ozone ~ Solar.R, data = df_airquality)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -48.292 -21.361  -8.864  16.373 119.136 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 18.59873    6.74790   2.756 0.006856 ** 
## Solar.R      0.12717    0.03278   3.880 0.000179 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 31.33 on 109 degrees of freedom
##   (42 observations deleted due to missingness)
## Multiple R-squared:  0.1213, Adjusted R-squared:  0.1133 
## F-statistic: 15.05 on 1 and 109 DF,  p-value: 0.0001793

Intrepetasi:

  1. β₁ (0.127, p < 0.001) → signifikan → ada hubungan linear positif antara Solar.R dan Ozone. Artinya, setiap kenaikan 1 satuan Solar.R meningkatkan Ozone rata-rata sebesar 0.127.

  2. Intercept (18.60, p < 0.01) → saat Solar.R = 0, kadar Ozone diperkirakan 18.6.

  3. R² = 0.121 → hanya sekitar 12% variasi Ozone yang bisa dijelaskan oleh Solar.R → model masih lemah.

  4. F-statistic signifikan → model secara keseluruhan layak digunakan, meski penjelasan variansnya kecil.

ggplot(df_airquality,aes(x=Solar.R, y=Ozone)) +
                 geom_point(alpha=0.55, color="black") +
   stat_smooth(method = "lm", 
               formula = y~x,lty = 1, col = "blue",se = F)+
  theme_bw()
## Warning: Removed 42 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 42 rows containing missing values or values outside the scale range
## (`geom_point()`).

Dari grafik terlihat bahwa linear fit cukup oke di bagian tengah, tapi di ekor (nilai kecil & besar) tidak mengikuti pola dengan baik → indikasi perlu model lebih fleksibel.

Regresi Fungsi Tangga

mod_tangga = lm(Ozone ~ cut(Solar.R,5),data=df_airquality)
summary(mod_tangga)
## 
## Call:
## lm(formula = Ozone ~ cut(Solar.R, 5), data = df_airquality)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -45.743 -20.647  -6.437  14.853 112.257 
## 
## Coefficients:
##                           Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                 15.167      7.071   2.145 0.034254 *  
## cut(Solar.R, 5)(72.4,138]   10.271     10.308   0.996 0.321338    
## cut(Solar.R, 5)(138,203]    37.214      9.637   3.862 0.000194 ***
## cut(Solar.R, 5)(203,269]    40.576      8.702   4.663 9.13e-06 ***
## cut(Solar.R, 5)(269,334]    29.690      9.637   3.081 0.002629 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 30 on 106 degrees of freedom
##   (42 observations deleted due to missingness)
## Multiple R-squared:  0.2167, Adjusted R-squared:  0.1871 
## F-statistic: 7.331 on 4 and 106 DF,  p-value: 2.984e-05

cut(Solar.R, 3) membagi Solar.R ke dalam 3 interval sama lebar. Artinya model mengestimasi rata-rata Ozone berbeda-beda untuk tiap interval Solar.R. Cocok kalau kita hanya ingin melihat perbedaan rata-rata antar kelompok.

Kelemahan: tidak halus (stepwise), prediksi bisa “loncat-loncat”.

ggplot(df_airquality,aes(x=Solar.R, y=Ozone)) +
                 geom_point(alpha=0.55, color="black") +
  stat_smooth(method = "lm", 
               formula = y~cut(x,5), 
               lty = 1, col = "blue",se = F)+
  theme_bw()
## Warning: Removed 42 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 42 rows containing missing values or values outside the scale range
## (`geom_point()`).

# Regresi Spline

mod_spline3 = lm(Ozone ~ bs(Solar.R, knots = c(5, 10, 20, 30, 40)),data=df_airquality)
summary(mod_spline3)
## 
## Call:
## lm(formula = Ozone ~ bs(Solar.R, knots = c(5, 10, 20, 30, 40)), 
##     data = df_airquality)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -45.255 -18.910  -3.842  15.718 109.745 
## 
## Coefficients: (1 not defined because of singularities)
##                                            Estimate Std. Error t value Pr(>|t|)
## (Intercept)                                  12.665     13.828   0.916  0.36189
## bs(Solar.R, knots = c(5, 10, 20, 30, 40))1    3.279     32.834   0.100  0.92065
## bs(Solar.R, knots = c(5, 10, 20, 30, 40))2  -22.451     51.623  -0.435  0.66454
## bs(Solar.R, knots = c(5, 10, 20, 30, 40))3   28.804     44.766   0.643  0.52136
## bs(Solar.R, knots = c(5, 10, 20, 30, 40))4  -15.621     31.049  -0.503  0.61596
## bs(Solar.R, knots = c(5, 10, 20, 30, 40))5    4.717     22.568   0.209  0.83485
## bs(Solar.R, knots = c(5, 10, 20, 30, 40))6    4.984     23.393   0.213  0.83171
## bs(Solar.R, knots = c(5, 10, 20, 30, 40))7   98.325     35.535   2.767  0.00671
## bs(Solar.R, knots = c(5, 10, 20, 30, 40))8       NA         NA      NA       NA
##                                              
## (Intercept)                                  
## bs(Solar.R, knots = c(5, 10, 20, 30, 40))1   
## bs(Solar.R, knots = c(5, 10, 20, 30, 40))2   
## bs(Solar.R, knots = c(5, 10, 20, 30, 40))3   
## bs(Solar.R, knots = c(5, 10, 20, 30, 40))4   
## bs(Solar.R, knots = c(5, 10, 20, 30, 40))5   
## bs(Solar.R, knots = c(5, 10, 20, 30, 40))6   
## bs(Solar.R, knots = c(5, 10, 20, 30, 40))7 **
## bs(Solar.R, knots = c(5, 10, 20, 30, 40))8   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 29.78 on 103 degrees of freedom
##   (42 observations deleted due to missingness)
## Multiple R-squared:  0.2502, Adjusted R-squared:  0.1993 
## F-statistic:  4.91 on 7 and 103 DF,  p-value: 8.159e-05

Spline memecah rentang Solar.R menjadi beberapa segmen dengan knot (titik belok), lalu pasang polinomial di tiap segmen.

bs() → B-spline: fleksibel, bisa melengkung mengikuti data.

ns() → Natural spline: mirip, tapi di luar knot paling luar dipaksa linear (lebih stabil untuk ekstrapolasi).

quantile() dipakai supaya knot sesuai distribusi data (tidak sembarang angka).

Hasil summary() menunjukkan koefisien basis spline (tidak mudah diinterpretasi langsung, tapi penting untuk prediksi).

ggplot(df_airquality,aes(x=Solar.R, y=Ozone)) +
                 geom_point(alpha=0.55, color="black") +
  stat_smooth(method = "lm", 
               formula = y~bs(x, knots = c(5, 10, 20, 30, 40)), 
               lty = 1, aes(col = "Cubic Spline"),se = F)+
    stat_smooth(method = "lm", 
               formula = y~ns(x, knots = c(5, 10, 20, 30, 40)), 
               lty = 1, aes(col = "Natural Cubic Spline"),se = F)+labs(color="Tipe Spline")+
  scale_color_manual(values = c("Natural Cubic Spline"="red","Cubic Spline"="blue"))+theme_bw()
## Warning: Removed 42 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Removed 42 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 42 rows containing missing values or values outside the scale range
## (`geom_point()`).

Garis biru = cubic spline, merah = natural spline. Keduanya mengikuti pola data lebih baik daripada regresi linear.

Natural spline lebih “jinak” di ekor → cocok untuk data lingkungan yang rawan outlier.

Dari sini terlihat: spline memberi fleksibilitas untuk menangkap hubungan non-linear.

library(splines)

mod_spline3ns <- lm(Ozone ~ ns(Solar.R, df = 3), data = df_airquality)

Komparasi Model

# Fungsi MSE
MSE <- function(pred, actual) {
  mean((pred - actual)^2, na.rm = TRUE)
}

# Buat prediksi dari masing-masing model
pred_linear  <- predict(mod_linear)
pred_tangga  <- predict(mod_tangga)
pred_spline  <- predict(mod_spline3)
pred_nspline <- predict(mod_spline3ns)
compare_stats <- data.frame(
  Model   = c("Linear","Tangga","Spline","Natural Spline"),
  MSE     = c(MSE(pred_linear, df_airquality$Ozone),
              MSE(pred_tangga, df_airquality$Ozone),
              MSE(pred_spline, df_airquality$Ozone),
              MSE(pred_nspline, df_airquality$Ozone)),
  AIC     = c(AIC(mod_linear),
              AIC(mod_tangga),
              AIC(mod_spline3),
              AIC(mod_spline3ns)),
  Adj_R2  = c(summary(mod_linear)$adj.r.squared,
              summary(mod_tangga)$adj.r.squared,
              summary(mod_spline3)$adj.r.squared,
              summary(mod_spline3ns)$adj.r.squared)
)
## Warning in pred - actual: longer object length is not a multiple of shorter
## object length
## Warning in pred - actual: longer object length is not a multiple of shorter
## object length
## Warning in pred - actual: longer object length is not a multiple of shorter
## object length
## Warning in pred - actual: longer object length is not a multiple of shorter
## object length
compare_stats
##            Model      MSE      AIC    Adj_R2
## 1         Linear 1293.729 1083.714 0.1132809
## 2         Tangga 1405.729 1076.964 0.1871312
## 3         Spline 1429.093 1078.108 0.1992623
## 4 Natural Spline 1396.692 1070.553 0.2261018