library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.0.5
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5     v purrr   0.3.4
## v tibble  3.1.6     v dplyr   1.0.8
## v tidyr   1.2.0     v stringr 1.4.0
## v readr   2.1.2     v forcats 0.5.1
## Warning: package 'ggplot2' was built under R version 4.0.5
## Warning: package 'tibble' was built under R version 4.0.5
## Warning: package 'tidyr' was built under R version 4.0.5
## Warning: package 'readr' was built under R version 4.0.5
## Warning: package 'dplyr' was built under R version 4.0.5
## Warning: package 'forcats' was built under R version 4.0.4
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(splines)

Data

data("triceps",package="MultiKink")
ggplot(triceps,aes(x=age, y=triceps)) +
                 geom_point(alpha=0.55, color="black") + 
                 theme_bw() 

Regresi Linear

mod_linear = lm(triceps~age,data=triceps)
summary(mod_linear)
## 
## Call:
## lm(formula = triceps ~ age, data = triceps)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -12.9512  -2.3965  -0.5154   1.5822  25.1233 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  6.19717    0.21244   29.17   <2e-16 ***
## age          0.21584    0.01014   21.28   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.007 on 890 degrees of freedom
## Multiple R-squared:  0.3372, Adjusted R-squared:  0.3365 
## F-statistic: 452.8 on 1 and 890 DF,  p-value: < 2.2e-16
ggplot(triceps,aes(x=age, y=triceps)) +
                 geom_point(alpha=0.55, color="black") +
   stat_smooth(method = "lm", 
               formula = y~x,lty = 1, col = "blue",se = F)+
  theme_bw()

Regresi Polinomial

mod_polinomial2 = lm(triceps ~ poly(age,2,raw = T),data=triceps)
summary(mod_polinomial2)
## 
## Call:
## lm(formula = triceps ~ poly(age, 2, raw = T), data = triceps)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -12.5677  -2.4401  -0.4587   1.6368  24.9961 
## 
## Coefficients:
##                          Estimate Std. Error t value Pr(>|t|)    
## (Intercept)             6.0229191  0.3063806  19.658  < 2e-16 ***
## poly(age, 2, raw = T)1  0.2434733  0.0364403   6.681 4.17e-11 ***
## poly(age, 2, raw = T)2 -0.0006257  0.0007926  -0.789     0.43    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.008 on 889 degrees of freedom
## Multiple R-squared:  0.3377, Adjusted R-squared:  0.3362 
## F-statistic: 226.6 on 2 and 889 DF,  p-value: < 2.2e-16
  ggplot(triceps,aes(x=age, y=triceps)) + 
  geom_point(alpha=0.55, color="black") +
  stat_smooth(method = "lm", 
               formula = y~poly(x,2,raw=T), 
               lty = 1, col = "blue",se = F)+
  theme_bw()

mod_polinomial3 = lm(triceps ~ poly(age,3,raw = T),data=triceps)
summary(mod_polinomial3)
## 
## Call:
## lm(formula = triceps ~ poly(age, 3, raw = T), data = triceps)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -11.5832  -1.9284  -0.5415   1.3283  24.4440 
## 
## Coefficients:
##                          Estimate Std. Error t value Pr(>|t|)    
## (Intercept)             8.004e+00  3.831e-01  20.893  < 2e-16 ***
## poly(age, 3, raw = T)1 -3.157e-01  7.721e-02  -4.089 4.73e-05 ***
## poly(age, 3, raw = T)2  3.101e-02  3.964e-03   7.824 1.45e-14 ***
## poly(age, 3, raw = T)3 -4.566e-04  5.612e-05  -8.135 1.38e-15 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.868 on 888 degrees of freedom
## Multiple R-squared:  0.3836, Adjusted R-squared:  0.3815 
## F-statistic: 184.2 on 3 and 888 DF,  p-value: < 2.2e-16

Regresi Fungsi Tangga

mod_tangga = lm(triceps ~ cut(age,5),data=triceps)
summary(mod_tangga)
## 
## Call:
## lm(formula = triceps ~ cut(age, 5), data = triceps)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -10.5474  -2.0318  -0.4465   1.3682  23.3759 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              7.2318     0.1994  36.260  < 2e-16 ***
## cut(age, 5)(10.6,20.9]   1.6294     0.3244   5.023 6.16e-07 ***
## cut(age, 5)(20.9,31.2]   5.9923     0.4222  14.192  < 2e-16 ***
## cut(age, 5)(31.2,41.5]   7.5156     0.4506  16.678  < 2e-16 ***
## cut(age, 5)(41.5,51.8]   7.4561     0.5543  13.452  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.939 on 887 degrees of freedom
## Multiple R-squared:  0.3617, Adjusted R-squared:  0.3588 
## F-statistic: 125.7 on 4 and 887 DF,  p-value: < 2.2e-16
ggplot(triceps,aes(x=age, y=triceps)) +
                 geom_point(alpha=0.55, color="black") +
  stat_smooth(method = "lm", 
               formula = y~cut(x,5), 
               lty = 1, col = "blue",se = F)+
  theme_bw()

Regresi Splines

mod_spline3 = lm(triceps ~ bs(age, knots = c(5, 10, 20, 30, 40)),data=triceps)
summary(mod_spline3)
## 
## Call:
## lm(formula = triceps ~ bs(age, knots = c(5, 10, 20, 30, 40)), 
##     data = triceps)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -11.5234  -1.6912  -0.2917   1.1356  23.0922 
## 
## Coefficients:
##                                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                              6.9598     0.9729   7.154 1.77e-12 ***
## bs(age, knots = c(5, 10, 20, 30, 40))1   2.5367     1.7154   1.479   0.1396    
## bs(age, knots = c(5, 10, 20, 30, 40))2  -0.3032     0.9629  -0.315   0.7529    
## bs(age, knots = c(5, 10, 20, 30, 40))3  -1.9092     1.2993  -1.469   0.1421    
## bs(age, knots = c(5, 10, 20, 30, 40))4   7.4056     1.2179   6.081 1.78e-09 ***
## bs(age, knots = c(5, 10, 20, 30, 40))5   6.1050     1.4043   4.347 1.54e-05 ***
## bs(age, knots = c(5, 10, 20, 30, 40))6  10.1770     1.5427   6.597 7.23e-11 ***
## bs(age, knots = c(5, 10, 20, 30, 40))7   3.9428     1.9082   2.066   0.0391 *  
## bs(age, knots = c(5, 10, 20, 30, 40))8  10.1473     1.7545   5.784 1.01e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.743 on 883 degrees of freedom
## Multiple R-squared:  0.4261, Adjusted R-squared:  0.4209 
## F-statistic: 81.94 on 8 and 883 DF,  p-value: < 2.2e-16
mod_spline3ns = lm(triceps ~ ns(age, knots = c(5, 10, 20, 30, 40)),data=triceps)
summary(mod_spline3ns)
## 
## Call:
## lm(formula = triceps ~ ns(age, knots = c(5, 10, 20, 30, 40)), 
##     data = triceps)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -10.4875  -1.6873  -0.3665   1.1146  22.8643 
## 
## Coefficients:
##                                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                              8.3811     0.5219  16.059  < 2e-16 ***
## ns(age, knots = c(5, 10, 20, 30, 40))1  -3.5592     0.6712  -5.303 1.44e-07 ***
## ns(age, knots = c(5, 10, 20, 30, 40))2   5.7803     1.0379   5.569 3.39e-08 ***
## ns(age, knots = c(5, 10, 20, 30, 40))3   5.5118     0.9416   5.853 6.78e-09 ***
## ns(age, knots = c(5, 10, 20, 30, 40))4   6.9070     0.9050   7.632 5.99e-14 ***
## ns(age, knots = c(5, 10, 20, 30, 40))5   5.4136     1.3783   3.928 9.24e-05 ***
## ns(age, knots = c(5, 10, 20, 30, 40))6   6.6460     1.0829   6.137 1.27e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.759 on 885 degrees of freedom
## Multiple R-squared:  0.4199, Adjusted R-squared:  0.416 
## F-statistic: 106.8 on 6 and 885 DF,  p-value: < 2.2e-16
ggplot(triceps,aes(x=age, y=triceps)) +
                 geom_point(alpha=0.55, color="black") +
  stat_smooth(method = "lm", 
               formula = y~bs(x, knots = c(5, 10, 20, 30, 40)), 
               lty = 1, aes(col = "Cubic Spline"),se = F)+
    stat_smooth(method = "lm", 
               formula = y~ns(x, knots = c(5, 10, 20, 30, 40)), 
               lty = 1, aes(col = "Natural Cubic Spline"),se = F)+labs(color="Tipe Spline")+
  scale_color_manual(values = c("Natural Cubic Spline"="red","Cubic Spline"="blue"))+theme_bw()

Model

Membuat fungsi MSE

MSE = function(pred,actual){
  mean((pred-actual)^2)
}

Membandingkan Model

data.frame(Linear=MSE(predict(mod_linear),triceps$triceps),
           Poly2=MSE(predict(mod_polinomial2),triceps$triceps),
           Poly3=MSE(predict(mod_polinomial3),triceps$triceps),
           Tangga=MSE(predict(mod_tangga),triceps$triceps),
           Spline=MSE(predict(mod_spline3),triceps$triceps),
           NSpline=MSE(predict(mod_spline3ns),triceps$triceps)
           )
##     Linear    Poly2    Poly3   Tangga   Spline  NSpline
## 1 16.01758 16.00636 14.89621 15.42602 13.87026 14.01904

Smoothing Splines

Menurut Harezlak dan Teixeira-Pinto (2020), pendekatan spline dapat pula dilakukan dengan ide yang mirip seperti yang dilakukan pada regularisasi. Metode smoothing spline menggunakan penalti pada jumlah kuadrat sisaan seperti yang terlihat pada rumus berikut ini:

Fungsi smooth.spline() dapat digunakan untuk melakukan smoothing spline pada R. Sebagai ilustrasi, data triceps yang tersedia di package MultiKink akan digunakan.

data("triceps")
## Warning in data("triceps"): data set 'triceps' not found
sspline <- smooth.spline(triceps$age, triceps$triceps,lambda=0.5)
plot(triceps$age, triceps$triceps)
lines(sspline, col="blue")

Seandainya kita menggunakan lambda yang lebih kecil, maka hasil pemulusan menjadi lebih kasar, seperti yang terlihat pada ilustrasi berikut.

sspline2 <- smooth.spline(triceps$age, triceps$triceps,lambda=0.0001)
plot(triceps$age, triceps$triceps)
lines(sspline2, col="blue")

Pengaturan juga dapat dilakukan dengan mengubah nilai pada argumen spar.

x = seq(1:18)
y = c(1:3,5,4,7:3,2*(2:5),rep(10,4))
splineres <- function(spar){
  res <- rep(0, length(x))
  for (i in 1:length(x)){
    mod <- smooth.spline(x[-i], y[-i], spar = spar)
    res[i] <- predict(mod, x[i])$y - y[i]
  }
  return(sum(res^2))
}

spars <- seq(0, 1.5, by = 0.001)
ss <- rep(0, length(spars))
for (i in 1:length(spars)){
  ss[i] <- splineres(spars[i])
}
plot(spars, ss, 'l', xlab = 'spar', ylab = 'Cross Validation Residual Sum of Squares' , main = 'CV RSS vs Spar')

spars[which.min(ss)]
## [1] 0.381
# R > spars[which.min(ss)]

Splines for Wage

Berikut ini adalah ilustrasi yang diambil dari Ganegja (2018) menggunakan data Wage yang diambil dari package ISLR. Pada ilustrasi ini, pengaturan dilakukan dengan argumen df.

library(ISLR)
## Warning: package 'ISLR' was built under R version 4.0.5
data(Wage)
attach(Wage)
agelims =range(age)
plot(age ,wage ,xlim=agelims ,cex =.5, col =" darkgrey ")
title (" Smoothing Spline ")
fit=smooth.spline (age ,wage ,df =16)
fit2=smooth.spline (age ,wage ,cv=TRUE)
## Warning in smooth.spline(age, wage, cv = TRUE): cross-validation with non-unique
## 'x' values seems doubtful
lines(fit,col ="red ",lwd =2)
lines(fit2,col =" blue",lwd=2)
legend ("topright",legend =c("16 DF " ,"6.8 DF"), col=c("red "," blue "),
        lty =1, lwd =2, cex =.8)

Local Regression

Masih menggunakan data Wage seperti pada ilustrasi sebelumnya, kali ini kita akan mencoba melakukan pendekatan local regression dengan fungsi loess().

plot(age ,wage ,xlim=agelims ,cex =.5, col =" darkgrey ")
title (" Local Regression ")
fit=loess(wage~age ,span =.2, data=Wage)
fit2=loess(wage~age ,span =.5, data=Wage)
age.grid=seq (from=agelims[1], to=agelims[2])
lines(age.grid,predict(fit ,data.frame(age=age.grid)), col ="red",lwd =2)
lines(age.grid,predict(fit2 ,data.frame(age=age.grid)), col ="blue",lwd =2)
legend("topright",legend =c("Span =0.2" ," Span =0.5"), col=c("red","blue"),lty =1, 
       lwd =2, cex =.8)

Pendekatan ini juga dapat dilakukan dengan fungsi geom_smooth() pada package ggplot2.

library(ggplot2)
ggplot(Wage, aes(age,wage)) +
  geom_point() +
  geom_smooth(method='loess')
## `geom_smooth()` using formula 'y ~ x'

Tuning dapat dilakukan dengan mengatur nilai span, seperti yang dapat dilihat pada ilustrasi berikut.

ggplot(Wage, aes(age,wage)) +
  geom_point() +
  geom_smooth(method='loess', span=0.1, colour="red")
## `geom_smooth()` using formula 'y ~ x'

ggplot(Wage, aes(age,wage)) +
  geom_point() +
  geom_smooth(method='loess', span=0.8, colour="blue") 
## `geom_smooth()` using formula 'y ~ x'

detach(Wage)