library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.0.5
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.6 v dplyr 1.0.8
## v tidyr 1.2.0 v stringr 1.4.0
## v readr 2.1.2 v forcats 0.5.1
## Warning: package 'ggplot2' was built under R version 4.0.5
## Warning: package 'tibble' was built under R version 4.0.5
## Warning: package 'tidyr' was built under R version 4.0.5
## Warning: package 'readr' was built under R version 4.0.5
## Warning: package 'dplyr' was built under R version 4.0.5
## Warning: package 'forcats' was built under R version 4.0.4
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(splines)
data("triceps",package="MultiKink")
ggplot(triceps,aes(x=age, y=triceps)) +
geom_point(alpha=0.55, color="black") +
theme_bw()
mod_linear = lm(triceps~age,data=triceps)
summary(mod_linear)
##
## Call:
## lm(formula = triceps ~ age, data = triceps)
##
## Residuals:
## Min 1Q Median 3Q Max
## -12.9512 -2.3965 -0.5154 1.5822 25.1233
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.19717 0.21244 29.17 <2e-16 ***
## age 0.21584 0.01014 21.28 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.007 on 890 degrees of freedom
## Multiple R-squared: 0.3372, Adjusted R-squared: 0.3365
## F-statistic: 452.8 on 1 and 890 DF, p-value: < 2.2e-16
ggplot(triceps,aes(x=age, y=triceps)) +
geom_point(alpha=0.55, color="black") +
stat_smooth(method = "lm",
formula = y~x,lty = 1, col = "blue",se = F)+
theme_bw()
mod_polinomial2 = lm(triceps ~ poly(age,2,raw = T),data=triceps)
summary(mod_polinomial2)
##
## Call:
## lm(formula = triceps ~ poly(age, 2, raw = T), data = triceps)
##
## Residuals:
## Min 1Q Median 3Q Max
## -12.5677 -2.4401 -0.4587 1.6368 24.9961
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.0229191 0.3063806 19.658 < 2e-16 ***
## poly(age, 2, raw = T)1 0.2434733 0.0364403 6.681 4.17e-11 ***
## poly(age, 2, raw = T)2 -0.0006257 0.0007926 -0.789 0.43
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.008 on 889 degrees of freedom
## Multiple R-squared: 0.3377, Adjusted R-squared: 0.3362
## F-statistic: 226.6 on 2 and 889 DF, p-value: < 2.2e-16
ggplot(triceps,aes(x=age, y=triceps)) +
geom_point(alpha=0.55, color="black") +
stat_smooth(method = "lm",
formula = y~poly(x,2,raw=T),
lty = 1, col = "blue",se = F)+
theme_bw()
mod_polinomial3 = lm(triceps ~ poly(age,3,raw = T),data=triceps)
summary(mod_polinomial3)
##
## Call:
## lm(formula = triceps ~ poly(age, 3, raw = T), data = triceps)
##
## Residuals:
## Min 1Q Median 3Q Max
## -11.5832 -1.9284 -0.5415 1.3283 24.4440
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.004e+00 3.831e-01 20.893 < 2e-16 ***
## poly(age, 3, raw = T)1 -3.157e-01 7.721e-02 -4.089 4.73e-05 ***
## poly(age, 3, raw = T)2 3.101e-02 3.964e-03 7.824 1.45e-14 ***
## poly(age, 3, raw = T)3 -4.566e-04 5.612e-05 -8.135 1.38e-15 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.868 on 888 degrees of freedom
## Multiple R-squared: 0.3836, Adjusted R-squared: 0.3815
## F-statistic: 184.2 on 3 and 888 DF, p-value: < 2.2e-16
mod_tangga = lm(triceps ~ cut(age,5),data=triceps)
summary(mod_tangga)
##
## Call:
## lm(formula = triceps ~ cut(age, 5), data = triceps)
##
## Residuals:
## Min 1Q Median 3Q Max
## -10.5474 -2.0318 -0.4465 1.3682 23.3759
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.2318 0.1994 36.260 < 2e-16 ***
## cut(age, 5)(10.6,20.9] 1.6294 0.3244 5.023 6.16e-07 ***
## cut(age, 5)(20.9,31.2] 5.9923 0.4222 14.192 < 2e-16 ***
## cut(age, 5)(31.2,41.5] 7.5156 0.4506 16.678 < 2e-16 ***
## cut(age, 5)(41.5,51.8] 7.4561 0.5543 13.452 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.939 on 887 degrees of freedom
## Multiple R-squared: 0.3617, Adjusted R-squared: 0.3588
## F-statistic: 125.7 on 4 and 887 DF, p-value: < 2.2e-16
ggplot(triceps,aes(x=age, y=triceps)) +
geom_point(alpha=0.55, color="black") +
stat_smooth(method = "lm",
formula = y~cut(x,5),
lty = 1, col = "blue",se = F)+
theme_bw()
mod_spline3 = lm(triceps ~ bs(age, knots = c(5, 10, 20, 30, 40)),data=triceps)
summary(mod_spline3)
##
## Call:
## lm(formula = triceps ~ bs(age, knots = c(5, 10, 20, 30, 40)),
## data = triceps)
##
## Residuals:
## Min 1Q Median 3Q Max
## -11.5234 -1.6912 -0.2917 1.1356 23.0922
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.9598 0.9729 7.154 1.77e-12 ***
## bs(age, knots = c(5, 10, 20, 30, 40))1 2.5367 1.7154 1.479 0.1396
## bs(age, knots = c(5, 10, 20, 30, 40))2 -0.3032 0.9629 -0.315 0.7529
## bs(age, knots = c(5, 10, 20, 30, 40))3 -1.9092 1.2993 -1.469 0.1421
## bs(age, knots = c(5, 10, 20, 30, 40))4 7.4056 1.2179 6.081 1.78e-09 ***
## bs(age, knots = c(5, 10, 20, 30, 40))5 6.1050 1.4043 4.347 1.54e-05 ***
## bs(age, knots = c(5, 10, 20, 30, 40))6 10.1770 1.5427 6.597 7.23e-11 ***
## bs(age, knots = c(5, 10, 20, 30, 40))7 3.9428 1.9082 2.066 0.0391 *
## bs(age, knots = c(5, 10, 20, 30, 40))8 10.1473 1.7545 5.784 1.01e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.743 on 883 degrees of freedom
## Multiple R-squared: 0.4261, Adjusted R-squared: 0.4209
## F-statistic: 81.94 on 8 and 883 DF, p-value: < 2.2e-16
mod_spline3ns = lm(triceps ~ ns(age, knots = c(5, 10, 20, 30, 40)),data=triceps)
summary(mod_spline3ns)
##
## Call:
## lm(formula = triceps ~ ns(age, knots = c(5, 10, 20, 30, 40)),
## data = triceps)
##
## Residuals:
## Min 1Q Median 3Q Max
## -10.4875 -1.6873 -0.3665 1.1146 22.8643
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.3811 0.5219 16.059 < 2e-16 ***
## ns(age, knots = c(5, 10, 20, 30, 40))1 -3.5592 0.6712 -5.303 1.44e-07 ***
## ns(age, knots = c(5, 10, 20, 30, 40))2 5.7803 1.0379 5.569 3.39e-08 ***
## ns(age, knots = c(5, 10, 20, 30, 40))3 5.5118 0.9416 5.853 6.78e-09 ***
## ns(age, knots = c(5, 10, 20, 30, 40))4 6.9070 0.9050 7.632 5.99e-14 ***
## ns(age, knots = c(5, 10, 20, 30, 40))5 5.4136 1.3783 3.928 9.24e-05 ***
## ns(age, knots = c(5, 10, 20, 30, 40))6 6.6460 1.0829 6.137 1.27e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.759 on 885 degrees of freedom
## Multiple R-squared: 0.4199, Adjusted R-squared: 0.416
## F-statistic: 106.8 on 6 and 885 DF, p-value: < 2.2e-16
ggplot(triceps,aes(x=age, y=triceps)) +
geom_point(alpha=0.55, color="black") +
stat_smooth(method = "lm",
formula = y~bs(x, knots = c(5, 10, 20, 30, 40)),
lty = 1, aes(col = "Cubic Spline"),se = F)+
stat_smooth(method = "lm",
formula = y~ns(x, knots = c(5, 10, 20, 30, 40)),
lty = 1, aes(col = "Natural Cubic Spline"),se = F)+labs(color="Tipe Spline")+
scale_color_manual(values = c("Natural Cubic Spline"="red","Cubic Spline"="blue"))+theme_bw()
MSE = function(pred,actual){
mean((pred-actual)^2)
}
data.frame(Linear=MSE(predict(mod_linear),triceps$triceps),
Poly2=MSE(predict(mod_polinomial2),triceps$triceps),
Poly3=MSE(predict(mod_polinomial3),triceps$triceps),
Tangga=MSE(predict(mod_tangga),triceps$triceps),
Spline=MSE(predict(mod_spline3),triceps$triceps),
NSpline=MSE(predict(mod_spline3ns),triceps$triceps)
)
## Linear Poly2 Poly3 Tangga Spline NSpline
## 1 16.01758 16.00636 14.89621 15.42602 13.87026 14.01904
Menurut Harezlak dan Teixeira-Pinto (2020), pendekatan spline dapat pula dilakukan dengan ide yang mirip seperti yang dilakukan pada regularisasi. Metode smoothing spline menggunakan penalti pada jumlah kuadrat sisaan seperti yang terlihat pada rumus berikut ini:
Fungsi smooth.spline() dapat digunakan untuk melakukan smoothing spline pada R. Sebagai ilustrasi, data triceps yang tersedia di package MultiKink akan digunakan.
data("triceps")
## Warning in data("triceps"): data set 'triceps' not found
sspline <- smooth.spline(triceps$age, triceps$triceps,lambda=0.5)
plot(triceps$age, triceps$triceps)
lines(sspline, col="blue")
Seandainya kita menggunakan lambda yang lebih kecil, maka hasil pemulusan menjadi lebih kasar, seperti yang terlihat pada ilustrasi berikut.
sspline2 <- smooth.spline(triceps$age, triceps$triceps,lambda=0.0001)
plot(triceps$age, triceps$triceps)
lines(sspline2, col="blue")
Pengaturan juga dapat dilakukan dengan mengubah nilai pada argumen spar.
x = seq(1:18)
y = c(1:3,5,4,7:3,2*(2:5),rep(10,4))
splineres <- function(spar){
res <- rep(0, length(x))
for (i in 1:length(x)){
mod <- smooth.spline(x[-i], y[-i], spar = spar)
res[i] <- predict(mod, x[i])$y - y[i]
}
return(sum(res^2))
}
spars <- seq(0, 1.5, by = 0.001)
ss <- rep(0, length(spars))
for (i in 1:length(spars)){
ss[i] <- splineres(spars[i])
}
plot(spars, ss, 'l', xlab = 'spar', ylab = 'Cross Validation Residual Sum of Squares' , main = 'CV RSS vs Spar')
spars[which.min(ss)]
## [1] 0.381
# R > spars[which.min(ss)]
Berikut ini adalah ilustrasi yang diambil dari Ganegja (2018) menggunakan data Wage yang diambil dari package ISLR. Pada ilustrasi ini, pengaturan dilakukan dengan argumen df.
library(ISLR)
## Warning: package 'ISLR' was built under R version 4.0.5
data(Wage)
attach(Wage)
agelims =range(age)
plot(age ,wage ,xlim=agelims ,cex =.5, col =" darkgrey ")
title (" Smoothing Spline ")
fit=smooth.spline (age ,wage ,df =16)
fit2=smooth.spline (age ,wage ,cv=TRUE)
## Warning in smooth.spline(age, wage, cv = TRUE): cross-validation with non-unique
## 'x' values seems doubtful
lines(fit,col ="red ",lwd =2)
lines(fit2,col =" blue",lwd=2)
legend ("topright",legend =c("16 DF " ,"6.8 DF"), col=c("red "," blue "),
lty =1, lwd =2, cex =.8)
Masih menggunakan data Wage seperti pada ilustrasi sebelumnya, kali ini kita akan mencoba melakukan pendekatan local regression dengan fungsi loess().
plot(age ,wage ,xlim=agelims ,cex =.5, col =" darkgrey ")
title (" Local Regression ")
fit=loess(wage~age ,span =.2, data=Wage)
fit2=loess(wage~age ,span =.5, data=Wage)
age.grid=seq (from=agelims[1], to=agelims[2])
lines(age.grid,predict(fit ,data.frame(age=age.grid)), col ="red",lwd =2)
lines(age.grid,predict(fit2 ,data.frame(age=age.grid)), col ="blue",lwd =2)
legend("topright",legend =c("Span =0.2" ," Span =0.5"), col=c("red","blue"),lty =1,
lwd =2, cex =.8)
Pendekatan ini juga dapat dilakukan dengan fungsi geom_smooth() pada package ggplot2.
library(ggplot2)
ggplot(Wage, aes(age,wage)) +
geom_point() +
geom_smooth(method='loess')
## `geom_smooth()` using formula 'y ~ x'
Tuning dapat dilakukan dengan mengatur nilai span, seperti yang dapat dilihat pada ilustrasi berikut.
ggplot(Wage, aes(age,wage)) +
geom_point() +
geom_smooth(method='loess', span=0.1, colour="red")
## `geom_smooth()` using formula 'y ~ x'
ggplot(Wage, aes(age,wage)) +
geom_point() +
geom_smooth(method='loess', span=0.8, colour="blue")
## `geom_smooth()` using formula 'y ~ x'
detach(Wage)