Nonlinear Regression

Febrian Adhitya Cahya Belardi

Library

library(MultiKink)
library(tidyverse)
library(ggplot2)
library(dplyr)
library(purrr)
library(rsample)
library(ISLR)
library(splines)

Nonlinear Regression Data Auto

Data

Data yang digunakan merupakan Auto Data Set dari package ISLR dengan 392 amatan. Data terdiri dari tiga kolom yaitu, mpg, horsepower, dan origin.

  • mpg: miles per galon

  • horsepower: Engine horsepower

  • origin: Origin of car (1. American, 2. European, 3. Japanese)

AutoData = Auto %>% select(mpg, horsepower, origin)
tibble(AutoData)

Jika kita gambarkan dalam bentuk scatterplot

ggplot(AutoData,aes(x=mpg, y=horsepower)) +
                 geom_point(alpha=0.55, color="black") + 
                 theme_bw() 

Berdasarkan pola hubungan yang terlihat pada scatterplot, kita akan mencoba untuk mencari model yang bisa merepresentasikan pola hubungan tersebut dengan baik.

Regresi Linear

reglinear = lm(horsepower~mpg,data=AutoData)
summarylinear <- summary(reglinear)
summarylinear
## 
## Call:
## lm(formula = horsepower ~ mpg, data = AutoData)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -64.892 -15.716  -2.094  13.108  96.947 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 194.4756     3.8732   50.21   <2e-16 ***
## mpg          -3.8389     0.1568  -24.49   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 24.19 on 390 degrees of freedom
## Multiple R-squared:  0.6059, Adjusted R-squared:  0.6049 
## F-statistic: 599.7 on 1 and 390 DF,  p-value: < 2.2e-16
summarylinear$r.squared
## [1] 0.6059483
tabeldata <- rbind(data.frame("y_aktual"=AutoData$horsepower,
                               "y_pred"=predict(reglinear),
                               "residuals"=residuals(reglinear)))
head(tabeldata)
ggplot(AutoData,aes(x=mpg, y=horsepower)) +
   geom_point(alpha=0.55, color="black") +
   stat_smooth(method = "lm", 
               formula = y~x,lty = 1,
               col = "blue",se = F)+
  theme_bw()

Regresi Polinomial

Derajat 2 (ordo 2)

modelpolinomial = lm(horsepower ~ poly(mpg,2,raw = T),
                     data=AutoData)
summary(modelpolinomial)
## 
## Call:
## lm(formula = horsepower ~ poly(mpg, 2, raw = T), data = AutoData)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -72.52 -11.24  -0.11   9.47  92.98 
## 
## Coefficients:
##                         Estimate Std. Error t value Pr(>|t|)    
## (Intercept)            302.06824    9.25689   32.63   <2e-16 ***
## poly(mpg, 2, raw = T)1 -13.32406    0.77456  -17.20   <2e-16 ***
## poly(mpg, 2, raw = T)2   0.18804    0.01513   12.43   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 20.49 on 389 degrees of freedom
## Multiple R-squared:  0.718,  Adjusted R-squared:  0.7165 
## F-statistic: 495.1 on 2 and 389 DF,  p-value: < 2.2e-16
ggplot(AutoData,aes(x=mpg, y=horsepower)) + 
  geom_point(alpha=0.55, color="black") +
  stat_smooth(method = "lm", 
               formula = y~poly(x,2,raw=T), 
               lty = 1, col = "blue",se = F)+
  theme_bw()

ggplot(AutoData,aes(x=mpg, y=horsepower)) + 
  geom_point(alpha=0.55, color="black") +
  stat_smooth(method = "lm", 
               formula = y~poly(x,2,raw=T), 
               lty = 1, col = "blue",se = T)+
  theme_bw()

Derajat 3 (ordo 3)

modelpolinomial3 = lm(horsepower~ poly(mpg,3,raw = T),data=AutoData)
summary(modelpolinomial3)
## 
## Call:
## lm(formula = horsepower ~ poly(mpg, 3, raw = T), data = AutoData)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -71.935 -11.251  -1.338   9.324  95.459 
## 
## Coefficients:
##                          Estimate Std. Error t value Pr(>|t|)    
## (Intercept)            429.581461  23.823018  18.032  < 2e-16 ***
## poly(mpg, 3, raw = T)1 -30.131244   3.006538 -10.022  < 2e-16 ***
## poly(mpg, 3, raw = T)2   0.866793   0.118533   7.313 1.51e-12 ***
## poly(mpg, 3, raw = T)3  -0.008506   0.001474  -5.770 1.62e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 19.69 on 388 degrees of freedom
## Multiple R-squared:  0.7403, Adjusted R-squared:  0.7382 
## F-statistic: 368.6 on 3 and 388 DF,  p-value: < 2.2e-16
ggplot(AutoData,aes(x=mpg, y=horsepower)) + 
  geom_point(alpha=0.55, color="black") +
  stat_smooth(method = "lm", 
               formula = y~poly(x,3,raw=T), 
               lty = 1, col = "blue",se = T)+
  theme_bw()

Regresi Fungsi Tangga

Fungsi Tangga (7)

modeltangga = lm(horsepower ~ cut(mpg,7),data=AutoData)
summary(modeltangga)
## 
## Call:
## lm(formula = horsepower ~ cut(mpg, 7), data = AutoData)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -52.48 -12.96  -2.25  10.31 105.52 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)             169.692      2.960   57.32   <2e-16 ***
## cut(mpg, 7)(14.4,19.7]  -45.208      3.669  -12.32   <2e-16 ***
## cut(mpg, 7)(19.7,25.1]  -74.908      3.734  -20.06   <2e-16 ***
## cut(mpg, 7)(25.1,30.5]  -88.317      3.885  -22.73   <2e-16 ***
## cut(mpg, 7)(30.5,35.9]  -96.865      4.186  -23.14   <2e-16 ***
## cut(mpg, 7)(35.9,41.2] -100.442      5.268  -19.07   <2e-16 ***
## cut(mpg, 7)(41.2,46.6] -111.978      8.594  -13.03   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 21.35 on 385 degrees of freedom
## Multiple R-squared:  0.6972, Adjusted R-squared:  0.6924 
## F-statistic: 147.7 on 6 and 385 DF,  p-value: < 2.2e-16
ggplot(AutoData,aes(x=mpg, y=horsepower)) +
       geom_point(alpha=0.55, color="black") +
       stat_smooth(method = "lm", 
               formula = y~cut(x,7), 
               lty = 1, col = "blue",se = F)+
  theme_bw()

Fungsi Tangga (10)

modeltangga10= lm(mpg ~ cut(horsepower,10),data=AutoData)
summary(modeltangga10)
## 
## Call:
## lm(formula = mpg ~ cut(horsepower, 10), data = AutoData)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -15.5663  -2.8183  -0.2674   2.3709  16.0337 
## 
## Coefficients:
##                                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                     33.8963     0.8803  38.503  < 2e-16 ***
## cut(horsepower, 10)(64.4,82.8]  -3.3300     0.9976  -3.338 0.000927 ***
## cut(horsepower, 10)(82.8,101]  -10.0780     0.9744 -10.343  < 2e-16 ***
## cut(horsepower, 10)(101,120]   -13.0463     1.1183 -11.666  < 2e-16 ***
## cut(horsepower, 10)(120,138]   -16.0113     1.3495 -11.864  < 2e-16 ***
## cut(horsepower, 10)(138,156]   -18.6289     1.1090 -16.798  < 2e-16 ***
## cut(horsepower, 10)(156,175]   -19.8040     1.5442 -12.825  < 2e-16 ***
## cut(horsepower, 10)(175,193]   -20.5392     1.5065 -13.633  < 2e-16 ***
## cut(horsepower, 10)(193,212]   -22.0963     2.2271  -9.921  < 2e-16 ***
## cut(horsepower, 10)(212,230]   -20.5213     1.8414 -11.145  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.574 on 382 degrees of freedom
## Multiple R-squared:  0.6644, Adjusted R-squared:  0.6565 
## F-statistic: 84.03 on 9 and 382 DF,  p-value: < 2.2e-16
ggplot(AutoData,aes(x=mpg, y=horsepower)) +
       geom_point(alpha=0.55, color="black") +
       stat_smooth(method = "lm", 
               formula = y~cut(x,10), 
               lty = 1, col = "blue",se = F)+
  theme_bw()

Perbandingan Model Nonlinear Regression

Membandingkan model

MSE = function(pred,actual){
  mean((pred-actual)^2)
}
model <- data.frame("Model" = c("Linear", "Polynomial (Ordo=2)","Polynomial (Ordo=3)", "Tangga breaks=7", "Tangga breaks=10"), "AIC" = c(AIC(reglinear),AIC(modelpolinomial),AIC(modelpolinomial3),AIC(modeltangga),AIC(modeltangga10)), "MSE" = c(MSE(predict(reglinear),AutoData$mpg),
              MSE(predict(modelpolinomial),AutoData$mpg),
              MSE(predict(modelpolinomial3),AutoData$mpg),
              MSE(predict(modeltangga),AutoData$mpg),
              MSE(predict(modeltangga10),AutoData$mpg)))
knitr::kable(model)
Model AIC MSE
Linear 3614.324 7987.55223
Polynomial (Ordo=2) 3485.217 8153.09082
Polynomial (Ordo=3) 3454.949 8186.02560
Tangga breaks=7 3521.117 8103.01147
Tangga breaks=10 2316.374 20.39147

Berdasarkan nilai AIC dan MSE, model terbaik saat ini ialah model tangga dengan jumlah 10 selang karena memiliki nilai AIC dan MSE paling rendah dibandingkan model lainnya.

Regresi Spline

#nilai knots yang ditentukan oleh komputer
attr(bs(AutoData$mpg, df=6),"knots")
##   25%   50%   75% 
## 17.00 22.75 29.00

b-spline

Menggunakan knot yang ditentukan fungsi komputer

knotvalue = attr(bs(AutoData$mpg, df=6),"knots")
modelspline1 = lm(horsepower ~ bs(mpg, knots =knotvalue ),data=AutoData)
summary(modelspline1)
## 
## Call:
## lm(formula = horsepower ~ bs(mpg, knots = knotvalue), data = AutoData)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -74.043 -10.103  -0.818   8.075  95.778 
## 
## Coefficients:
##                             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                  195.012     13.181  14.795  < 2e-16 ***
## bs(mpg, knots = knotvalue)1    2.792     18.879   0.148    0.882    
## bs(mpg, knots = knotvalue)2  -80.488     12.649  -6.363 5.62e-10 ***
## bs(mpg, knots = knotvalue)3 -103.774     14.645  -7.086 6.62e-12 ***
## bs(mpg, knots = knotvalue)4 -126.115     14.600  -8.638  < 2e-16 ***
## bs(mpg, knots = knotvalue)5 -127.143     18.836  -6.750 5.45e-11 ***
## bs(mpg, knots = knotvalue)6 -141.413     18.226  -7.759 7.77e-14 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 19.56 on 385 degrees of freedom
## Multiple R-squared:  0.7457, Adjusted R-squared:  0.7417 
## F-statistic: 188.1 on 6 and 385 DF,  p-value: < 2.2e-16
ggplot(AutoData,aes(x=mpg, y=horsepower)) +
                 geom_point(alpha=0.55, color="black") +
  stat_smooth(method = "lm", 
               formula = y~bs(x, knots = knotvalue), 
               lty = 1,se = F)

Natural Spline

modelsplinenatural = lm(horsepower ~ ns(mpg, knots = knotvalue),data=AutoData)
summary(modelsplinenatural)
## 
## Call:
## lm(formula = horsepower ~ ns(mpg, knots = knotvalue), data = AutoData)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -72.461 -10.731  -1.433   8.759  95.650 
## 
## Coefficients:
##                             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                  218.000      6.786   32.13   <2e-16 ***
## ns(mpg, knots = knotvalue)1 -130.511      6.470  -20.17   <2e-16 ***
## ns(mpg, knots = knotvalue)2 -109.368      6.183  -17.69   <2e-16 ***
## ns(mpg, knots = knotvalue)3 -237.134     15.815  -14.99   <2e-16 ***
## ns(mpg, knots = knotvalue)4 -115.116      8.615  -13.36   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 19.62 on 387 degrees of freedom
## Multiple R-squared:  0.7429, Adjusted R-squared:  0.7403 
## F-statistic: 279.6 on 4 and 387 DF,  p-value: < 2.2e-16
ggplot(AutoData,aes(x=mpg, y=horsepower)) +
                 geom_point(alpha=0.55, color="black")+
      stat_smooth(method = "lm", 
                 formula = y~ns(x, knots = knotvalue), 
                 lty = 1,se=F)

## Smoothing Spline

Fungsi smooth.spline() dapat digunakan untuk melakukan smoothing spline pada R

modelsmooth<- with(data = AutoData,smooth.spline(horsepower,mpg))
modelsmooth 
## Call:
## smooth.spline(x = horsepower, y = mpg)
## 
## Smoothing Parameter  spar= 0.2648644  lambda= 5.101567e-07 (12 iterations)
## Equivalent Degrees of Freedom (Df): 42.14987
## Penalized Criterion (RSS): 1117.657
## GCV: 18.64132

Fungsi smooth.spline secara otomatis memilih paramter lambda terbaik dengan menggunakan Cross-Validation (CV) dan Generalized Cross-Validation (GCV). Banyaknya folds (lipatan) pada CV dan GCV ini adalah sejumlah observasinya.

predictiondata <- broom::augment(modelsmooth)

ggplot(predictiondata,aes(x=x,y=y))+
  geom_point(alpha=0.55, color="black")+
  geom_line(aes(y=.fitted),col="blue",
            lty=1)+
  xlab("mpg")+
  ylab("horsepower")+
  theme_bw()

Pengaruh parameter lambda terhadap tingkat smoothness kurva bisa dilihat dari ilustrasi berikut:

modelsmoothlambda <- data.frame(lambda=seq(0,5,by=0.5)) %>% 
  group_by(lambda) %>% 
  do(broom::augment(with(data = AutoData,smooth.spline(mpg,horsepower,lambda = .$lambda))))

p <- ggplot(modelsmoothlambda,
       aes(x=x,y=y))+
  geom_line(aes(y=.fitted),
            col="blue",
            lty=1
            )+
  facet_wrap(~lambda)
p

Jika kita tentukan df=7, maka hasil kurva model smooth.spline akan lebih merepresentasikan data

modelsmooth2 <- with(data = AutoData,smooth.spline(horsepower,mpg,df=7))
modelsmooth2 
## Call:
## smooth.spline(x = horsepower, y = mpg, df = 7)
## 
## Smoothing Parameter  spar= 0.7927002  lambda= 0.003320238 (12 iterations)
## Equivalent Degrees of Freedom (Df): 6.999082
## Penalized Criterion (RSS): 2424.819
## GCV: 18.84972
pred_data <- broom::augment(modelsmooth2)

ggplot(pred_data,aes(x=x,y=y))+
  geom_point(alpha=0.55, color="black")+
  geom_line(aes(y=.fitted),col="blue",
            lty=1)+
  xlab("mpg")+
  ylab("horsepower")+
  theme_bw()

LOESS

Kali ini kita akan mencoba melakukan pendekatan local regression dengan fungsi loess().

modloess <- loess(horsepower~mpg,
                     data = AutoData)
summary(modloess)
## Call:
## loess(formula = horsepower ~ mpg, data = AutoData)
## 
## Number of Observations: 392 
## Equivalent Number of Parameters: 4.8 
## Residual Standard Error: 19.58 
## Trace of smoother matrix: 5.24  (exact)
## 
## Control settings:
##   span     :  0.75 
##   degree   :  2 
##   family   :  gaussian
##   surface  :  interpolate      cell = 0.2
##   normalize:  TRUE
##  parametric:  FALSE
## drop.square:  FALSE

Pengaruh parameter span terhadap tingkat smoothness kurva bisa dilihat dari ilustrasi berikut:

modloessspan <- data.frame(span=seq(0.1,5,by=0.5)) %>% 
  group_by(span) %>% 
  do(broom::augment(loess(horsepower~mpg,
                     data = AutoData,span=.$span)))
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : pseudoinverse used at 14
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : neighborhood radius 1
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : reciprocal condition number 6.6352e-16
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : There are other near singularities as well. 1
p2 <- ggplot(modloessspan,
       aes(x=mpg,y=horsepower))+
  geom_line(aes(y=.fitted),
            col="blue",
            lty=1
            )+
  facet_wrap(~span)
p2

Pendekatan ini juga dapat dilakukan dengan fungsi stat_smooth() pada package ggplot2.

library(ggplot2)
ggplot(AutoData, aes(horsepower,mpg)) +
  geom_point(alpha=0.5,color="black") +
  stat_smooth(method='loess',
               formula=y~x,
              span = 0.75,
              col="blue",
              lty=1,
              se=F)

Evaluasi Model Menggunakan Cross Validation

Regresi Linear

set.seed(777)
crossval <- vfold_cv(AutoData,v=10,strata = "horsepower")
metriclinear <- map_dfr(crossval$splits,
    function(x){
    mod <- lm(horsepower ~ mpg,data=AutoData[x$in_id,])
    pred <- predict(mod,newdata=AutoData[-x$in_id,])
    truth <- AutoData[-x$in_id,]$horsepower
    rmse <- mlr3measures::rmse(truth = truth,response = pred)
    mae <- mlr3measures::mae(truth = truth,response = pred)
    metric <- c(rmse,mae)
    names(metric) <- c("rmse","mae")
    return(metric)
  }
)
metriclinear
# menghitung rata-rata untuk 10 folds
meanmetriclinear <- colMeans(metriclinear)
meanmetriclinear
##     rmse      mae 
## 24.04699 18.43851

Polynomial Derajat 2 (ordo 2)

set.seed(777)
crossval <- vfold_cv(AutoData,v=10,strata = "horsepower")
metricpoly <- map_dfr(crossval$splits,
function(x){
  mod <- lm(horsepower ~ poly(mpg,2,raw = T),data=AutoData[x$in_id,])
  pred <- predict(mod,newdata=AutoData[-x$in_id,])
  truth <- AutoData[-x$in_id,]$horsepower
  rmse <- mlr3measures::rmse(truth = truth,response = pred)
  mae <- mlr3measures::mae(truth = truth,response = pred)
  metric <- c(rmse,mae)
  names(metric) <- c("rmse","mae")
  return(metric)
  }
)
metricpoly
# menghitung rata-rata untuk 10 folds
meanmetricpoly <- colMeans(metricpoly)
meanmetricpoly
##     rmse      mae 
## 20.35446 14.88426

Polynomial Derajat 3 (ordo 3)

set.seed(777)
crossval <- vfold_cv(AutoData,v=10,strata = "horsepower")

metricpoly3 <- map_dfr(crossval$splits,
function(x){
  mod <- lm(horsepower ~ poly(mpg,3,raw = T),data=AutoData[x$in_id,])
  pred <- predict(mod,newdata=AutoData[-x$in_id,])
  truth <- AutoData[-x$in_id,]$horsepower
  rmse <- mlr3measures::rmse(truth = truth,response = pred)
  mae <- mlr3measures::mae(truth = truth,response = pred)
  metric <- c(rmse,mae)
  names(metric) <- c("rmse","mae")
  return(metric)
  }
)
metricpoly3
# menghitung rata-rata untuk 10 folds
meanmetricpoly3 <- colMeans(metricpoly3)
meanmetricpoly3
##     rmse      mae 
## 19.48501 14.29354

Fungsi Tangga

set.seed(777)
crossval <- vfold_cv(AutoData,v=10,strata = "horsepower")
breaks <- 7:10
besttangga <- map_dfr(breaks, function(i){
    metrictangga <- map_dfr(crossval$splits,
    function(x){
        training <- AutoData[x$in_id,]
        training$mpg <- cut(training$mpg,i)
        mod <- lm(horsepower ~ mpg ,data=training)
        labs_x <- levels(mod$model[,2])
        labs_x_breaks <- cbind(lower = as.numeric( sub("\\((.+),.*", "\\1", labs_x) ),
                  upper = as.numeric( sub("[^,]*,([^]]*)\\]", "\\1", labs_x) ))
        testing <- AutoData[-x$in_id,]
        mpg_new <- cut(testing$mpg,c(labs_x_breaks[1,1],labs_x_breaks[,2]))
        pred <- predict(mod,newdata=list(mpg=mpg_new))
        truth <- testing$horsepower
        data_eval <- na.omit(data.frame(truth,pred))
        rmse <- mlr3measures::rmse(truth = data_eval$truth,response = data_eval$pred)
        mae <- mlr3measures::mae(truth = data_eval$truth,response = data_eval$pred)
        metric <- c(rmse,mae)
        names(metric) <- c("rmse","mae")
        return(metric)
      }
    )
  metrictangga
  # menghitung rata-rata untuk 10 folds
  meanmetrictangga <- colMeans(metrictangga)
  meanmetrictangga
  }
)

besttangga <- cbind(breaks=breaks,besttangga)
# menampilkan hasil all breaks
besttangga
#berdasarkan rmse
besttangga %>% slice_min(rmse)
#berdasarkan mae
besttangga %>% slice_min(mae)

Spline regression

set.seed(777)
crossval <- vfold_cv(AutoData,v=10,strata = "horsepower")

bestspline <- map_dfr(crossval$splits,
function(x){
  mod <- lm(horsepower ~ bs(mpg,knots=knotvalue),data=AutoData[x$in_id,])
  pred <- predict(mod,newdata=AutoData[-x$in_id,])
  truth <- AutoData[-x$in_id,]$horsepower
  rmse <- mlr3measures::rmse(truth = truth,response = pred)
  mae <- mlr3measures::mae(truth = truth,response = pred)
  metric <- c(rmse,mae)
  names(metric) <- c("rmse","mae")
  return(metric)
  }
)
## Warning in bs(mpg, degree = 3L, knots = c(`25%` = 17, `50%` = 22.75, `75%` = 29:
## some 'x' values beyond boundary knots may cause ill-conditioned bases
## Warning in bs(mpg, degree = 3L, knots = c(`25%` = 17, `50%` = 22.75, `75%` = 29:
## some 'x' values beyond boundary knots may cause ill-conditioned bases
bestspline <- colMeans(bestspline)
bestspline
##     rmse      mae 
## 19.39520 14.07474

Natural Spline Regression

set.seed(777)
crossval <- vfold_cv(AutoData,v=10,strata = "horsepower")
bestnatural <- map_dfr(crossval$splits,
function(x){
  mod <- lm(horsepower ~ ns(mpg,knots=knotvalue),data=AutoData[x$in_id,])
  pred <- predict(mod,newdata=AutoData[-x$in_id,])
  truth <- AutoData[-x$in_id,]$horsepower
  rmse <- mlr3measures::rmse(truth = truth,response = pred)
  mae <- mlr3measures::mae(truth = truth,response = pred)
  metric <- c(rmse,mae)
  names(metric) <- c("rmse","mae")
  return(metric)
  }
)
bestnatural <- colMeans(bestnatural)
bestnatural
##     rmse      mae 
## 19.41492 14.15844

LOESS

set.seed(777)
crossval <- vfold_cv(AutoData,v=10,strata = "horsepower")

span <- seq(0.1,1,length.out=50)

bestloess <- map_dfr(span, function(i){
metricloess <- map_dfr(crossval$splits,
    function(x){
mod <- loess(horsepower ~mpg,span = i,
         data=AutoData[x$in_id,])
pred <- predict(mod,
               newdata=AutoData[-x$in_id,])
truth <- AutoData[-x$in_id,]$horsepower

data_eval <- na.omit(data.frame(pred=pred,
                                truth=truth))


rmse <- mlr3measures::rmse(truth = data_eval$truth,
                           response = data_eval$pred
                           )
mae <- mlr3measures::mae(truth = data_eval$truth,
                           response = data_eval$pred
                           )
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)

head(metricloess,20)

# menghitung rata-rata untuk 10 folds
meanmetricloess <- colMeans(metricloess)

meanmetricloess
}
)

bestloess <- cbind(span=span,bestloess)
# menampilkan hasil all breaks
head(bestloess)
#berdasarkan rmse
bestloess %>% slice_min(rmse)

Perbandingan Model

nilai_metric <- rbind(meanmetriclinear,
                      meanmetricpoly,
                      meanmetricpoly3,
                      besttangga[4,2:3],
                      bestspline,
                      bestnatural,
                      bestloess[27,2:3])

rownames(nilai_metric) <- c("Linear","Polinomial ordo=2","Polinomial ordo=3","Tangga breaks=10","Spline Regression", "Natural Spline Regression", "LOESS")
knitr::kable(nilai_metric)
rmse mae
Linear 24.04699 18.43851
Polinomial ordo=2 20.35446 14.88426
Polinomial ordo=3 19.48501 14.29354
Tangga breaks=10 20.08860 14.34280
Spline Regression 19.39520 14.07474
Natural Spline Regression 19.41492 14.15844
LOESS 19.28214 13.77604

Simpulan

Berdasarkan hasil Cross-Validation 10 folds, model LOESS merupakan model terbaik untuk hubungan data horsepower~mpg karena memiliki nilai rmse dan mae yang paling kecil dibanding model lainnya.