REGRESI NON-LINEAR

Package

library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.0.5
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5     v purrr   0.3.4
## v tibble  3.1.0     v dplyr   1.0.5
## v tidyr   1.1.3     v stringr 1.4.0
## v readr   1.4.0     v forcats 0.5.1
## Warning: package 'ggplot2' was built under R version 4.0.5
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(splines)
library(rsample)
## Warning: package 'rsample' was built under R version 4.0.5
library(ISLR)
## Warning: package 'ISLR' was built under R version 4.0.5
library(cowplot)
## Warning: package 'cowplot' was built under R version 4.0.5
library(caret) # cross-validation
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
## 
##     lift
library(gridExtra) # combining graphs
## Warning: package 'gridExtra' was built under R version 4.0.5
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
library(ggplot2)
library(dplyr)
library(purrr)
library(dplyr)
library(knitr)
## Warning: package 'knitr' was built under R version 4.0.5
library(DT)
## Warning: package 'DT' was built under R version 4.0.5
library(kableExtra)
## Warning: package 'kableExtra' was built under R version 4.0.5
## 
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
## 
##     group_rows
library(gam) # generalized additive models
## Warning: package 'gam' was built under R version 4.0.5
## Loading required package: foreach
## Warning: package 'foreach' was built under R version 4.0.5
## 
## Attaching package: 'foreach'
## The following objects are masked from 'package:purrr':
## 
##     accumulate, when
## Loaded gam 1.20

Eksplorasi Data

Data Dunia

Data yang digunakan adalah Auto dari package ISLR yang terdiri dari 392 observasi dan 9 variabel. Namun, variabel yang digunakan kali ini hanya 3, yaitu:

mpg : miles per gallon

horsepower: Engine horsepower

origin: Origin of car (1. American, 2. European, 3. Japanese)

Auto = Auto %>%  select(mpg, horsepower, origin)
datatable(Auto)

Visualisasi

plot(Auto$horsepower, Auto$mpg,pch=19, col="#0000ff",
     xlab="Horse Power", ylab="mile per gallon")

Ini merupakan scatter plot dari 392 kendaraan dimana dari masing-masing kendaraan diukur dua peubah/variabel, yaitu kekuatan mesin (horse power pada sumbu X) dan berapa jauh kendaraan tersebut berjalan untuk setiap galon bahan bakarnya (mile per gallon pada sumbu y).

Secara umum,kendaraan dengan horse power yang besar membutuhkan bahan bakar yang banyak yang dicirikan jarak tempuh yang semakin pendek ketika konsumsi bahan bakarnya satu galon. Yang paling kanan, kekuatan mesinnya besar, konsumsi bahan bakar besar karena jarak tempuhnya kecil.

Jika kita amati lebih detail dari scatter plot nya, pola data ini cenderung tidak linier, ada beberapa pendekatan yang bisa digunakan.

Dalam kasus kali ini akan digunakan pendekatan regresi polinomial, Piecewise constant, dan Natural cubic splines.

Data Menurut Negara

Asal Negara Produsen Amerika

AutoAmerika = Auto[Auto$origin==1,] %>%  select(mpg, horsepower, origin)
datatable(AutoAmerika)
ggplot(AutoAmerika,aes(x=horsepower, y=mpg)) + 
  geom_point(alpha=0.55, color="blue") +
  theme_bw() +
   labs(y="Mile per Gallon", x="Horse Power", title = "Scatter Plot `mpg` dan `horsepower`",
        subtitle = "Amerika")

Asal Negara Produsen Eropa

AutoEropa = Auto[Auto$origin==2,] %>%  select(mpg, horsepower, origin)
datatable(AutoEropa)
ggplot(AutoEropa,aes(x=horsepower, y=mpg)) + 
  geom_point(alpha=0.55, color="blue") +
  theme_bw() +
   labs(y="Mile per Gallon", x="Horse Power", title = "Scatter Plot `mpg` dan `horsepower`",
        subtitle = "Eropa")

Asal Negara Produsen Jepang

AutoJepang = Auto[Auto$origin==3,] %>%  select(mpg, horsepower, origin)
datatable(AutoJepang)
ggplot(AutoJepang,aes(x=horsepower, y=mpg)) + 
  geom_point(alpha=0.55, color="blue") +
  theme_bw() +
   labs(y="Mile per Gallon", x="Horse Power", title = "Scatter Plot `mpg` dan `horsepower`",
        subtitle = "Jepang")

Regresi Data Dunia

Regresi Polinomial

cross validation untuk menghasilkan pemodelan mpg vs horsepower optimal dengan regresi polinomial.

set.seed(1501211011)

cv.poly1 <- function(dataset){
  set.seed(1501211011)
  cross_val <- vfold_cv(dataset,v=5,strata = "mpg")
  metric_poly1 <- map_dfr(cross_val$splits,
    function(x){  
      mod <- lm(mpg ~ poly(horsepower,1,raw = T),
         data=dataset[x$in_id,]) 
      pred <- predict(mod,
               newdata=dataset[-x$in_id,]) 
      truth <- dataset[-x$in_id,]$mpg
      rmse <- mlr3measures::rmse(truth = truth,response = pred)
      mae <- mlr3measures::mae(truth = truth,response = pred)
      metric <- c(rmse,mae) 
      names(metric) <- c("rmse","mae")
      return(metric)
      }
  )
  RegLin<- colMeans(metric_poly1) #menghitung rmse dan mae rata-rata untuk 5 folds
  return(RegLin) 
}

cv.poly2 <- function(dataset){
  set.seed(1501211011)
  cross_val <- vfold_cv(dataset,v=5,strata = "mpg")
  metric_poly2 <- map_dfr(cross_val$splits,
    function(x){  
      mod <- lm(mpg ~ poly(horsepower,2,raw = T),
         data=dataset[x$in_id,]) 
      pred <- predict(mod,
               newdata=dataset[-x$in_id,]) 
      truth <- dataset[-x$in_id,]$mpg
      rmse <- mlr3measures::rmse(truth = truth,response = pred)
      mae <- mlr3measures::mae(truth = truth,response = pred)
      metric <- c(rmse,mae) 
      names(metric) <- c("rmse","mae")
      return(metric)
      }
  )
  poly2<- colMeans(metric_poly2) #menghitung rmse dan mae rata-rata untuk 10 folds
  return(poly2) 
}


cv.poly3 <- function(dataset){
  set.seed(1501211011)
  cross_val <- vfold_cv(dataset,v=5,strata = "mpg")
  metric_poly3 <- map_dfr(cross_val$splits,
    function(x){  
      mod <- lm(mpg ~ poly(horsepower,3,raw = T),
         data=dataset[x$in_id,]) 
      pred <- predict(mod,
               newdata=dataset[-x$in_id,]) 
      truth <- dataset[-x$in_id,]$mpg
      rmse <- mlr3measures::rmse(truth = truth,response = pred)
      mae <- mlr3measures::mae(truth = truth,response = pred)
      metric <- c(rmse,mae) 
      names(metric) <- c("rmse","mae")
      return(metric)
      }
  )
  poly3<- colMeans(metric_poly3) #menghitung rmse dan mae rata-rata untuk 5 folds
  return(poly3)
    }

cv.poly4 <- function(dataset){
  set.seed(1501211011)
  cross_val <- vfold_cv(dataset,v=5,strata = "mpg")
  metric_poly4 <- map_dfr(cross_val$splits,
    function(x){  
      mod <- lm(mpg ~ poly(horsepower,4,raw = T),
         data=dataset[x$in_id,]) 
      pred <- predict(mod,
               newdata=dataset[-x$in_id,]) 
      truth <- dataset[-x$in_id,]$mpg
      rmse <- mlr3measures::rmse(truth = truth,response = pred)
      mae <- mlr3measures::mae(truth = truth,response = pred)
      metric <- c(rmse,mae) 
      names(metric) <- c("rmse","mae")
      return(metric)
      }
  )
  poly4<- colMeans(metric_poly4) #menghitung rmse dan mae rata-rata untuk 5 folds
  return(poly4) 
}
cvpoly <- rbind (cv.poly1(Auto),cv.poly2(Auto),cv.poly3(Auto),cv.poly4(Auto))
cvpoly
##          rmse      mae
## [1,] 4.895036 3.841085
## [2,] 4.363221 3.267947
## [3,] 4.402158 3.286869
## [4,] 4.428254 3.310634

Secara empiris jika dilihat dari rmse terlihat bahwa polinomial pangkat 2 memberikan nilai rmse yang terkecil sehingga dipilih polinomial pangkat 2.

poly1 <- ggplot(Auto,aes(x=horsepower, y=mpg)) + 
  geom_point(alpha=0.55, color="black") +
  stat_smooth(method = "lm", 
               formula = y~poly(x,1,raw=T), 
               lty = 1, col = "blue",se = F)+
  theme_bw()


poly2 <- ggplot(Auto,aes(x=horsepower, y=mpg)) + 
  geom_point(alpha=0.55, color="black") +
  stat_smooth(method = "lm", 
               formula = y~poly(x,2,raw=T), 
               lty = 1, col = "blue",se = F)+
  theme_bw()

poly3 <- ggplot(Auto,aes(x=horsepower, y=mpg)) + 
  geom_point(alpha=0.55, color="black") +
  stat_smooth(method = "lm", 
               formula = y~poly(x,3,raw=T), 
               lty = 1, col = "blue",se = F)+
  theme_bw()

poly4 <- ggplot(Auto,aes(x=horsepower, y=mpg)) + 
  geom_point(alpha=0.55, color="black") +
  stat_smooth(method = "lm", 
               formula = y~poly(x,4,raw=T), 
               lty = 1, col = "blue",se = F)+
  theme_bw()

plot_grid(poly1, poly2, poly3, poly4, labels=c("RegLin","poly2", "poly3", "poly4"), label_size = 6)

Fungsi Tangga (Piecewise Constant)

Secara Empiris

set.seed(123)

cv.pc <- function(dataset){
  set.seed(123)
  cross_val <- vfold_cv(dataset,v=5,strata = "mpg")

breaks <- 3:12

best_tangga <- map_dfr(breaks, function(i){

metric_tangga <- map_dfr(cross_val$splits,
    function(x){

training <- dataset[x$in_id,]
training$horsepower <- cut(training$horsepower,i)

mod <- lm(mpg ~ horsepower,
         data=training)

labs_x <- levels(mod$model[,2])

labs_x_breaks <- cbind(lower = as.numeric( sub("\\((.+),.*", "\\1", labs_x) ),
                  upper = as.numeric( sub("[^,]*,([^]]*)\\]", "\\1", labs_x) ))


testing <- dataset[-x$in_id,]

horsepower_new <- cut(testing$horsepower,c(labs_x_breaks[1,1],labs_x_breaks[,2]))

pred <- predict(mod,
               newdata=list(horsepower=horsepower_new))
truth <- testing$mpg

data_eval <- na.omit(data.frame(truth,pred))

rmse <- mlr3measures::rmse(truth = data_eval$truth,
                           response = data_eval$pred
                           )
mae <- mlr3measures::mae(truth = data_eval$truth,
                           response = data_eval$pred
                           )
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
colMeans(metric_tangga) # menghitung rata-rata untuk 5 folds
}
)
best_tangga <- cbind(breaks=breaks,best_tangga) # menampilkan hasil all breaks
basedonrmse <- as.data.frame(best_tangga %>% slice_min(rmse))
basedonmae <- as.data.frame(best_tangga %>% slice_min(mae))
return(rbind(basedonrmse,basedonmae))
}
cv.pc(Auto)

Secara empiris berdasarkan rmse banyaknya interval yang terbaik adalah 8 sedangkan berdasarkan mae banyaknya interval terbaik adalah 12.

Secara Visual

pc8 <- ggplot(Auto,aes(x=horsepower, y=mpg)) +
                 geom_point(alpha=0.55, color="black") +
  stat_smooth(method = "lm", 
               formula = y~cut(x,8), 
               lty = 1, col = "blue",se = F)+
  theme_bw()

pc12 <- ggplot(Auto,aes(x=horsepower, y=mpg)) +
                 geom_point(alpha=0.55, color="black") +
  stat_smooth(method = "lm", 
               formula = y~cut(x,12), 
               lty = 1, col = "blue",se = F)+
  theme_bw()


plot_grid(pc8, pc12,  labels = c( "pc8", "pc12"), label_size = 10)

Sebelumnya secara empiris berdasarkan rmse banyaknya interval yang terbaik adalah 8 sedangkan berdasarkan mae banyaknya interval terbaik adalah 12. Jika dilihat berdasarkan visual gambar terlihat bahwa untuk interval 12, kurva dari fungsi tangga ini akan semakin mulus (anak tangganya kecil-kecil karena anak tangganya banyak) dan membentuk pola yang mengikuti data tetapi ada bentuk2 yang menjadi tidak teratur. Jadi, pada metode piecewise constant ini dipilih interval 8.

Natural Cubic Spline

Menentukan Banyaknya Fungsi Basis

set.seed(123)
cross.val <- vfold_cv(Auto,v=5,strata = "mpg")
df <- 1:12
best.spline3 <- map_dfr(df, function(i){
metric.spline3 <- map_dfr(cross.val$splits,
                  function(x){
                  mod <- lm(mpg ~ ns(horsepower,df=i),data=Auto[x$in_id,])
                  pred <- predict(mod,newdata=Auto[-x$in_id,])
                  truth <- Auto[-x$in_id,]$mpg
                  rmse <- mlr3measures::rmse(truth = truth,response = pred)
                  mae <- mlr3measures::mae(truth = truth,response = pred)
                  metric <- c(rmse,mae)
                  names(metric) <- c("rmse","mae")
                  return(metric)
                }
      )
    metric.spline3
    mean.metric.spline3 <- colMeans(metric.spline3) #rata-rata untuk 5 folds
    mean.metric.spline3
  }
)


best.spline3 <- cbind(df=df,best.spline3)
basis<-data.frame(rbind(best.spline3 %>% slice_min(rmse), #berdasarkan rmse
                 best.spline3 %>% slice_min(mae))) #berdasarkan mae

basis

Banyaknya fungsi basis terpilih ada 10.

Menentukan Knot Berdasarkan Banyaknya Fungsi Basis Terpilih

df_10<-attr(ns(Auto$horsepower, df=10),"knots")
df_10
##   10%   20%   30%   40%   50%   60%   70%   80%   90% 
##  67.0  72.0  80.0  88.0  93.5 100.0 110.0 140.0 157.7

Cross Validation

set.seed(123)

cross.val <- vfold_cv(Auto,v=5,strata = "mpg")

metric.spline3.ns4 <- map_dfr(cross.val$splits,
    function(x){
mod <- lm(mpg ~ ns(horsepower, knots = c(80, 120, 160, 200)),
         data=Auto[x$in_id,])
pred <- predict(mod,
               newdata=Auto[-x$in_id,])
truth <- Auto[-x$in_id,]$mpg

rmse <- mlr3measures::rmse(truth = truth,
                           response = pred
                           )
mae <- mlr3measures::mae(truth = truth,
                           response = pred
                           )
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
mean.metric.spline3.ns4 <- colMeans(metric.spline3.ns4)# menghitung rata-rata untuk 5 folds



metric.spline3.ns9 <- map_dfr(cross.val$splits,
    function(x){
mod <- lm(mpg ~ ns(horsepower, knots = c(67,72,80,88,93.5,100,110,140,157.7)),
         data=Auto[x$in_id,])
pred <- predict(mod,
               newdata=Auto[-x$in_id,])
truth <- Auto[-x$in_id,]$mpg

rmse <- mlr3measures::rmse(truth = truth,
                           response = pred
                           )
mae <- mlr3measures::mae(truth = truth,
                           response = pred
                           )
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
mean.metric.spline3.ns9 <- colMeans(metric.spline3.ns9)# menghitung rata-rata untuk 5 folds



nilai.cv.ns <- rbind("NCS4" = mean.metric.spline3.ns4,
                     "NCS9" = mean.metric.spline3.ns9)

nama.model.ncs <- c("4 - NCS Knots 80, 120, 160, 200",
                    "9 - NCS (df=10)")

model.ncs<-data.frame(Model = nama.model.ncs, nilai.cv.ns)

model.ncs

Secara empiris berdasarkan rmse dan mae, banyaknya knot terbaik adalah 9 knot, yaitu knot 67,72,80,88,93.5,100,110,140,157.7.

Visualisasi Grafik

nc4 <- ggplot(Auto, aes(x = horsepower, y = mpg)) + 
  geom_point(alpha = 0.55, color="black") + 
  stat_smooth(method = "lm", formula = y~ns(x, knots = c(80, 120, 160, 200)), col = "blue", se = F) +
  theme_bw()


nc9 <- ggplot(Auto, aes(x = horsepower, y = mpg)) + 
  geom_point(alpha = 0.55, color="black") + 
  stat_smooth(method = "lm", formula = y~ns(x, knots = c(67,72,80,88,93.5,100,110,140,157.7)), col = "blue", se = F)+
  theme_bw()

plot_grid(nc4, nc9, labels = c("nc4", "nc9"), label_size = 10)

Secara empiris berdasarkan rmse dan mae, grafik yang memuat banyaknya 9 knot lebih dapat menggambarkan data dibanding dengan 4 knot. Oleh karena itu dipilih 9 knot.

Perbandingan Model

Secara Empiris

perbandingan.model <- rbind(cv.poly2(Auto), cv.pc(Auto)[1,-1], model.ncs[2,-1])
perbandingan.model$metode <- c("Polinomial Ordo 2","Piecewise Constant 8","NCS 9 Knot")
perbandingan.model

Berarti, model terbaik adalah NCS dengan 9 knot.

poly2 <- ggplot(Auto,aes(x=horsepower, y=mpg), add=TRUE) + 
  geom_point(alpha=0.55, color="black") +
  stat_smooth(method = "lm", 
               formula = y~poly(x,2,raw=T), 
               lty = 1, col = "blue",se = F)+
  theme_bw()


pc8 <- ggplot(Auto,aes(x=horsepower, y=mpg), add=TRUE) +
                 geom_point(alpha=0.55, color="black") +
  stat_smooth(method = "lm", 
               formula = y~cut(x,8), 
               lty = 1, col = "blue",se = F)+
  theme_bw()

nc9 <- ggplot(Auto, aes(x = horsepower, y = mpg), add=TRUE) + 
  geom_point(alpha = 0.55, color="black") + 
  stat_smooth(method = "lm", formula = y~ns(x, knots = c(67,72,80,88,93.5,100,110,140,157.7)), col = "blue", se = F)+
  theme_bw()

plot_grid(poly2, pc8, nc9, labels = c("poly2","pc8", "nc9"), label_size = 6)

Regresi Data Amerika

Regresi Polinomial

cross validation untuk menghasilkan pemodelan mpg vs horsepower optimal dengan regresi polinomial.

cvpoly <- rbind (cv.poly1(AutoAmerika),cv.poly2(AutoAmerika),cv.poly3(AutoAmerika),cv.poly4(AutoAmerika))
cvpoly
##          rmse      mae
## [1,] 4.258648 3.343302
## [2,] 3.776830 2.871345
## [3,] 3.779673 2.889474
## [4,] 3.811238 2.892780

Secara empiris jika dilihat dari rmse dan mae terlihat bahwa polinomial pangkat 2 memberikan nilai rmse dan mae yang terkecil sehingga dipilih polinomial pangkat 2.

poly1 <- ggplot(AutoAmerika,aes(x=horsepower, y=mpg)) + 
  geom_point(alpha=0.55, color="black") +
  stat_smooth(method = "lm", 
               formula = y~poly(x,1,raw=T), 
               lty = 1, col = "blue",se = F)+
  theme_bw()


poly2 <- ggplot(AutoAmerika,aes(x=horsepower, y=mpg)) + 
  geom_point(alpha=0.55, color="black") +
  stat_smooth(method = "lm", 
               formula = y~poly(x,2,raw=T), 
               lty = 1, col = "blue",se = F)+
  theme_bw()

poly3 <- ggplot(AutoAmerika,aes(x=horsepower, y=mpg)) + 
  geom_point(alpha=0.55, color="black") +
  stat_smooth(method = "lm", 
               formula = y~poly(x,3,raw=T), 
               lty = 1, col = "blue",se = F)+
  theme_bw()

poly4 <- ggplot(AutoAmerika,aes(x=horsepower, y=mpg)) + 
  geom_point(alpha=0.55, color="black") +
  stat_smooth(method = "lm", 
               formula = y~poly(x,4,raw=T), 
               lty = 1, col = "blue",se = F)+
  theme_bw()

plot_grid(poly1, poly2, poly3, poly4, labels=c("RegLin","poly2", "poly3", "poly4"), label_size = 6)

Fungsi Tangga (Piecewise Constant)

Secara Empiris

cv.pc(AutoAmerika)

Secara empiris berdasarkan rmse dan mae banyaknya interval yang terbaik adalah 11

Secara Visual

pc9 <- ggplot(AutoAmerika,aes(x=horsepower, y=mpg)) +
                 geom_point(alpha=0.55, color="black") +
  stat_smooth(method = "lm", 
               formula = y~cut(x,9), 
               lty = 1, col = "blue",se = F)+
  theme_bw()

pc11 <- ggplot(AutoAmerika,aes(x=horsepower, y=mpg)) +
                 geom_point(alpha=0.55, color="black") +
  stat_smooth(method = "lm", 
               formula = y~cut(x,11), 
               lty = 1, col = "blue",se = F)+
  theme_bw()


plot_grid(pc9, pc11,  labels = c( "pc9", "pc11"), label_size = 6)

Sebelumnya secara empiris berdasarkan rmse dan mae banyaknya interval yang terbaik adalah 11. Jika dilihat berdasarkan visual gambar terlihat bahwa untuk interval 11, kurva dari fungsi tangga ini akan semakin mulus (anak tangganya kecil-kecil karena anak tangganya banyak). Jadi, pada metode piecewise constant ini dipilih interval 11.

Natural Cubic Spline

Menentukan Banyaknya Fungsi Basis

set.seed(123)
cross.val <- vfold_cv(AutoAmerika,v=5,strata = "mpg")
df <- 1:13
best.spline3 <- map_dfr(df, function(i){
metric.spline3 <- map_dfr(cross.val$splits,
                  function(x){
                  mod <- lm(mpg ~ ns(horsepower,df=i),data=AutoAmerika[x$in_id,])
                  pred <- predict(mod,newdata=AutoAmerika[-x$in_id,])
                  truth <- AutoAmerika[-x$in_id,]$mpg
                  rmse <- mlr3measures::rmse(truth = truth,response = pred)
                  mae <- mlr3measures::mae(truth = truth,response = pred)
                  metric <- c(rmse,mae)
                  names(metric) <- c("rmse","mae")
                  return(metric)
                }
      )
    metric.spline3
    mean.metric.spline3 <- colMeans(metric.spline3) #rata-rata untuk 5 folds
    mean.metric.spline3
  }
)


best.spline3 <- cbind(df=df,best.spline3)
basis<-data.frame(rbind(best.spline3 %>% slice_min(rmse), #berdasarkan rmse
                 best.spline3 %>% slice_min(mae))) #berdasarkan mae

basis

diperoleh df atau banyaknya fungsi basis terpilih sebesar 8.

Menentukan Knot Berdasarkan Banyaknya Fungsi Basis Terpilih

df_8<-attr(ns(AutoAmerika$horsepower, df=8),"knots")
df_8
## 12.5%   25% 37.5%   50% 62.5%   75% 87.5% 
##    80    88    95   105   130   150   170

Cross Validation

set.seed(123)

cross.val <- vfold_cv(AutoAmerika,v=5,strata = "mpg")

metric.spline3.ns4 <- map_dfr(cross.val$splits,
    function(x){
mod <- lm(mpg ~ ns(horsepower, knots = c(80, 120, 160, 200)),
         data=AutoAmerika[x$in_id,])
pred <- predict(mod,
               newdata=AutoAmerika[-x$in_id,])
truth <- AutoAmerika[-x$in_id,]$mpg

rmse <- mlr3measures::rmse(truth = truth,
                           response = pred
                           )
mae <- mlr3measures::mae(truth = truth,
                           response = pred
                           )
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
mean.metric.spline3.ns4 <- colMeans(metric.spline3.ns4)# menghitung rata-rata untuk 5 folds



metric.spline3.ns7 <- map_dfr(cross.val$splits,
    function(x){
mod <- lm(mpg ~ ns(horsepower, knots = c(80,88,95,105,130,150,170)),
         data=AutoAmerika[x$in_id,])
pred <- predict(mod,
               newdata=AutoAmerika[-x$in_id,])
truth <- AutoAmerika[-x$in_id,]$mpg

rmse <- mlr3measures::rmse(truth = truth,
                           response = pred
                           )
mae <- mlr3measures::mae(truth = truth,
                           response = pred
                           )
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
mean.metric.spline3.ns7 <- colMeans(metric.spline3.ns7)# menghitung rata-rata untuk 5folds



nilai.cv.ns <- rbind("NCS4" = mean.metric.spline3.ns4,
                     "NCS7" = mean.metric.spline3.ns7)

nama.model.ncs <- c("4 - NCS Knots 80, 120, 160, 200",
                    "7 - NCS (df=8)")

model.ncs<-data.frame(Model = nama.model.ncs, nilai.cv.ns)

model.ncs

Secara empiris berdasarkan rmse dan mae, banyaknya knot terbaik adalah 7 knot.

Visualisasi Grafik

nc4 <- ggplot(AutoAmerika, aes(x = horsepower, y = mpg)) + 
  geom_point(alpha = 0.55, color="black") + 
  stat_smooth(method = "lm", formula = y~ns(x, knots = c(80, 120, 160, 200)), col = "blue", se = F) +
  theme_bw()


nc7 <- ggplot(AutoAmerika, aes(x = horsepower, y = mpg)) + 
  geom_point(alpha = 0.55, color="black") + 
  stat_smooth(method = "lm", formula = y~ns(x, knots = c(80,88,95,105,130,150,170)), col = "blue", se = F)+
  theme_bw()

plot_grid(nc4, nc7, labels = c("nc4", "nc7"), label_size = 8)

Secara empiris berdasarkan rmse dan mae, grafik yang memuat banyaknya 7 knot lebih menggambarkan data dibanding dengan 4 knot. Oleh karena itu dipilih 7 knot.

Perbandingan Model

perbandingan.model <- rbind(cv.poly2(AutoAmerika), cv.pc(AutoAmerika)[1,-1], model.ncs[2,-1])
perbandingan.model$metode <- c("Polinomial Ordo 2","Piecewise Constant 11","NCS 7 Knot")
perbandingan.model

Berarti, model terbaik untuk data Auto Amerika adalah Piecewise Constant dengan 11 interval.

poly2 <- ggplot(AutoAmerika,aes(x=horsepower, y=mpg)) + 
  geom_point(alpha=0.55, color="black") +
  stat_smooth(method = "lm", 
               formula = y~poly(x,2,raw=T), 
               lty = 1, col = "blue",se = F)+
  theme_bw()


pc11 <- ggplot(AutoAmerika,aes(x=horsepower, y=mpg)) +
                 geom_point(alpha=0.55, color="black") +
  stat_smooth(method = "lm", 
               formula = y~cut(x,11), 
               lty = 1, col = "blue",se = F)+
  theme_bw()

nc7 <- ggplot(AutoAmerika, aes(x = horsepower, y = mpg)) + 
  geom_point(alpha = 0.55, color="black") + 
  stat_smooth(method = "lm", formula = y~ns(x, knots = c(80,88,95,105,130,150,170)), col = "blue", se = F)+
  theme_bw()

plot_grid(poly2, pc11, nc7, labels = c("poly2","pc11", "nc7"), label_size = 6)

Regresi Data Eropa

Regresi Polinomial

cross validation untuk menghasilkan pemodelan mpg vs horsepower optimal dengan regresi polinomial.

set.seed(123)


cv.poly2 <- function(dataset){
  set.seed(123)
  cross_val <- vfold_cv(dataset,v=5,strata = "mpg")
  metric_poly2 <- map_dfr(cross_val$splits,
    function(x){  
      mod <- lm(mpg ~ poly(horsepower,2,raw = T),
         data=dataset[x$in_id,]) 
      pred <- predict(mod,
               newdata=dataset[-x$in_id,]) 
      truth <- dataset[-x$in_id,]$mpg
      rmse <- mlr3measures::rmse(truth = truth,response = pred)
      mae <- mlr3measures::mae(truth = truth,response = pred)
      metric <- c(rmse,mae) 
      names(metric) <- c("rmse","mae")
      return(metric)
      }
  )
  poly2<- colMeans(metric_poly2) #menghitung rmse dan mae rata-rata untuk 10 folds
  return(poly2) 
}


cv.poly3 <- function(dataset){
  set.seed(123)
  cross_val <- vfold_cv(dataset,v=5,strata = "mpg")
  metric_poly3 <- map_dfr(cross_val$splits,
    function(x){  
      mod <- lm(mpg ~ poly(horsepower,3,raw = T),
         data=dataset[x$in_id,]) 
      pred <- predict(mod,
               newdata=dataset[-x$in_id,]) 
      truth <- dataset[-x$in_id,]$mpg
      rmse <- mlr3measures::rmse(truth = truth,response = pred)
      mae <- mlr3measures::mae(truth = truth,response = pred)
      metric <- c(rmse,mae) 
      names(metric) <- c("rmse","mae")
      return(metric)
      }
  )
  poly3<- colMeans(metric_poly3) #menghitung rmse dan mae rata-rata untuk 5 folds
  return(poly3)
    }

cv.poly4 <- function(dataset){
  set.seed(123)
  cross_val <- vfold_cv(dataset,v=5,strata = "mpg")
  metric_poly4 <- map_dfr(cross_val$splits,
    function(x){  
      mod <- lm(mpg ~ poly(horsepower,4,raw = T),
         data=dataset[x$in_id,]) 
      pred <- predict(mod,
               newdata=dataset[-x$in_id,]) 
      truth <- dataset[-x$in_id,]$mpg
      rmse <- mlr3measures::rmse(truth = truth,response = pred)
      mae <- mlr3measures::mae(truth = truth,response = pred)
      metric <- c(rmse,mae) 
      names(metric) <- c("rmse","mae")
      return(metric)
      }
  )
  poly4<- colMeans(metric_poly4) #menghitung rmse dan mae rata-rata untuk 5 folds
  return(poly4) 
}
cvpoly <- rbind (cv.poly2(AutoEropa),cv.poly3(AutoEropa),cv.poly4(AutoEropa))
## Warning: The number of observations in each quantile is below the recommended
## threshold of 20. Stratification will be done with 3 breaks instead.

## Warning: The number of observations in each quantile is below the recommended
## threshold of 20. Stratification will be done with 3 breaks instead.

## Warning: The number of observations in each quantile is below the recommended
## threshold of 20. Stratification will be done with 3 breaks instead.
cvpoly
##          rmse      mae
## [1,] 5.024633 3.948212
## [2,] 5.073471 4.008398
## [3,] 5.117442 4.015371

Secara empiris jika dilihat dari rmse dan mae terlihat bahwa polinomial pangkat 2 memberikan nilai rmse dan mae yang terkecil sehingga dipilih polinomial pangkat 2.

poly2 <- ggplot(AutoEropa,aes(x=horsepower, y=mpg)) + 
  geom_point(alpha=0.55, color="black") +
  stat_smooth(method = "lm", 
               formula = y~poly(x,2,raw=T), 
               lty = 1, col = "blue",se = F)+
  theme_bw()

poly3 <- ggplot(AutoEropa,aes(x=horsepower, y=mpg)) + 
  geom_point(alpha=0.55, color="black") +
  stat_smooth(method = "lm", 
               formula = y~poly(x,3,raw=T), 
               lty = 1, col = "blue",se = F)+
  theme_bw()

poly4 <- ggplot(AutoEropa,aes(x=horsepower, y=mpg)) + 
  geom_point(alpha=0.55, color="black") +
  stat_smooth(method = "lm", 
               formula = y~poly(x,4,raw=T), 
               lty = 1, col = "blue",se = F)+
  theme_bw()

plot_grid(poly2, poly3, poly4, labels=c("poly2", "poly3", "poly4"), label_size = 6)

Fungsi Tangga (Piecewise Constant)

Secara Empiris

set.seed(123)

cv.pc2 <- function(dataset){
  set.seed(123)
  cross_val <- vfold_cv(dataset,v=5,strata = "mpg")

breaks <- 2:6

best_tangga <- map_dfr(breaks, function(i){

metric_tangga <- map_dfr(cross_val$splits,
    function(x){

training <- dataset[x$in_id,]
training$horsepower <- cut(training$horsepower,i)

mod <- lm(mpg ~ horsepower,
         data=training)

labs_x <- levels(mod$model[,2])

labs_x_breaks <- cbind(lower = as.numeric( sub("\\((.+),.*", "\\1", labs_x) ),
                  upper = as.numeric( sub("[^,]*,([^]]*)\\]", "\\1", labs_x) ))


testing <- dataset[-x$in_id,]

horsepower_new <- cut(testing$horsepower,c(labs_x_breaks[1,1],labs_x_breaks[,2]))

pred <- predict(mod,
               newdata=list(horsepower=horsepower_new))
truth <- testing$mpg

data_eval <- na.omit(data.frame(truth,pred))

rmse <- mlr3measures::rmse(truth = data_eval$truth,
                           response = data_eval$pred
                           )
mae <- mlr3measures::mae(truth = data_eval$truth,
                           response = data_eval$pred
                           )
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
colMeans(metric_tangga) # menghitung rata-rata untuk 5 folds
}
)
best_tangga <- cbind(breaks=breaks,best_tangga) # menampilkan hasil all breaks
basedonrmse <- as.data.frame(best_tangga %>% slice_min(rmse))
basedonmae <- as.data.frame(best_tangga %>% slice_min(mae))
return(rbind(basedonrmse,basedonmae))
}
cv.pc2(AutoEropa)
## Warning: The number of observations in each quantile is below the recommended
## threshold of 20. Stratification will be done with 3 breaks instead.

Secara empiris berdasarkan rmse dan mae banyaknya interval yang terbaik adalah 5

Secara Visual

pc5 <- ggplot(AutoEropa,aes(x=horsepower, y=mpg)) +
                 geom_point(alpha=0.55, color="black") +
  stat_smooth(method = "lm", 
               formula = y~cut(x,5), 
               lty = 1, col = "blue",se = F)+
  theme_bw()

pc10 <- ggplot(AutoEropa,aes(x=horsepower, y=mpg)) +
                 geom_point(alpha=0.55, color="black") +
  stat_smooth(method = "lm", 
               formula = y~cut(x,10), 
               lty = 1, col = "blue",se = F)+
  theme_bw()


plot_grid(pc5, pc10,  labels = c( "pc5", "pc10"), label_size = 6)

Sebelumnya secara empiris berdasarkan rmse dan mae banyaknya interval yang terbaik adalah 5. Begitu juga secara visual.

Natural Cubic Spline

Menentukan Banyaknya Fungsi Basis

set.seed(123)
cross.val <- vfold_cv(AutoEropa,v=5,strata = "mpg")
## Warning: The number of observations in each quantile is below the recommended
## threshold of 20. Stratification will be done with 3 breaks instead.
df <- 2:3
best.spline3 <- map_dfr(df, function(i){
metric.spline3 <- map_dfr(cross.val$splits,
                  function(x){
                  mod <- lm(mpg ~ ns(horsepower,df=i),data=AutoEropa[x$in_id,])
                  pred <- predict(mod,newdata=AutoEropa[-x$in_id,])
                  truth <- AutoEropa[-x$in_id,]$mpg
                  rmse <- mlr3measures::rmse(truth = truth,response = pred)
                  mae <- mlr3measures::mae(truth = truth,response = pred)
                  metric <- c(rmse,mae)
                  names(metric) <- c("rmse","mae")
                  return(metric)
                }
      )
    metric.spline3
    mean.metric.spline3 <- colMeans(metric.spline3) #rata-rata untuk 5 folds
    mean.metric.spline3
  }
)


best.spline3 <- cbind(df=df,best.spline3)
basis<-data.frame(rbind(best.spline3 %>% slice_min(rmse), #berdasarkan rmse
                 best.spline3 %>% slice_min(mae))) #berdasarkan mae

basis

Menentukan Knot Berdasarkan Banyaknya Fungsi Basis Terpilih

df_2<-attr(ns(AutoEropa$horsepower, df=2),"knots")
df_2
##  50% 
## 76.5

Cross Validation

set.seed(123)
cross.val <- vfold_cv(AutoEropa,v=5,strata = "mpg")
## Warning: The number of observations in each quantile is below the recommended
## threshold of 20. Stratification will be done with 3 breaks instead.
metric.spline3.ns1 <- map_dfr(cross.val$splits,
    function(x){
mod <- lm(mpg ~ ns(horsepower, knots = c(76.5)),
         data=AutoEropa[x$in_id,])
pred <- predict(mod,
               newdata=AutoEropa[-x$in_id,])
truth <- AutoEropa[-x$in_id,]$mpg

rmse <- mlr3measures::rmse(truth = truth,
                           response = pred
                           )
mae <- mlr3measures::mae(truth = truth,
                           response = pred
                           )
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
mean.metric.spline3.ns1 <- colMeans(metric.spline3.ns1)# menghitung rata-rata untuk 5 folds



metric.spline3.ns2 <- map_dfr(cross.val$splits,
    function(x){
mod <- lm(mpg ~ ns(horsepower, knots = c(88, 105)),
         data=AutoEropa[x$in_id,])
pred <- predict(mod,
               newdata=AutoEropa[-x$in_id,])
truth <- AutoEropa[-x$in_id,]$mpg

rmse <- mlr3measures::rmse(truth = truth,
                           response = pred
                           )
mae <- mlr3measures::mae(truth = truth,
                           response = pred
                           )
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
mean.metric.spline3.ns2 <- colMeans(metric.spline3.ns2)# menghitung rata-rata untuk 5 folds



nilai.cv.ns <- rbind("NCS1" = mean.metric.spline3.ns1,
                     "NCS2" = mean.metric.spline3.ns2)

nama.model.ncs <- c("1 - NCS Knots 76.5",
                    "2 - NCS Knots 80, 150")

model.ncs<-data.frame(Model = nama.model.ncs, nilai.cv.ns)

model.ncs

Secara empiris berdasarkan rmse dan mae, banyaknya knot terbaik adalah 1 knot.

Visualisasi Grafik

nc1 <- ggplot(AutoEropa, aes(x = horsepower, y = mpg)) + 
  geom_point(alpha = 0.55, color="black") + 
  stat_smooth(method = "lm", formula = y~ns(x, knots = c(76.5)), col = "blue", se = F) +
  theme_bw()


nc2 <- ggplot(AutoEropa, aes(x = horsepower, y = mpg)) + 
  geom_point(alpha = 0.55, color="black") + 
  stat_smooth(method = "lm", formula = y~ns(x, knots = c(80,105)), col = "blue", se = F)+
  theme_bw()

plot_grid(nc1, nc2, labels = c("nc1", "nc2"), label_size = 8)

Secara empiris berdasarkan rmse dan mae, grafik yang memuat banyaknya 1 knot lebih menggambarkan data dibanding dengan 2 knot. Oleh karena itu dipilih 1 knot.

Perbandingan Model

perbandingan.model <- rbind(cv.poly2(AutoEropa), cv.pc2(AutoEropa)[1,-1], model.ncs[1,-1])
## Warning: The number of observations in each quantile is below the recommended
## threshold of 20. Stratification will be done with 3 breaks instead.

## Warning: The number of observations in each quantile is below the recommended
## threshold of 20. Stratification will be done with 3 breaks instead.
perbandingan.model$metode <- c("Polinomial Ordo 2","Piecewise Constant 5","NCS 1 Knot 105")
perbandingan.model

Berarti, model terbaik untuk data Auto Eropa adalah NCS 1 Knot 105.

poly2 <- ggplot(AutoEropa,aes(x=horsepower, y=mpg)) + 
  geom_point(alpha=0.55, color="black") +
  stat_smooth(method = "lm", 
               formula = y~poly(x,2,raw=T), 
               lty = 1, col = "blue",se = F)+
  theme_bw()

pc5 <- ggplot(AutoEropa,aes(x=horsepower, y=mpg)) +
                 geom_point(alpha=0.55, color="black") +
  stat_smooth(method = "lm", 
               formula = y~cut(x,5), 
               lty = 1, col = "blue",se = F)+
  theme_bw()


nc1 <- ggplot(AutoEropa, aes(x = horsepower, y = mpg)) + 
  geom_point(alpha = 0.55, color="black") + 
  stat_smooth(method = "lm", formula = y~ns(x, knots = c(76.5)), col = "blue", se = F) +
  theme_bw()


plot_grid(poly2, pc5,nc1, labels = c("poly2","pc5","nc1"), label_size = 6)

Regresi Data Jepang

Regresi Polinomial

cross validation untuk menghasilkan pemodelan mpg vs horsepower optimal dengan regresi polinomial.

set.seed(123)

cv.poly2 <- function(dataset){
  set.seed(123)
  cross_val <- vfold_cv(dataset,v=5,strata = "mpg")
  metric_poly2 <- map_dfr(cross_val$splits,
    function(x){  
      mod <- lm(mpg ~ poly(horsepower,2,raw = T),
         data=dataset[x$in_id,]) 
      pred <- predict(mod,
               newdata=dataset[-x$in_id,]) 
      truth <- dataset[-x$in_id,]$mpg
      rmse <- mlr3measures::rmse(truth = truth,response = pred)
      mae <- mlr3measures::mae(truth = truth,response = pred)
      metric <- c(rmse,mae) 
      names(metric) <- c("rmse","mae")
      return(metric)
      }
  )
  poly2<- colMeans(metric_poly2) #menghitung rmse dan mae rata-rata untuk 10 folds
  return(poly2) 
}


cv.poly3 <- function(dataset){
  set.seed(123)
  cross_val <- vfold_cv(dataset,v=5,strata = "mpg")
  metric_poly3 <- map_dfr(cross_val$splits,
    function(x){  
      mod <- lm(mpg ~ poly(horsepower,3,raw = T),
         data=dataset[x$in_id,]) 
      pred <- predict(mod,
               newdata=dataset[-x$in_id,]) 
      truth <- dataset[-x$in_id,]$mpg
      rmse <- mlr3measures::rmse(truth = truth,response = pred)
      mae <- mlr3measures::mae(truth = truth,response = pred)
      metric <- c(rmse,mae) 
      names(metric) <- c("rmse","mae")
      return(metric)
      }
  )
  poly3<- colMeans(metric_poly3) #menghitung rmse dan mae rata-rata untuk 5 folds
  return(poly3)
    }

cv.poly4 <- function(dataset){
  set.seed(123)
  cross_val <- vfold_cv(dataset,v=5,strata = "mpg")
  metric_poly4 <- map_dfr(cross_val$splits,
    function(x){  
      mod <- lm(mpg ~ poly(horsepower,4,raw = T),
         data=dataset[x$in_id,]) 
      pred <- predict(mod,
               newdata=dataset[-x$in_id,]) 
      truth <- dataset[-x$in_id,]$mpg
      rmse <- mlr3measures::rmse(truth = truth,response = pred)
      mae <- mlr3measures::mae(truth = truth,response = pred)
      metric <- c(rmse,mae) 
      names(metric) <- c("rmse","mae")
      return(metric)
      }
  )
  poly4<- colMeans(metric_poly4) #menghitung rmse dan mae rata-rata untuk 5 folds
  return(poly4) 
}
cvpoly <- rbind (cv.poly2(AutoJepang),cv.poly3(AutoJepang),cv.poly4(AutoJepang))
## Warning: The number of observations in each quantile is below the recommended
## threshold of 20. Stratification will be done with 3 breaks instead.

## Warning: The number of observations in each quantile is below the recommended
## threshold of 20. Stratification will be done with 3 breaks instead.

## Warning: The number of observations in each quantile is below the recommended
## threshold of 20. Stratification will be done with 3 breaks instead.
cvpoly
##          rmse      mae
## [1,] 4.669195 3.580362
## [2,] 4.081574 3.074327
## [3,] 4.320975 3.240185

Secara empiris jika dilihat dari rmse dan mae terlihat bahwa polinomial pangkat 3 memberikan nilai rmse dan mae yang terkecil sehingga dipilih polinomial pangkat 3.

poly2 <- ggplot(AutoJepang,aes(x=horsepower, y=mpg)) + 
  geom_point(alpha=0.55, color="black") +
  stat_smooth(method = "lm", 
               formula = y~poly(x,2,raw=T), 
               lty = 1, col = "blue",se = F)+
  theme_bw()

poly3 <- ggplot(AutoJepang,aes(x=horsepower, y=mpg)) + 
  geom_point(alpha=0.55, color="black") +
  stat_smooth(method = "lm", 
               formula = y~poly(x,3,raw=T), 
               lty = 1, col = "blue",se = F)+
  theme_bw()

poly4 <- ggplot(AutoJepang,aes(x=horsepower, y=mpg)) + 
  geom_point(alpha=0.55, color="black") +
  stat_smooth(method = "lm", 
               formula = y~poly(x,4,raw=T), 
               lty = 1, col = "blue",se = F)+
  theme_bw()

plot_grid(poly2, poly3, poly4, labels=c("poly2", "poly3", "poly4"), label_size = 6)

Fungsi Tangga (Piecewise Constant)

Secara Empiris

set.seed(123)

cv.pc2 <- function(dataset){
  set.seed(123)
  cross_val <- vfold_cv(dataset,v=5,strata = "mpg")

breaks <- 2:5

best_tangga <- map_dfr(breaks, function(i){

metric_tangga <- map_dfr(cross_val$splits,
    function(x){

training <- dataset[x$in_id,]
training$horsepower <- cut(training$horsepower,i)

mod <- lm(mpg ~ horsepower,
         data=training)

labs_x <- levels(mod$model[,2])

labs_x_breaks <- cbind(lower = as.numeric( sub("\\((.+),.*", "\\1", labs_x) ),
                  upper = as.numeric( sub("[^,]*,([^]]*)\\]", "\\1", labs_x) ))


testing <- dataset[-x$in_id,]

horsepower_new <- cut(testing$horsepower,c(labs_x_breaks[1,1],labs_x_breaks[,2]))

pred <- predict(mod,
               newdata=list(horsepower=horsepower_new))
truth <- testing$mpg

data_eval <- na.omit(data.frame(truth,pred))

rmse <- mlr3measures::rmse(truth = data_eval$truth,
                           response = data_eval$pred
                           )
mae <- mlr3measures::mae(truth = data_eval$truth,
                           response = data_eval$pred
                           )
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
colMeans(metric_tangga) # menghitung rata-rata untuk 5 folds
}
)
best_tangga <- cbind(breaks=breaks,best_tangga) # menampilkan hasil all breaks
basedonrmse <- as.data.frame(best_tangga %>% slice_min(rmse))
basedonmae <- as.data.frame(best_tangga %>% slice_min(mae))
return(rbind(basedonrmse,basedonmae))
}
cv.pc2(AutoJepang)
## Warning: The number of observations in each quantile is below the recommended
## threshold of 20. Stratification will be done with 3 breaks instead.

Secara empiris berdasarkan rmse banyaknya interval yang terbaik adalah 4. Sedangkan berdasarkan mae banyaknya interval yang terbaik adalah 2.

Secara Visual

pc2 <- ggplot(AutoJepang,aes(x=horsepower, y=mpg)) +
                 geom_point(alpha=0.55, color="black") +
  stat_smooth(method = "lm", 
               formula = y~cut(x,2), 
               lty = 1, col = "blue",se = F)+
  theme_bw()

pc4 <- ggplot(AutoJepang,aes(x=horsepower, y=mpg)) +
                 geom_point(alpha=0.55, color="black") +
  stat_smooth(method = "lm", 
               formula = y~cut(x,4), 
               lty = 1, col = "blue",se = F)+
  theme_bw()


plot_grid(pc2, pc4,  labels = c( "pc2", "pc4"), label_size = 6)

Sebelumnya Secara empiris berdasarkan rmse banyaknya interval yang terbaik adalah 4. Sedangkan berdasarkan mae banyaknya interval yang terbaik adalah 2.Secara visual, interval 4 lebih baik sehingga dipilih interval 4.

Natural Cubic Spline

Menentukan Banyaknya Fungsi Basis

set.seed(123)
cross.val <- vfold_cv(AutoJepang,v=5,strata = "mpg")
## Warning: The number of observations in each quantile is below the recommended
## threshold of 20. Stratification will be done with 3 breaks instead.
df <- 2:3
best.spline3 <- map_dfr(df, function(i){
metric.spline3 <- map_dfr(cross.val$splits,
                  function(x){
                  mod <- lm(mpg ~ ns(horsepower,df=i),data=AutoJepang[x$in_id,])
                  pred <- predict(mod,newdata=AutoJepang[-x$in_id,])
                  truth <- AutoJepang[-x$in_id,]$mpg
                  rmse <- mlr3measures::rmse(truth = truth,response = pred)
                  mae <- mlr3measures::mae(truth = truth,response = pred)
                  metric <- c(rmse,mae)
                  names(metric) <- c("rmse","mae")
                  return(metric)
                }
      )
    metric.spline3
    mean.metric.spline3 <- colMeans(metric.spline3) #rata-rata untuk 5 folds
    mean.metric.spline3
  }
)


best.spline3 <- cbind(df=df,best.spline3)
basis<-data.frame(rbind(best.spline3 %>% slice_min(rmse), #berdasarkan rmse
                 best.spline3 %>% slice_min(mae))) #berdasarkan mae

basis

Menentukan Knot Berdasarkan Banyaknya Fungsi Basis Terpilih

df_3<-attr(ns(AutoJepang$horsepower, df=3),"knots")
df_3
## 33.33333% 66.66667% 
##        68        90

Cross Validation

set.seed(123)

cross.val <- vfold_cv(AutoJepang,v=5,strata = "mpg")
## Warning: The number of observations in each quantile is below the recommended
## threshold of 20. Stratification will be done with 3 breaks instead.
metric.spline3.ns2 <- map_dfr(cross.val$splits,
    function(x){
mod <- lm(mpg ~ ns(horsepower, knots = c(68,90)),
         data=AutoJepang[x$in_id,])
pred <- predict(mod,
               newdata=AutoJepang[-x$in_id,])
truth <- AutoJepang[-x$in_id,]$mpg

rmse <- mlr3measures::rmse(truth = truth,
                           response = pred
                           )
mae <- mlr3measures::mae(truth = truth,
                           response = pred
                           )
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
mean.metric.spline3.ns2 <- colMeans(metric.spline3.ns2)# menghitung rata-rata untuk 5 folds



metric.spline3.ns3 <- map_dfr(cross.val$splits,
    function(x){
mod <- lm(mpg ~ ns(horsepower, knots = c(60, 88, 105)),
         data=AutoJepang[x$in_id,])
pred <- predict(mod,
               newdata=AutoJepang[-x$in_id,])
truth <- AutoJepang[-x$in_id,]$mpg

rmse <- mlr3measures::rmse(truth = truth,
                           response = pred
                           )
mae <- mlr3measures::mae(truth = truth,
                           response = pred
                           )
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
mean.metric.spline3.ns3 <- colMeans(metric.spline3.ns3)# menghitung rata-rata untuk 5 folds



nilai.cv.ns <- rbind("NCS2" = mean.metric.spline3.ns2,
                     "NCS3" = mean.metric.spline3.ns3)

nama.model.ncs <- c("2 - NCS Knots 68,90",
                    "3 - NCS Knots 60, 88, 105")

model.ncs<-data.frame(Model = nama.model.ncs, nilai.cv.ns)

model.ncs

Secara empiris berdasarkan rmse dan mae, banyaknya knot terbaik adalah 2 knot.

Visualisasi Grafik

nc2 <- ggplot(AutoJepang, aes(x = horsepower, y = mpg)) + 
  geom_point(alpha = 0.55, color="black") + 
  stat_smooth(method = "lm", formula = y~ns(x, knots = c(68,90)), col = "blue", se = F) +
  theme_bw()


nc3 <- ggplot(AutoJepang, aes(x = horsepower, y = mpg)) + 
  geom_point(alpha = 0.55, color="black") + 
  stat_smooth(method = "lm", formula = y~ns(x, knots = c(60, 88, 105)), col = "blue", se = F)+
  theme_bw()

plot_grid(nc2, nc3, labels = c("nc2", "nc3"), label_size = 8)

Secara empiris berdasarkan rmse dan mae, grafik yang memuat banyaknya 2 knot lebih menggambarkan data dibanding dengan 3 knot. Oleh karena itu dipilih 2 knot.

Perbandingan Model

perbandingan.model <- rbind(cv.poly3(AutoJepang), cv.pc2(AutoJepang)[1,-1], model.ncs[1,-1])
## Warning: The number of observations in each quantile is below the recommended
## threshold of 20. Stratification will be done with 3 breaks instead.

## Warning: The number of observations in each quantile is below the recommended
## threshold of 20. Stratification will be done with 3 breaks instead.
perbandingan.model$metode <- c("Polinomial Ordo 3","Piecewise Constant 4","NCS 2 Knot 68,90")
perbandingan.model

Berarti, model terbaik untuk data Auto Jepang adalah Polinomial Ordo 3.

poly3 <- ggplot(AutoJepang,aes(x=horsepower, y=mpg)) + 
  geom_point(alpha=0.55, color="black") +
  stat_smooth(method = "lm", 
               formula = y~poly(x,3,raw=T), 
               lty = 1, col = "blue",se = F)+
  theme_bw()

pc4 <- ggplot(AutoJepang,aes(x=horsepower, y=mpg)) +
                 geom_point(alpha=0.55, color="black") +
  stat_smooth(method = "lm", 
               formula = y~cut(x,4), 
               lty = 1, col = "blue",se = F)+
  theme_bw()

nc2 <- ggplot(AutoJepang, aes(x = horsepower, y = mpg)) + 
  geom_point(alpha = 0.55, color="black") + 
  stat_smooth(method = "lm", formula = y~ns(x, knots = c(68,90)), col = "blue", se = F) +
  theme_bw()

plot_grid(poly3,pc4,nc2, labels=c("poly3","pc4","nc2"), label_size = 6)

Visualisasi Model Terbaik

nc9 <- ggplot(Auto, aes(x = horsepower, y = mpg)) + 
  geom_point(alpha = 0.55, color="black") + 
  stat_smooth(method = "lm", formula = y~ns(x, knots = c(67,72,80,88,93.5,100,110,140,157.7)), col = "blue", se = F)+
  theme_bw()+
  labs(title = "NCS 9 Knots (df=10)", 
       subtitle = "Auto Data Set - World")

pc11 <- ggplot(AutoAmerika,aes(x=horsepower, y=mpg)) +
                 geom_point(alpha=0.55, color="black") +
  stat_smooth(method = "lm", 
               formula = y~cut(x,11), 
               lty = 1, col = "blue",se = F)+
  theme_bw()+
  labs(title = "Piecewise Constant 11", 
       subtitle = "Auto Data Set - Amerika")

nc1 <- ggplot(AutoEropa, aes(x = horsepower, y = mpg)) + 
  geom_point(alpha = 0.55, color="black") + 
  stat_smooth(method = "lm", formula = y~ns(x, knots = c(76.5)), col = "blue", se = F) +
  theme_bw()+
  labs(title = "Natural Cubic Splines Knots 76.5 (df=2)", 
       subtitle = "Auto Data Set - Eropa")

poly3 <- ggplot(AutoJepang,aes(x=horsepower, y=mpg)) + 
  geom_point(alpha=0.55, color="black") +
  stat_smooth(method = "lm", 
               formula = y~poly(x,3,raw=T), 
               lty = 1, col = "blue",se = F)+
  theme_bw()+
  labs(title = "Polinomial Ordo 3", 
       subtitle = "Auto Data Set - Jepang")


plot_grid(nc9, pc11, nc1, poly3)

Referensi

Sartono, B.. (November 1, 2020). Moving Beyond Linearity. Retrieved from https://rpubs.com/bagusco/taklinear

Dito, G.A. (April 14, 2021). Regresi Polinomial, Regresi Fungsi Tangga dan Regresi Spline di R. Retrieved from https://gerrydito.github.io/Regresi-Polinomial,-Regresi-Fungsi-Tangga-dan-Regresi-Spline/