library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5     v purrr   0.3.4
## v tibble  3.1.0     v dplyr   1.0.7
## v tidyr   1.1.3     v stringr 1.4.0
## v readr   2.0.1     v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
## 
##     lift
library(gridExtra)
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
library(kableExtra)
## 
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
## 
##     group_rows
library(gam)
## Loading required package: splines
## Loading required package: foreach
## 
## Attaching package: 'foreach'
## The following objects are masked from 'package:purrr':
## 
##     accumulate, when
## Loaded gam 1.20
library(splines)
library(rsample)
library(ISLR)
library(cowplot)
library(ggplot2)
library(dplyr)
library(purrr)
library(knitr)
library(DT)

Eksplorasi Data

data yang digunakan pada project ini adalah data Auto dari package ISLR yang terdiri dari 392 observasi dan 9 variabel. variabel respon yang akan digunakan adalah variabel mpg dan variabel prediktor yang akan digunakan adalah housepower,

view(Auto)
Auto = Auto %>%  select(mpg, horsepower, displacement, weight)
datatable(Auto)

pertanyaan 1

Apakah ada bukti untuk hubungan non-linier untuk peubah-peubah yang anda pilih? Buat visualisasi untuk mendukung jawaban anda

#Visualisasi MPG-Horsepower

plot(Auto$horsepower, Auto$mpg,pch=19, col="#0000ff",
     xlab="Horse Power", ylab="mile per gallon")

Ini merupakan scatter plot dari 392 kendaraan dimana dari masing-masing kendaraan diukur dua variabel, yaitu horse power pada sumbu X dan berapa jauh kendaraan tersebut berjalan untuk setiap galon bahan bakarnya, mile per gallon pada sumbu y. Secara umum,kendaraan dengan horse power yang besar membutuhkan bahan bakar yang banyak yang dicirikan jarak tempuh yang semakin pendek ketika konsumsi bahan bakarnya satu galon. Yang paling kanan, kekuatan mesinnya besar, konsumsi bahan bakar besar karena jarak tempuhnya kecil. Jika kita amati lebih detail dari scatter plot nya, pola data ini cenderung tidak linier, ada beberapa pendekatan yang bisa digunakan.

#Visualisasi MPG-Displacement

plot(Auto$displacement, Auto$mpg,pch=19, col="#0000ff",
     xlab="displacement", ylab="mile per gallon")

Ini merupakan scatter plot dari 392 kendaraan dimana dari masing-masing kendaraan diukur dua variabel, yaitu displacement pada sumbu X dan berapa jauh kendaraan tersebut berjalan untuk setiap galon bahan bakarnya, mile per gallon pada sumbu y. Secara umum,kendaraan dengan displacement yang besar membutuhkan bahan bakar yang banyak yang dicirikan jarak tempuh yang semakin pendek ketika konsumsi bahan bakarnya satu galon. Yang paling kanan, displacementnya besar, konsumsi bahan bakar besar karena jarak tempuhnya kecil. Jika kita amati lebih detail dari scatter plot nya, pola data ini cenderung tidak linier, ada beberapa pendekatan yang bisa digunakan.

#Visualisasi MPG-Weight

plot(Auto$weight, Auto$mpg,pch=19, col="#0000ff",
     xlab="weight", ylab="mile per gallon")

Ini merupakan scatter plot dari 392 kendaraan dimana dari masing-masing kendaraan diukur dua variabel, yaitu weight pada sumbu X dan berapa jauh kendaraan tersebut berjalan untuk setiap galon bahan bakarnya, mile per gallon pada sumbu y. Secara umum,kendaraan dengan weight yang besar membutuhkan bahan bakar yang banyak yang dicirikan jarak tempuh yang semakin pendek ketika konsumsi bahan bakarnya satu galon. Yang paling kanan, kekuatan mesinnya besar, konsumsi bahan bakar besar karena jarak tempuhnya kecil. Jika kita amati lebih detail dari scatter plot nya, pola data ini cenderung tidak linier, ada beberapa pendekatan yang bisa digunakan.

pertanyaan 2

tentukan model non-linear terbaik untuk masing pasangan peubah (secara visual dan/atau secara empiris)

MPG-Horsepower

Regresi Polinomial Cross validation untuk menghasilkan pemodelan mpg vs horsepower optimal dengan regresi polinomial.

set.seed(123)
cv.poly1 <- function(dataset){
  set.seed(123)
  cross_val <- vfold_cv(dataset,v=5,strata = "mpg")
  metric_poly1 <- map_dfr(cross_val$splits,
                          function(x){  
                            mod <- lm(mpg ~ poly(horsepower,1,raw = T),
                                      data=dataset[x$in_id,]) 
                            pred <- predict(mod,
                                            newdata=dataset[-x$in_id,]) 
                            truth <- dataset[-x$in_id,]$mpg
                            rmse <- mlr3measures::rmse(truth = truth,response = pred)
                            mae <- mlr3measures::mae(truth = truth,response = pred)
                            metric <- c(rmse,mae) 
                            names(metric) <- c("rmse","mae")
                            return(metric)
                          }
  )
  RegLin<- colMeans(metric_poly1) #menghitung rmse dan mae rata-rata untuk 5 folds
  return(RegLin) 
}

cv.poly2 <- function(dataset){
  set.seed(123)
  cross_val <- vfold_cv(dataset,v=5,strata = "mpg")
  metric_poly2 <- map_dfr(cross_val$splits,
                          function(x){  
                            mod <- lm(mpg ~ poly(horsepower,2,raw = T),
                                      data=dataset[x$in_id,]) 
                            pred <- predict(mod,
                                            newdata=dataset[-x$in_id,]) 
                            truth <- dataset[-x$in_id,]$mpg
                            rmse <- mlr3measures::rmse(truth = truth,response = pred)
                            mae <- mlr3measures::mae(truth = truth,response = pred)
                            metric <- c(rmse,mae) 
                            names(metric) <- c("rmse","mae")
                            return(metric)
                          }
  )
  poly2<- colMeans(metric_poly2) #menghitung rmse dan mae rata-rata untuk 10 folds
  return(poly2) 
}


cv.poly3 <- function(dataset){
  set.seed(123)
  cross_val <- vfold_cv(dataset,v=5,strata = "mpg")
  metric_poly3 <- map_dfr(cross_val$splits,
                          function(x){  
                            mod <- lm(mpg ~ poly(horsepower,3,raw = T),
                                      data=dataset[x$in_id,]) 
                            pred <- predict(mod,
                                            newdata=dataset[-x$in_id,]) 
                            truth <- dataset[-x$in_id,]$mpg
                            rmse <- mlr3measures::rmse(truth = truth,response = pred)
                            mae <- mlr3measures::mae(truth = truth,response = pred)
                            metric <- c(rmse,mae) 
                            names(metric) <- c("rmse","mae")
                            return(metric)
                          }
  )
  poly3<- colMeans(metric_poly3) #menghitung rmse dan mae rata-rata untuk 5 folds
  return(poly3)
}

cv.poly4 <- function(dataset){
  set.seed(123)
  cross_val <- vfold_cv(dataset,v=5,strata = "mpg")
  metric_poly4 <- map_dfr(cross_val$splits,
                          function(x){  
                            mod <- lm(mpg ~ poly(horsepower,4,raw = T),
                                      data=dataset[x$in_id,]) 
                            pred <- predict(mod,
                                            newdata=dataset[-x$in_id,]) 
                            truth <- dataset[-x$in_id,]$mpg
                            rmse <- mlr3measures::rmse(truth = truth,response = pred)
                            mae <- mlr3measures::mae(truth = truth,response = pred)
                            metric <- c(rmse,mae) 
                            names(metric) <- c("rmse","mae")
                            return(metric)
                          }
  )
  poly4<- colMeans(metric_poly4) #menghitung rmse dan mae rata-rata untuk 5 folds
  return(poly4) 
}
cvpoly <- rbind (cv.poly1(Auto),cv.poly2(Auto),cv.poly3(Auto),cv.poly4(Auto))
cvpoly
##          rmse      mae
## [1,] 4.898592 3.835773
## [2,] 4.370631 3.284748
## [3,] 4.372623 3.280150
## [4,] 4.373708 3.288706

Secara empiris jika dilihat dari rmse dan mae terlihat bahwa polinomial pangkat 2 memberikan nilai rmse dan mae yang terkecil sehingga dipilih polinomial pangkat 2.

poly1 <- ggplot(Auto,aes(x=horsepower, y=mpg)) + 
  geom_point(alpha=0.55, color="black") +
  stat_smooth(method = "lm", 
               formula = y~poly(x,1,raw=T), 
               lty = 1, col = "blue",se = F)+
  theme_bw()


poly2 <- ggplot(Auto,aes(x=horsepower, y=mpg)) + 
  geom_point(alpha=0.55, color="black") +
  stat_smooth(method = "lm", 
               formula = y~poly(x,2,raw=T), 
               lty = 1, col = "blue",se = F)+
  theme_bw()

poly3 <- ggplot(Auto,aes(x=horsepower, y=mpg)) + 
  geom_point(alpha=0.55, color="black") +
  stat_smooth(method = "lm", 
               formula = y~poly(x,3,raw=T), 
               lty = 1, col = "blue",se = F)+
  theme_bw()

poly4 <- ggplot(Auto,aes(x=horsepower, y=mpg)) + 
  geom_point(alpha=0.55, color="black") +
  stat_smooth(method = "lm", 
               formula = y~poly(x,4,raw=T), 
               lty = 1, col = "blue",se = F)+
  theme_bw()

plot_grid(poly1, poly2, poly3, poly4, labels=c("RegLin","poly2", "poly3", "poly4"), label_size = 6)

Fungsi Tangga

set.seed(123)

cv.pc <- function(dataset){
  set.seed(123)
  cross_val <- vfold_cv(dataset,v=5,strata = "mpg")

breaks <- 3:12

best_tangga <- map_dfr(breaks, function(i){

metric_tangga <- map_dfr(cross_val$splits,
    function(x){

training <- dataset[x$in_id,]
training$horsepower <- cut(training$horsepower,i)

mod <- lm(mpg ~ horsepower,
         data=training)

labs_x <- levels(mod$model[,2])

labs_x_breaks <- cbind(lower = as.numeric( sub("\\((.+),.*", "\\1", labs_x) ),
                  upper = as.numeric( sub("[^,]*,([^]]*)\\]", "\\1", labs_x) ))


testing <- dataset[-x$in_id,]

horsepower_new <- cut(testing$horsepower,c(labs_x_breaks[1,1],labs_x_breaks[,2]))

pred <- predict(mod,
               newdata=list(horsepower=horsepower_new))
truth <- testing$mpg

data_eval <- na.omit(data.frame(truth,pred))

rmse <- mlr3measures::rmse(truth = data_eval$truth,
                           response = data_eval$pred
                           )
mae <- mlr3measures::mae(truth = data_eval$truth,
                           response = data_eval$pred
                           )
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
colMeans(metric_tangga) # menghitung rata-rata untuk 5 folds
}
)
best_tangga <- cbind(breaks=breaks,best_tangga) # menampilkan hasil all breaks
basedonrmse <- as.data.frame(best_tangga %>% slice_min(rmse))
basedonmae <- as.data.frame(best_tangga %>% slice_min(mae))
return(rbind(basedonrmse,basedonmae))
}
cv.pc(Auto)
##   breaks     rmse      mae
## 1      8 4.492694 3.419407
## 2     12 4.552770 3.372860

Secara empiris berdasarkan rmse banyaknya interval yang terbaik adalah 8 sedangkan berdasarkan mae banyaknya interval terbaik adalah 12.

pc8 <- ggplot(Auto,aes(x=horsepower, y=mpg)) +
                 geom_point(alpha=0.55, color="black") +
  stat_smooth(method = "lm", 
               formula = y~cut(x,8), 
               lty = 1, col = "blue",se = F)+
  theme_bw()

pc12 <- ggplot(Auto,aes(x=horsepower, y=mpg)) +
                 geom_point(alpha=0.55, color="black") +
  stat_smooth(method = "lm", 
               formula = y~cut(x,12), 
               lty = 1, col = "blue",se = F)+
  theme_bw()


plot_grid(pc8, pc12,  labels = c( "pc8", "pc12"), label_size = 10)

Sebelumnya secara empiris berdasarkan rmse banyaknya interval yang terbaik adalah 8 sedangkan berdasarkan mae banyaknya interval terbaik adalah 12. Jika dilihat berdasarkan visual gambar terlihat bahwa untuk interval 12, kurva dari fungsi tangga ini akan semakin mulus (anak tangganya kecil-kecil karena anak tangganya banyak) dan membentuk pola yang mengikuti data tetapi ada bentuk2 yang menjadi tidak teratur. Jadi, pada metode piecewise constant ini dipilih interval 8.

Natural Cubic Spline

set.seed(123)
cross.val <- vfold_cv(Auto,v=5,strata = "mpg")
df <- 1:12
best.spline3 <- map_dfr(df, function(i){
metric.spline3 <- map_dfr(cross.val$splits,
                  function(x){
                  mod <- lm(mpg ~ ns(horsepower,df=i),data=Auto[x$in_id,])
                  pred <- predict(mod,newdata=Auto[-x$in_id,])
                  truth <- Auto[-x$in_id,]$mpg
                  rmse <- mlr3measures::rmse(truth = truth,response = pred)
                  mae <- mlr3measures::mae(truth = truth,response = pred)
                  metric <- c(rmse,mae)
                  names(metric) <- c("rmse","mae")
                  return(metric)
                }
      )
    metric.spline3
    mean.metric.spline3 <- colMeans(metric.spline3) #rata-rata untuk 5 folds
    mean.metric.spline3
  }
)


best.spline3 <- cbind(df=df,best.spline3)
basis<-data.frame(rbind(best.spline3 %>% slice_min(rmse), #berdasarkan rmse
                 best.spline3 %>% slice_min(mae))) #berdasarkan mae

basis
##   df     rmse     mae
## 1 10 4.291769 3.23118
## 2 10 4.291769 3.23118
df_10<-attr(ns(Auto$horsepower, df=10),"knots")
df_10
##   10%   20%   30%   40%   50%   60%   70%   80%   90% 
##  67.0  72.0  80.0  88.0  93.5 100.0 110.0 140.0 157.7
set.seed(123)

cross.val <- vfold_cv(Auto,v=5,strata = "mpg")

metric.spline3.ns4 <- map_dfr(cross.val$splits,
    function(x){
mod <- lm(mpg ~ ns(horsepower, knots = c(80, 120, 160, 200)),
         data=Auto[x$in_id,])
pred <- predict(mod,
               newdata=Auto[-x$in_id,])
truth <- Auto[-x$in_id,]$mpg

rmse <- mlr3measures::rmse(truth = truth,
                           response = pred
                           )
mae <- mlr3measures::mae(truth = truth,
                           response = pred
                           )
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
mean.metric.spline3.ns4 <- colMeans(metric.spline3.ns4)# menghitung rata-rata untuk 5 folds



metric.spline3.ns9 <- map_dfr(cross.val$splits,
    function(x){
mod <- lm(mpg ~ ns(horsepower, knots = c(67,72,80,88,93.5,100,110,140,157.7)),
         data=Auto[x$in_id,])
pred <- predict(mod,
               newdata=Auto[-x$in_id,])
truth <- Auto[-x$in_id,]$mpg

rmse <- mlr3measures::rmse(truth = truth,
                           response = pred
                           )
mae <- mlr3measures::mae(truth = truth,
                           response = pred
                           )
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
mean.metric.spline3.ns9 <- colMeans(metric.spline3.ns9)# menghitung rata-rata untuk 5 folds



nilai.cv.ns <- rbind("NCS4" = mean.metric.spline3.ns4,
                     "NCS9" = mean.metric.spline3.ns9)

nama.model.ncs <- c("4 - NCS Knots 80, 120, 160, 200",
                    "9 - NCS (df=10)")

model.ncs<-data.frame(Model = nama.model.ncs, nilai.cv.ns)

model.ncs
##                                Model     rmse      mae
## NCS4 4 - NCS Knots 80, 120, 160, 200 4.381416 3.296385
## NCS9                 9 - NCS (df=10) 4.293696 3.233161

Secara empiris berdasarkan rmse dan mae, banyaknya knot terbaik adalah 9 knot, yaitu knot 67,72,80,88,93.5,100,110,140,157.7.

nc4 <- ggplot(Auto, aes(x = horsepower, y = mpg)) + 
  geom_point(alpha = 0.55, color="black") + 
  stat_smooth(method = "lm", formula = y~ns(x, knots = c(80, 120, 160, 200)), col = "blue", se = F) +
  theme_bw()


nc9 <- ggplot(Auto, aes(x = horsepower, y = mpg)) + 
  geom_point(alpha = 0.55, color="black") + 
  stat_smooth(method = "lm", formula = y~ns(x, knots = c(67,72,80,88,93.5,100,110,140,157.7)), col = "blue", se = F)+
  theme_bw()

plot_grid(nc4, nc9, labels = c("nc4", "nc9"), label_size = 10)

Secara empiris berdasarkan rmse dan mae, grafik yang memuat banyaknya 9 knot lebih dapat menggambarkan data dibanding dengan 4 knot. Oleh karena itu dipilih 9 knot.

Perbandingan Model

perbandingan.model <- rbind(cv.poly2(Auto), cv.pc(Auto)[1,-1], model.ncs[2,-1])
perbandingan.model$metode <- c("Polinomial Ordo 2","Piecewise Constant 8","NCS 9 Knot")
perbandingan.model
##          rmse      mae               metode
## 1    4.370631 3.284748    Polinomial Ordo 2
## 2    4.492694 3.419407 Piecewise Constant 8
## NCS9 4.293696 3.233161           NCS 9 Knot

Berarti, model terbaik adalah NCS dengan 9 knot.

poly2 <- ggplot(Auto,aes(x=horsepower, y=mpg), add=TRUE) + 
  geom_point(alpha=0.55, color="black") +
  stat_smooth(method = "lm", 
               formula = y~poly(x,2,raw=T), 
               lty = 1, col = "blue",se = F)+
  theme_bw()


pc8 <- ggplot(Auto,aes(x=horsepower, y=mpg), add=TRUE) +
                 geom_point(alpha=0.55, color="black") +
  stat_smooth(method = "lm", 
               formula = y~cut(x,8), 
               lty = 1, col = "blue",se = F)+
  theme_bw()

nc9 <- ggplot(Auto, aes(x = horsepower, y = mpg), add=TRUE) + 
  geom_point(alpha = 0.55, color="black") + 
  stat_smooth(method = "lm", formula = y~ns(x, knots = c(67,72,80,88,93.5,100,110,140,157.7)), col = "blue", se = F)+
  theme_bw()

plot_grid(poly2, pc8, nc9, labels = c("poly2","pc8", "nc9"), label_size = 6)

MPG-Displacement

Regresi Polinomial

Cross validation untuk menghasilkan pemodelan mpg vs displacement optimal dengan regresi polinomial.

set.seed(123)
cv.poly5 <- function(dataset){
  set.seed(123)
  cross_val <- vfold_cv(dataset,v=5,strata = "mpg")
  metric_poly1 <- map_dfr(cross_val$splits,
                          function(x){  
                            mod <- lm(mpg ~ poly(displacement,1,raw = T),
                                      data=dataset[x$in_id,]) 
                            pred <- predict(mod,
                                            newdata=dataset[-x$in_id,]) 
                            truth <- dataset[-x$in_id,]$mpg
                            rmse <- mlr3measures::rmse(truth = truth,response = pred)
                            mae <- mlr3measures::mae(truth = truth,response = pred)
                            metric <- c(rmse,mae) 
                            names(metric) <- c("rmse","mae")
                            return(metric)
                          }
  )
  RegLin<- colMeans(metric_poly1) #menghitung rmse dan mae rata-rata untuk 5 folds
  return(RegLin) 
}

cv.poly6 <- function(dataset){
  set.seed(123)
  cross_val <- vfold_cv(dataset,v=5,strata = "mpg")
  metric_poly2 <- map_dfr(cross_val$splits,
                          function(x){  
                            mod <- lm(mpg ~ poly(displacement,2,raw = T),
                                      data=dataset[x$in_id,]) 
                            pred <- predict(mod,
                                            newdata=dataset[-x$in_id,]) 
                            truth <- dataset[-x$in_id,]$mpg
                            rmse <- mlr3measures::rmse(truth = truth,response = pred)
                            mae <- mlr3measures::mae(truth = truth,response = pred)
                            metric <- c(rmse,mae) 
                            names(metric) <- c("rmse","mae")
                            return(metric)
                          }
  )
  poly2<- colMeans(metric_poly2) #menghitung rmse dan mae rata-rata untuk 10 folds
  return(poly2) 
}


cv.poly7 <- function(dataset){
  set.seed(123)
  cross_val <- vfold_cv(dataset,v=5,strata = "mpg")
  metric_poly3 <- map_dfr(cross_val$splits,
                          function(x){  
                            mod <- lm(mpg ~ poly(displacement,3,raw = T),
                                      data=dataset[x$in_id,]) 
                            pred <- predict(mod,
                                            newdata=dataset[-x$in_id,]) 
                            truth <- dataset[-x$in_id,]$mpg
                            rmse <- mlr3measures::rmse(truth = truth,response = pred)
                            mae <- mlr3measures::mae(truth = truth,response = pred)
                            metric <- c(rmse,mae) 
                            names(metric) <- c("rmse","mae")
                            return(metric)
                          }
  )
  poly3<- colMeans(metric_poly3) #menghitung rmse dan mae rata-rata untuk 5 folds
  return(poly3)
}

cv.poly8 <- function(dataset){
  set.seed(123)
  cross_val <- vfold_cv(dataset,v=5,strata = "mpg")
  metric_poly4 <- map_dfr(cross_val$splits,
                          function(x){  
                            mod <- lm(mpg ~ poly(displacement,4,raw = T),
                                      data=dataset[x$in_id,]) 
                            pred <- predict(mod,
                                            newdata=dataset[-x$in_id,]) 
                            truth <- dataset[-x$in_id,]$mpg
                            rmse <- mlr3measures::rmse(truth = truth,response = pred)
                            mae <- mlr3measures::mae(truth = truth,response = pred)
                            metric <- c(rmse,mae) 
                            names(metric) <- c("rmse","mae")
                            return(metric)
                          }
  )
  poly4<- colMeans(metric_poly4) #menghitung rmse dan mae rata-rata untuk 5 folds
  return(poly4) 
}
cvpoly <- rbind (cv.poly5(Auto),cv.poly6(Auto),cv.poly7(Auto),cv.poly8(Auto))
cvpoly
##          rmse      mae
## [1,] 4.583160 3.516231
## [2,] 4.304907 3.159841
## [3,] 4.312431 3.170241
## [4,] 4.319214 3.164415

Secara empiris jika dilihat dari rmse dan mae terlihat bahwa polinomial pangkat 2 memberikan nilai rmse dan mae yang terkecil sehingga dipilih polinomial pangkat 2.

poly1 <- ggplot(Auto,aes(x=displacement, y=mpg)) + 
  geom_point(alpha=0.55, color="black") +
  stat_smooth(method = "lm", 
              formula = y~poly(x,1,raw=T), 
              lty = 1, col = "blue",se = F)+
  theme_bw()


poly2 <- ggplot(Auto,aes(x=displacement, y=mpg)) + 
  geom_point(alpha=0.55, color="black") +
  stat_smooth(method = "lm", 
              formula = y~poly(x,2,raw=T), 
              lty = 1, col = "blue",se = F)+
  theme_bw()

poly3 <- ggplot(Auto,aes(x=displacement, y=mpg)) + 
  geom_point(alpha=0.55, color="black") +
  stat_smooth(method = "lm", 
              formula = y~poly(x,3,raw=T), 
              lty = 1, col = "blue",se = F)+
  theme_bw()

poly4 <- ggplot(Auto,aes(x=displacement, y=mpg)) + 
  geom_point(alpha=0.55, color="black") +
  stat_smooth(method = "lm", 
              formula = y~poly(x,4,raw=T), 
              lty = 1, col = "blue",se = F)+
  theme_bw()

plot_grid(poly1, poly2, poly3, poly4, labels=c("RegLin","poly2", "poly3", "poly4"), label_size = 6)

Natural Cubic Spline

set.seed(123)
cross.val <- vfold_cv(Auto,v=5,strata = "mpg")
df <- 1:12
best.spline3 <- map_dfr(df, function(i){
  metric.spline3 <- map_dfr(cross.val$splits,
                            function(x){
                              mod <- lm(mpg ~ ns(displacement,df=i),data=Auto[x$in_id,])
                              pred <- predict(mod,newdata=Auto[-x$in_id,])
                              truth <- Auto[-x$in_id,]$mpg
                              rmse <- mlr3measures::rmse(truth = truth,response = pred)
                              mae <- mlr3measures::mae(truth = truth,response = pred)
                              metric <- c(rmse,mae)
                              names(metric) <- c("rmse","mae")
                              return(metric)
                            }
  )
  metric.spline3
  mean.metric.spline3 <- colMeans(metric.spline3) #rata-rata untuk 5 folds
  mean.metric.spline3
}
)


best.spline3 <- cbind(df=df,best.spline3)
basis<-data.frame(rbind(best.spline3 %>% slice_min(rmse), #berdasarkan rmse
                        best.spline3 %>% slice_min(mae))) #berdasarkan mae

basis
##   df     rmse      mae
## 1 12 4.096341 3.044499
## 2 12 4.096341 3.044499
df_12<-attr(ns(Auto$displacement, df=10),"knots")
df_12
## 10% 20% 30% 40% 50% 60% 70% 80% 90% 
##  90  98 112 122 151 225 250 305 350
set.seed(123)

cross.val <- vfold_cv(Auto,v=5,strata = "mpg")

metric.spline3.ns4 <- map_dfr(cross.val$splits,
                              function(x){
                                mod <- lm(mpg ~ ns(displacement, knots = c(80, 120, 160, 200)),
                                          data=Auto[x$in_id,])
                                pred <- predict(mod,
                                                newdata=Auto[-x$in_id,])
                                truth <- Auto[-x$in_id,]$mpg
                                
                                rmse <- mlr3measures::rmse(truth = truth,
                                                           response = pred
                                )
                                mae <- mlr3measures::mae(truth = truth,
                                                         response = pred
                                )
                                metric <- c(rmse,mae)
                                names(metric) <- c("rmse","mae")
                                return(metric)
                              }
)
mean.metric.spline3.ns4 <- colMeans(metric.spline3.ns4)# menghitung rata-rata untuk 5 folds



metric.spline3.ns9 <- map_dfr(cross.val$splits,
                              function(x){
                                mod <- lm(mpg ~ ns(displacement, knots = c(90,98,112,122,151,225,250,305,350)),
                                          data=Auto[x$in_id,])
                                pred <- predict(mod,
                                                newdata=Auto[-x$in_id,])
                                truth <- Auto[-x$in_id,]$mpg
                                
                                rmse <- mlr3measures::rmse(truth = truth,
                                                           response = pred
                                )
                                mae <- mlr3measures::mae(truth = truth,
                                                         response = pred
                                )
                                metric <- c(rmse,mae)
                                names(metric) <- c("rmse","mae")
                                return(metric)
                              }
)
mean.metric.spline3.ns9 <- colMeans(metric.spline3.ns9)# menghitung rata-rata untuk 5 folds



nilai.cv.ns <- rbind("NCS4" = mean.metric.spline3.ns4,
                     "NCS9" = mean.metric.spline3.ns9)

nama.model.ncs <- c("4 - NCS Knots 80, 120, 160, 200",
                    "9 - NCS (df=10)")

model.ncs<-data.frame(Model = nama.model.ncs, nilai.cv.ns)

model.ncs
##                                Model     rmse      mae
## NCS4 4 - NCS Knots 80, 120, 160, 200 4.200113 3.122310
## NCS9                 9 - NCS (df=10) 4.188001 3.143539

Secara empiris berdasarkan rmse banyaknya knot terbaik adalah 9 knot dan mae banyaknya knot terbaik adalah 4 knot.

nc4 <- ggplot(Auto, aes(x = displacement, y = mpg)) + 
  geom_point(alpha = 0.55, color="black") + 
  stat_smooth(method = "lm", formula = y~ns(x, knots = c(80, 120, 160, 200)), col = "blue", se = F) +
  theme_bw()


nc9 <- ggplot(Auto, aes(x = displacement, y = mpg)) + 
  geom_point(alpha = 0.55, color="black") + 
  stat_smooth(method = "lm", formula = y~ns(x, knots = c(67,72,80,88,93.5,100,110,140,157.7)), col = "blue", se = F)+
  theme_bw()

plot_grid(nc4, nc9, labels = c("nc4", "nc9"), label_size = 10)
## Warning in predict.lm(model, newdata = new_data_frame(list(x = xseq)), se.fit =
## se, : prediction from a rank-deficient fit may be misleading

Secara empiris berdasarkan rmse dan mae, grafik yang memuat banyaknya 4 knot lebih dapat menggambarkan data dibanding dengan 9 knot. Oleh karena itu dipilih 4 knot.

Fungsi Tangga

set.seed(123)

cv.pc2 <- function(dataset){
  set.seed(123)
  cross_val <- vfold_cv(dataset,v=5,strata = "mpg")
  
  breaks <- 2:6
  
  best_tangga <- map_dfr(breaks, function(i){
    
    metric_tangga <- map_dfr(cross_val$splits,
                             function(x){
                               
                               training <- dataset[x$in_id,]
                               training$displacement <- cut(training$displacement,i)
                               
                               mod <- lm(mpg ~ displacement,
                                         data=training)
                               
                               labs_x <- levels(mod$model[,2])
                               
                               labs_x_breaks <- cbind(lower = as.numeric( sub("\\((.+),.*", "\\1", labs_x) ),
                                                      upper = as.numeric( sub("[^,]*,([^]]*)\\]", "\\1", labs_x) ))
                               
                               
                               testing <- dataset[-x$in_id,]
                               
                               displacement_new <- cut(testing$displacement,c(labs_x_breaks[1,1],labs_x_breaks[,2]))
                               
                               pred <- predict(mod,
                                               newdata=list(displacement=displacement_new))
                               truth <- testing$mpg
                               
                               data_eval <- na.omit(data.frame(truth,pred))
                               
                               rmse <- mlr3measures::rmse(truth = data_eval$truth,
                                                          response = data_eval$pred
                               )
                               mae <- mlr3measures::mae(truth = data_eval$truth,
                                                        response = data_eval$pred
                               )
                               metric <- c(rmse,mae)
                               names(metric) <- c("rmse","mae")
                               return(metric)
                             }
    )
    colMeans(metric_tangga) # menghitung rata-rata untuk 5 folds
  }
  )
  best_tangga <- cbind(breaks=breaks,best_tangga) # menampilkan hasil all breaks
  basedonrmse <- as.data.frame(best_tangga %>% slice_min(rmse))
  basedonmae <- as.data.frame(best_tangga %>% slice_min(mae))
  return(rbind(basedonrmse,basedonmae))
}
cv.pc2(Auto)
##   breaks     rmse     mae
## 1      6 4.603742 3.40405
## 2      6 4.603742 3.40405

Secara empiris berdasarkan rmse dan mae banyaknya interval terbaik adalah 6.

pc6 <- ggplot(Auto,aes(x=displacement, y=mpg)) +
  geom_point(alpha=0.55, color="black") +
  stat_smooth(method = "lm", 
              formula = y~cut(x,6), 
              lty = 1, col = "blue",se = F)+
  theme_bw()

pc6

Perbandingan Model

perbandingan.model <- rbind(cv.poly6(Auto), cv.pc2(Auto)[1,-1], model.ncs[1,-1])
perbandingan.model$metode <- c("Polinomial Ordo 2","Piecewise Constant 6","NCS 4 Knot")
perbandingan.model
##          rmse      mae               metode
## 1    4.304907 3.159841    Polinomial Ordo 2
## 2    4.603742 3.404050 Piecewise Constant 6
## NCS4 4.200113 3.122310           NCS 4 Knot

Berarti, model terbaik adalah NCS dengan 4 knot.

poly2 <- ggplot(Auto,aes(x=displacement, y=mpg), add=TRUE) + 
  geom_point(alpha=0.55, color="black") +
  stat_smooth(method = "lm", 
              formula = y~poly(x,2,raw=T), 
              lty = 1, col = "blue",se = F)+
  theme_bw()


pc6 <- ggplot(Auto,aes(x=displacement, y=mpg)) +
  geom_point(alpha=0.55, color="black") +
  stat_smooth(method = "lm", 
              formula = y~cut(x,6), 
              lty = 1, col = "blue",se = F)+
  theme_bw()

nc4 <- ggplot(Auto, aes(x = displacement, y = mpg)) + 
  geom_point(alpha = 0.55, color="black") + 
  stat_smooth(method = "lm", formula = y~ns(x, knots = c(80, 120, 160, 200)), col = "blue", se = F) +
  theme_bw()

plot_grid(poly2, pc6, nc4, labels = c("poly2","pc6", "nc4"), label_size = 6)

yang dibingungkan, pada gambar justru lebih baik resresi polinomial, yang menjadi bahan untuk dipelajari lebih lanjut

MPG-weight

Regresi Polinomial

Cross validation untuk menghasilkan pemodelan mpg vs weight optimal dengan regresi polinomial.

set.seed(123)
cv.poly1 <- function(dataset){
  set.seed(123)
  cross_val <- vfold_cv(dataset,v=5,strata = "mpg")
  metric_poly1 <- map_dfr(cross_val$splits,
                          function(x){  
                            mod <- lm(mpg ~ poly(weight,1,raw = T),
                                      data=dataset[x$in_id,]) 
                            pred <- predict(mod,
                                            newdata=dataset[-x$in_id,]) 
                            truth <- dataset[-x$in_id,]$mpg
                            rmse <- mlr3measures::rmse(truth = truth,response = pred)
                            mae <- mlr3measures::mae(truth = truth,response = pred)
                            metric <- c(rmse,mae) 
                            names(metric) <- c("rmse","mae")
                            return(metric)
                          }
  )
  RegLin<- colMeans(metric_poly1) #menghitung rmse dan mae rata-rata untuk 5 folds
  return(RegLin) 
}

cv.poly2 <- function(dataset){
  set.seed(123)
  cross_val <- vfold_cv(dataset,v=5,strata = "mpg")
  metric_poly2 <- map_dfr(cross_val$splits,
                          function(x){  
                            mod <- lm(mpg ~ poly(weight,2,raw = T),
                                      data=dataset[x$in_id,]) 
                            pred <- predict(mod,
                                            newdata=dataset[-x$in_id,]) 
                            truth <- dataset[-x$in_id,]$mpg
                            rmse <- mlr3measures::rmse(truth = truth,response = pred)
                            mae <- mlr3measures::mae(truth = truth,response = pred)
                            metric <- c(rmse,mae) 
                            names(metric) <- c("rmse","mae")
                            return(metric)
                          }
  )
  poly2<- colMeans(metric_poly2) #menghitung rmse dan mae rata-rata untuk 10 folds
  return(poly2) 
}


cv.poly3 <- function(dataset){
  set.seed(123)
  cross_val <- vfold_cv(dataset,v=5,strata = "mpg")
  metric_poly3 <- map_dfr(cross_val$splits,
                          function(x){  
                            mod <- lm(mpg ~ poly(weight,3,raw = T),
                                      data=dataset[x$in_id,]) 
                            pred <- predict(mod,
                                            newdata=dataset[-x$in_id,]) 
                            truth <- dataset[-x$in_id,]$mpg
                            rmse <- mlr3measures::rmse(truth = truth,response = pred)
                            mae <- mlr3measures::mae(truth = truth,response = pred)
                            metric <- c(rmse,mae) 
                            names(metric) <- c("rmse","mae")
                            return(metric)
                          }
  )
  poly3<- colMeans(metric_poly3) #menghitung rmse dan mae rata-rata untuk 5 folds
  return(poly3)
}

cv.poly4 <- function(dataset){
  set.seed(123)
  cross_val <- vfold_cv(dataset,v=5,strata = "mpg")
  metric_poly4 <- map_dfr(cross_val$splits,
                          function(x){  
                            mod <- lm(mpg ~ poly(weight,4,raw = T),
                                      data=dataset[x$in_id,]) 
                            pred <- predict(mod,
                                            newdata=dataset[-x$in_id,]) 
                            truth <- dataset[-x$in_id,]$mpg
                            rmse <- mlr3measures::rmse(truth = truth,response = pred)
                            mae <- mlr3measures::mae(truth = truth,response = pred)
                            metric <- c(rmse,mae) 
                            names(metric) <- c("rmse","mae")
                            return(metric)
                          }
  )
  poly4<- colMeans(metric_poly4) #menghitung rmse dan mae rata-rata untuk 5 folds
  return(poly4) 
}
cvpoly <- rbind (cv.poly1(Auto),cv.poly2(Auto),cv.poly3(Auto),cv.poly4(Auto))
cvpoly
##          rmse      mae
## [1,] 4.310464 3.280982
## [2,] 4.150345 3.054677
## [3,] 4.161762 3.060332
## [4,] 4.172705 3.065323

Secara empiris jika dilihat dari rmse dan mae terlihat bahwa polinomial pangkat 2 memberikan nilai rmse dan mae yang terkecil sehingga dipilih polinomial pangkat 2.

poly1 <- ggplot(Auto,aes(x=weight, y=mpg)) + 
  geom_point(alpha=0.55, color="black") +
  stat_smooth(method = "lm", 
              formula = y~poly(x,1,raw=T), 
              lty = 1, col = "blue",se = F)+
  theme_bw()


poly2 <- ggplot(Auto,aes(x=weight, y=mpg)) + 
  geom_point(alpha=0.55, color="black") +
  stat_smooth(method = "lm", 
              formula = y~poly(x,2,raw=T), 
              lty = 1, col = "blue",se = F)+
  theme_bw()

poly3 <- ggplot(Auto,aes(x=weight, y=mpg)) + 
  geom_point(alpha=0.55, color="black") +
  stat_smooth(method = "lm", 
              formula = y~poly(x,3,raw=T), 
              lty = 1, col = "blue",se = F)+
  theme_bw()

poly4 <- ggplot(Auto,aes(x=weight, y=mpg)) + 
  geom_point(alpha=0.55, color="black") +
  stat_smooth(method = "lm", 
              formula = y~poly(x,4,raw=T), 
              lty = 1, col = "blue",se = F)+
  theme_bw()

plot_grid(poly1, poly2, poly3, poly4, labels=c("RegLin","poly2", "poly3", "poly4"), label_size = 6)

Fungsi Tangga

set.seed(123)

cv.pc <- function(dataset){
  set.seed(123)
  cross_val <- vfold_cv(dataset,v=5,strata = "mpg")
  
  breaks <- 3:12
  
  best_tangga <- map_dfr(breaks, function(i){
    
    metric_tangga <- map_dfr(cross_val$splits,
                             function(x){
                               
                               training <- dataset[x$in_id,]
                               training$weight <- cut(training$weight,i)
                               
                               mod <- lm(mpg ~ weight,
                                         data=training)
                               
                               labs_x <- levels(mod$model[,2])
                               
                               labs_x_breaks <- cbind(lower = as.numeric( sub("\\((.+),.*", "\\1", labs_x) ),
                                                      upper = as.numeric( sub("[^,]*,([^]]*)\\]", "\\1", labs_x) ))
                               
                               
                               testing <- dataset[-x$in_id,]
                               
                               weight_new <- cut(testing$weight,c(labs_x_breaks[1,1],labs_x_breaks[,2]))
                               
                               pred <- predict(mod,
                                               newdata=list(weight=weight_new))
                               truth <- testing$mpg
                               
                               data_eval <- na.omit(data.frame(truth,pred))
                               
                               rmse <- mlr3measures::rmse(truth = data_eval$truth,
                                                          response = data_eval$pred
                               )
                               mae <- mlr3measures::mae(truth = data_eval$truth,
                                                        response = data_eval$pred
                               )
                               metric <- c(rmse,mae)
                               names(metric) <- c("rmse","mae")
                               return(metric)
                             }
    )
    colMeans(metric_tangga) # menghitung rata-rata untuk 5 folds
  }
  )
  best_tangga <- cbind(breaks=breaks,best_tangga) # menampilkan hasil all breaks
  basedonrmse <- as.data.frame(best_tangga %>% slice_min(rmse))
  basedonmae <- as.data.frame(best_tangga %>% slice_min(mae))
  return(rbind(basedonrmse,basedonmae))
}
cv.pc(Auto)
##   breaks     rmse      mae
## 1      6 4.248970 3.132424
## 2     12 4.255478 3.116471

Secara empiris berdasarkan rmse banyaknya interval yang terbaik adalah 6 sedangkan berdasarkan mae banyaknya interval terbaik adalah 12.

pc6 <- ggplot(Auto,aes(x=weight, y=mpg)) +
  geom_point(alpha=0.55, color="black") +
  stat_smooth(method = "lm", 
              formula = y~cut(x,6), 
              lty = 1, col = "blue",se = F)+
  theme_bw()

pc12 <- ggplot(Auto,aes(x=weight, y=mpg)) +
  geom_point(alpha=0.55, color="black") +
  stat_smooth(method = "lm", 
              formula = y~cut(x,12), 
              lty = 1, col = "blue",se = F)+
  theme_bw()


plot_grid(pc6, pc12,  labels = c( "pc6", "pc12"), label_size = 10)

Sebelumnya secara empiris berdasarkan rmse banyaknya interval yang terbaik adalah 8 sedangkan berdasarkan mae banyaknya interval terbaik adalah 12. Jika dilihat berdasarkan visual gambar terlihat bahwa untuk interval 12, kurva dari fungsi tangga ini akan semakin mulus (anak tangganya kecil-kecil karena anak tangganya banyak) dan membentuk pola yang mengikuti data tetapi ada bentuk2 yang menjadi tidak teratur. Jadi, pada metode piecewise constant ini dipilih interval 6.

Natural Cubic Spline

set.seed(123)
cross.val <- vfold_cv(Auto,v=5,strata = "mpg")
df <- 1:12
best.spline3 <- map_dfr(df, function(i){
  metric.spline3 <- map_dfr(cross.val$splits,
                            function(x){
                              mod <- lm(mpg ~ ns(weight,df=i),data=Auto[x$in_id,])
                              pred <- predict(mod,newdata=Auto[-x$in_id,])
                              truth <- Auto[-x$in_id,]$mpg
                              rmse <- mlr3measures::rmse(truth = truth,response = pred)
                              mae <- mlr3measures::mae(truth = truth,response = pred)
                              metric <- c(rmse,mae)
                              names(metric) <- c("rmse","mae")
                              return(metric)
                            }
  )
  metric.spline3
  mean.metric.spline3 <- colMeans(metric.spline3) #rata-rata untuk 5 folds
  mean.metric.spline3
}
)


best.spline3 <- cbind(df=df,best.spline3)
basis<-data.frame(rbind(best.spline3 %>% slice_min(rmse), #berdasarkan rmse
                        best.spline3 %>% slice_min(mae))) #berdasarkan mae

basis
##   df     rmse      mae
## 1  2 4.149920 3.054651
## 2  7 4.158297 3.048696
df_2<-attr(ns(Auto$weight, df=2),"knots")
df_2
##    50% 
## 2803.5
df_7<-attr(ns(Auto$weight, df=7),"knots")
df_7
## 14.28571% 28.57143% 42.85714% 57.14286% 71.42857% 85.71429% 
##  2074.857  2285.429  2635.000  2986.571  3446.143  4096.286
set.seed(123)

cross.val <- vfold_cv(Auto,v=5,strata = "mpg")

metric.spline3.ns2 <- map_dfr(cross.val$splits,
                              function(x){
                                mod <- lm(mpg ~ ns(weight, knots = c(2803.5
                                )),
                                          data=Auto[x$in_id,])
                                pred <- predict(mod,
                                                newdata=Auto[-x$in_id,])
                                truth <- Auto[-x$in_id,]$mpg
                                
                                rmse <- mlr3measures::rmse(truth = truth,
                                                           response = pred
                                )
                                mae <- mlr3measures::mae(truth = truth,
                                                         response = pred
                                )
                                metric <- c(rmse,mae)
                                names(metric) <- c("rmse","mae")
                                return(metric)
                              }
)
mean.metric.spline3.ns2 <- colMeans(metric.spline3.ns2)# menghitung rata-rata untuk 5 folds



metric.spline3.ns7 <- map_dfr(cross.val$splits,
                              function(x){
                                mod <- lm(mpg ~ ns(weight, knots = c(2074.857,2285.429,2635.000,2986.571,3446.143,4096.286)),
                                          data=Auto[x$in_id,])
                                pred <- predict(mod,
                                                newdata=Auto[-x$in_id,])
                                truth <- Auto[-x$in_id,]$mpg
                                
                                rmse <- mlr3measures::rmse(truth = truth,
                                                           response = pred
                                )
                                mae <- mlr3measures::mae(truth = truth,
                                                         response = pred
                                )
                                metric <- c(rmse,mae)
                                names(metric) <- c("rmse","mae")
                                return(metric)
                              }
)
mean.metric.spline3.ns7 <- colMeans(metric.spline3.ns7)# menghitung rata-rata untuk 5 folds



nilai.cv.ns <- rbind("NCS2" = mean.metric.spline3.ns2,
                     "NCS7" = mean.metric.spline3.ns7)

nama.model.ncs <- c("2 - NCS ",
                    "7 - NCS ")

model.ncs<-data.frame(Model = nama.model.ncs, nilai.cv.ns)

model.ncs
##         Model     rmse      mae
## NCS2 2 - NCS  4.150152 3.054797
## NCS7 7 - NCS  4.166053 3.056082

Secara empiris berdasarkan rmse dan mae, banyaknya knot terbaik adalah 2 knot.

nc2 <- ggplot(Auto, aes(x = weight, y = mpg)) + 
  geom_point(alpha = 0.55, color="black") + 
  stat_smooth(method = "lm", formula = y~ns(x, knots = c(2803.5)), col = "blue", se = F) +
  theme_bw()


nc7 <- ggplot(Auto, aes(x = weight, y = mpg)) + 
  geom_point(alpha = 0.55, color="black") + 
  stat_smooth(method = "lm", formula = y~ns(x, knots = c(2074.857,2285.429,2635.000,2986.571,3446.143,4096.286)), col = "blue", se = F)+
  theme_bw()

plot_grid(nc2, nc7, labels = c("nc2", "nc7"), label_size = 10)

Secara empiris berdasarkan rmse dan mae, grafik yang memuat banyaknya 2 knot lebih dapat menggambarkan data dibanding dengan 7 knot. Oleh karena itu dipilih 2 knot.

Perbandingan Model

perbandingan.model <- rbind(cv.poly2(Auto), cv.pc(Auto)[1,-1], model.ncs[1,-1])
perbandingan.model$metode <- c("Polinomial Ordo 2","Piecewise Constant 8","NCS 9 Knot")
perbandingan.model
##          rmse      mae               metode
## 1    4.150345 3.054677    Polinomial Ordo 2
## 2    4.248970 3.132424 Piecewise Constant 8
## NCS2 4.150152 3.054797           NCS 9 Knot

Berarti, model terbaik adalah NCS dengan 2 knot.

poly2 <- ggplot(Auto,aes(x=weight, y=mpg), add=TRUE) + 
  geom_point(alpha=0.55, color="black") +
  stat_smooth(method = "lm", 
               formula = y~poly(x,2,raw=T), 
               lty = 1, col = "blue",se = F)+
  theme_bw()


pc6 <- ggplot(Auto,aes(x=weight, y=mpg)) +
  geom_point(alpha=0.55, color="black") +
  stat_smooth(method = "lm", 
              formula = y~cut(x,6), 
              lty = 1, col = "blue",se = F)+
  theme_bw()

nc2 <- ggplot(Auto, aes(x = weight, y = mpg)) + 
  geom_point(alpha = 0.55, color="black") + 
  stat_smooth(method = "lm", formula = y~ns(x, knots = c(2803.5)), col = "blue", se = F) +
  theme_bw()

plot_grid(poly2, pc6, nc2, labels = c("poly2","pc6", "nc2"), label_size = 6)

Kesimpulan

setelah menentukan model non-linear untuk masing pasangan peubah, didapatkan bahwa pada pasangan MPG-Horsepower model yang terbaik adalah NCS dengan 9 knot, kemudian pada pasangan MPG-displacement model yang terbaik adalah NCS dengan 4 knot, dan pada pasangan MPG-weight model yang terbaik adalah NCS dengan 2 knot.