Tugas Praktikum 8 STA581

Deri Siswara G1501211036

2021-10-29

Soal

Data bisa diperoleh untuk tugas dapat diperoleh di package ISLR. Silahkan Install dulu packagenya.

Gunakan model-model non-linier yang sudah dipelajari pada data Auto dengan peubah respon mpg dan pilih tiga kolom untuk dijadikan peubah prediktor.

Apakah ada bukti untuk hubungan non-linier untuk peubah-peubah yang anda pilih? Buat visualisasi untuk Mendukung jawaban Anda.

Tentukan model non-linear terbaik untuk masing pasangan peubah (Secara visual dan/atau secara empiris)

library(tidyverse)
library(splines)
library(ISLR)
library(ggplot2)
library(rsample)
library(caret)
head(Auto)
  mpg cylinders displacement horsepower weight acceleration year origin
1  18         8          307        130   3504         12.0   70      1
2  15         8          350        165   3693         11.5   70      1
3  18         8          318        150   3436         11.0   70      1
4  16         8          304        150   3433         12.0   70      1
5  17         8          302        140   3449         10.5   70      1
6  15         8          429        198   4341         10.0   70      1
                       name
1 chevrolet chevelle malibu
2         buick skylark 320
3        plymouth satellite
4             amc rebel sst
5               ford torino
6          ford galaxie 500

mpg Vs horsepower

Berdasarkan scatter Plot, sebaran data tidak linier, secara visual terdapat bukti untuk hubungan non-linier. Lebih lanjut, hubungan ini ada diuji secara empiris melalui model regresi non-linier, yaitu polinomial, fungsi tangga dan natural cubic spline.

Scatter Plot mpg Vs horsepower

ggplot(Auto,aes(x=horsepower, y=mpg)) +
                 geom_point(alpha=0.55, color="blue") + 
                 theme_bw() 

Regresi Linier

set.seed(1501211036)
cross_val <- vfold_cv(Auto,v=10,strata = "mpg")
metric_linear <- map_dfr(cross_val$splits,
    function(x){
mod <- lm(mpg ~ horsepower,
         data=Auto[x$in_id,])
pred <- predict(mod,
               newdata=Auto[-x$in_id,])
truth <- Auto[-x$in_id,]$mpg

rmse <- mlr3measures::rmse(truth = truth,
                           response = pred
                           )
mae <- mlr3measures::mae(truth = truth,
                           response = pred
                           )
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
metric_linear
# A tibble: 10 x 2
    rmse   mae
   <dbl> <dbl>
 1  4.92  3.98
 2  5.31  3.77
 3  4.44  3.44
 4  4.55  3.71
 5  5.43  4.47
 6  4.75  3.75
 7  5.25  4.37
 8  4.83  3.92
 9  5.16  3.75
10  4.34  3.18
mean_metric_linear <- colMeans(metric_linear)
mean_metric_linear
    rmse      mae 
4.898189 3.834290 

Regresi Polinomial

polinomial = function(dataset){
  modelterbaik = c()
  derajat = 1:10
  for (i in derajat){
    set.seed(1501211036)
    cross_val = vfold_cv(dataset,v=10,strata = "mpg") #rsample
    metric_poly = map_dfr(cross_val$splits, #tidiverse Iterasi pada suatu fungsi (Output data.frame) 
                           function(x){  
                             mod = lm(mpg ~ poly(horsepower,i,raw = T), 
                                       data=dataset[x$in_id,]) #in_id a/ data training
                             pred = predict(mod, 
                                             newdata=dataset[-x$in_id,])
                             # nilai prediksi untuk data testing
                             truth = dataset[-x$in_id,]$mpg
                             rmse = mlr3measures::rmse(truth = truth,
                                                        response = pred)
                             mae = mlr3measures::mae(truth = truth,
                                                      response = pred)
                             metric = c(rmse,mae) 
                             names(metric) = c("rmse","mae")
                             return(metric)
                           }
    )
    rata_rata = colMeans(metric_poly)
    modelterbaik = c(modelterbaik,rata_rata)
  }
  M = matrix(modelterbaik, byrow = T,ncol = 2)
  colnames(M) = c("rmse","mae")
  bestmpoly = as.data.frame(cbind(derajat,M))
  bestmpoly = arrange(bestmpoly, bestmpoly$rmse)
  return(bestmpoly)
}
bestmpoly = polinomial(Auto)
bestmpoly
   derajat     rmse      mae
1        7 4.328586 3.239488
2        5 4.343273 3.248301
3        6 4.345171 3.262537
4        2 4.347103 3.263367
5        8 4.351402 3.260481
6        3 4.354304 3.271791
7        4 4.374685 3.290646
8        9 4.380581 3.269677
9       10 4.415139 3.287987
10       1 4.898189 3.834290

Secara empiris regresi polinomial dengan derajat 7 memiliki rmse maupun mae terkecil. Namun, di sini dipilih regresi polinomial dengan derajat 2 karena jika lebih dari dejarat 4, akan ada indikasi maalasah overfitting.

Hal ini karena meskipun memiliki rmse atau mae yang kecil, polinomial dengan derajat lebih besar dari 4 cenderung memiliki pola yang tidak teratur, khususnya pada ujung-ujung kurvanya.

Fungsi Tangga (Piecewise Constant)

ftangga = function(dataset){
  set.seed(1501211036)
  cross_val = vfold_cv(dataset,v=10,strata = "mpg")
  breaks = 2:15
  best_tangga = map_dfr(breaks, function(i){
    metric_tangga = map_dfr(cross_val$splits,
          function(x){
                  training = dataset[x$in_id,]
                  training$horsepower = cut(training$horsepower,i) #Transformasi jadi factor
          
                  mod = lm(mpg ~ horsepower,data=training)
                  
                  #Convert ke numeric lagi
                  labs_x = levels(mod$model[,2])
                               
                  labs_x_breaks = cbind(lower = as.numeric( sub("\\((.+),.*", 
                                                              "\\1", labs_x) ),
                                    upper = as.numeric( sub("[^,]*,([^]]*)\\]", 
                                                       "\\1", labs_x) ))
                  #Testing
                  testing = dataset[-x$in_id,]
                  horsepower_new = cut(testing$horsepower,c(labs_x_breaks[1,1],
                                                             labs_x_breaks[,2]))
                  pred = predict(mod,
                          newdata=list(horsepower=horsepower_new))
                  truth = testing$mpg
                               
                  data_eval = na.omit(data.frame(truth,pred))
                               
                  rmse = mlr3measures::rmse(truth = data_eval$truth,
                                           response = data_eval$pred)
                  mae = mlr3measures::mae(truth = data_eval$truth,
                                             response = data_eval$pred)
                  metric = c(rmse,mae)
                  names(metric) = c("rmse","mae")
                  return(metric)
                             }
    )
    colMeans(metric_tangga) # menghitung rata-rata untuk 10 folds
  }
  )
  best_tangga = cbind(breaks=breaks,best_tangga) # menampilkan hasil all breaks
  best_tangga = arrange(best_tangga, best_tangga$rmse)
  return(best_tangga)
}
tangga = ftangga(Auto)
tangga
   breaks     rmse      mae
1      15 4.343245 3.291338
2      14 4.401883 3.257563
3       8 4.436932 3.405170
4      12 4.451700 3.314559
5       7 4.499764 3.359511
6      13 4.511050 3.357282
7       9 4.526533 3.447286
8      11 4.538542 3.350069
9      10 4.578024 3.435559
10      6 4.655029 3.532977
11      5 4.705711 3.568395
12      4 4.990599 3.790641
13      3 5.771438 4.520390
14      2 6.151124 4.792086

Berdasarkan rmse terkecil banyaknya interval yang terbaik adalah 15 sedangkan berdasarkan mae terkecil banyaknya interval terbaik adalah 14.

Natural Cubic Spline

ncspline = function(dataset){
  set.seed(1501211036)
  cross.val = vfold_cv(dataset,v=10,strata = "mpg")
  df = 3:20
  best.spline3 = map_dfr(df, function(i){
    metric.spline3 = map_dfr(cross.val$splits,
                              function(x){
                                mod = lm(mpg ~ splines::ns(horsepower,df=i),data=dataset[x$in_id,])
                                pred = predict(mod,newdata=dataset[-x$in_id,])
                                truth = dataset[-x$in_id,]$mpg
                                rmse = mlr3measures::rmse(truth = truth,
                                                          response = pred)
                                mae = mlr3measures::mae(truth = truth,
                                                        response = pred)
                                metric = c(rmse,mae)
                                names(metric) = c("rmse","mae")
                                return(metric)
                              }
    )
    metric.spline3
    mean.metric.spline3 = colMeans(metric.spline3) #rata-rata untuk 10 folds
    mean.metric.spline3
  }
  )
  best.spline3 = cbind(df=df,best.spline3)
  basis=data.frame(rbind(best.spline3 %>% slice_min(rmse), #berdasarkan rmse
                          best.spline3 %>% slice_min(mae))) #berdasarkan mae

  df=basis[,"df"]
  knot1 = attr(ns(dataset$horsepower, df=df[1]),"knots")
  knot2 = attr(ns(dataset$horsepower, df=df[2]),"knots")
  return(list(basis=basis,knot1=knot1,knot2=knot2))
}
basisAuto = ncspline(Auto)
basisAuto
$basis
  df     rmse      mae
1  7 4.320925 3.230518
2 10 4.323890 3.217920

$knot1
14.28571% 28.57143% 42.85714% 57.14286% 71.42857% 85.71429% 
 68.85714  79.71429  89.57143  98.00000 113.57143 150.00000 

$knot2
  10%   20%   30%   40%   50%   60%   70%   80%   90% 
 67.0  72.0  80.0  88.0  93.5 100.0 110.0 140.0 157.7 

Berdasarkan hasil di atas model natural spline terbaik memiliki 7 fungsi basis atau 6 knots

Perbandingan Model

a = as.matrix(bestmpoly[4,])
b = as.matrix(tangga[1,])
c = as.matrix(basisAuto$basis[1,])
d = t(as.matrix(mean_metric_linear))
perbandingan.model = rbind(a[,2:3],b[,2:3],c[,2:3],d)
rownames(perbandingan.model) = c("Polinomial Derajat 2","Fungsi Tangga 15 Interval",
                                  "Natural Cubic Spline 6 Knot", "Model Linier")
as.data.frame(perbandingan.model)
                                rmse      mae
Polinomial Derajat 2        4.347103 3.263367
Fungsi Tangga 15 Interval   4.343245 3.291338
Natural Cubic Spline 6 Knot 4.320925 3.230518
Model Linier                4.898189 3.834290

Berdasarkan data empiris perbandingan di atas, model terbaik yang dapat digunakan untuk memodelkan mpg Vs horsepower adalah Natural Cubic Spline 6 Knot

mpg Vs displacement

Berdasarkan scatter Plot, sebaran data tidak linier, secara visual terdapat bukti untuk hubungan non-linier. Lebih lanjut, hubungan ini ada diuji secara empiris melalui model regresi non-linier, yaitu polinomial, fungsi tangga dan natural cubic spline.

Scatter Plot mpg Vs displacement

ggplot(Auto,aes(x=displacement, y=mpg)) +
                 geom_point(alpha=0.55, color="blue") + 
                 theme_bw() 

Regresi Linier

set.seed(1501211036)
cross_val <- vfold_cv(Auto,v=10,strata = "mpg")
metric_linear <- map_dfr(cross_val$splits,
    function(x){
mod <- lm(mpg ~ displacement,
         data=Auto[x$in_id,])
pred <- predict(mod,
               newdata=Auto[-x$in_id,])
truth <- Auto[-x$in_id,]$mpg

rmse <- mlr3measures::rmse(truth = truth,
                           response = pred
                           )
mae <- mlr3measures::mae(truth = truth,
                           response = pred
                           )
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
metric_linear
# A tibble: 10 x 2
    rmse   mae
   <dbl> <dbl>
 1  4.79  3.72
 2  4.80  3.39
 3  4.22  3.01
 4  4.18  3.53
 5  4.90  3.51
 6  5.01  3.96
 7  4.97  4.04
 8  4.05  3.29
 9  4.68  3.48
10  4.64  3.20
mean_metric_linear <- colMeans(metric_linear)
mean_metric_linear
    rmse      mae 
4.624689 3.512835 

Regresi Polinomial

polinomial = function(dataset){
  modelterbaik = c()
  derajat = 1:10
  for (i in derajat){
    set.seed(1501211036)
    cross_val = vfold_cv(dataset,v=10,strata = "mpg") #rsample
    metric_poly = map_dfr(cross_val$splits, #tidiverse Iterasi pada suatu fungsi (Output data.frame) 
                           function(x){  
                             mod = lm(mpg ~ poly(displacement,i,raw = T), 
                                       data=dataset[x$in_id,]) #in_id a/ data training
                             pred = predict(mod, 
                                             newdata=dataset[-x$in_id,])
                             # nilai prediksi untuk data testing
                             truth = dataset[-x$in_id,]$mpg
                             rmse = mlr3measures::rmse(truth = truth,
                                                        response = pred)
                             mae = mlr3measures::mae(truth = truth,
                                                      response = pred)
                             metric = c(rmse,mae) 
                             names(metric) = c("rmse","mae")
                             return(metric)
                           }
    )
    rata_rata = colMeans(metric_poly)
    modelterbaik = c(modelterbaik,rata_rata)
  }
  M = matrix(modelterbaik, byrow = T,ncol = 2)
  colnames(M) = c("rmse","mae")
  bestmpoly = as.data.frame(cbind(derajat,M))
  bestmpoly = arrange(bestmpoly, bestmpoly$rmse)
  return(bestmpoly)
}
bestmpoly = polinomial(Auto)
bestmpoly
   derajat     rmse      mae
1       10 4.162502 3.119952
2        9 4.203314 3.147555
3        8 4.253503 3.209042
4        7 4.299290 3.202670
5        6 4.353357 3.210716
6        2 4.356654 3.167928
7        3 4.358329 3.175692
8        4 4.368521 3.180040
9        5 4.371482 3.185649
10       1 4.624689 3.512835

Secara empiris regresi polinomial dengan derajat 7 memiliki rmse maupun mae terkecil. Namun, di sini dipilih regresi polinomial dengan derajat 2 karena jika lebih dari dejarat 4, akan ada indikasi maalasah overfitting.

Hal ini karena meskipun memiliki rmse atau mae yang kecil, polinomial dengan derajat lebih besar dari 4 cenderung memiliki pola yang tidak teratur, khususnya pada ujung-ujung kurvanya.

Fungsi Tangga (Piecewise Constant)

ftangga = function(dataset){
  set.seed(1501211036)
  cross_val = vfold_cv(dataset,v=10,strata = "mpg")
  breaks = 2:10
  best_tangga = map_dfr(breaks, function(i){
    metric_tangga = map_dfr(cross_val$splits,
          function(x){
                  training = dataset[x$in_id,]
                  training$displacement = cut(training$displacement,i) #Transformasi jadi factor
          
                  mod = lm(mpg ~ displacement,data=training)
                  
                  #Convert ke numeric lagi
                  labs_x = levels(mod$model[,2])
                               
                  labs_x_breaks = cbind(lower = as.numeric( sub("\\((.+),.*", 
                                                              "\\1", labs_x) ),
                                    upper = as.numeric( sub("[^,]*,([^]]*)\\]", 
                                                       "\\1", labs_x) ))
                  #Testing
                  testing = dataset[-x$in_id,]
                  displacement_new = cut(testing$displacement,c(labs_x_breaks[1,1],
                                                             labs_x_breaks[,2]))
                  pred = predict(mod,
                          newdata=list(displacement=displacement_new))
                  truth = testing$mpg
                               
                  data_eval = na.omit(data.frame(truth,pred))
                               
                  rmse = mlr3measures::rmse(truth = data_eval$truth,
                                           response = data_eval$pred)
                  mae = mlr3measures::mae(truth = data_eval$truth,
                                             response = data_eval$pred)
                  metric = c(rmse,mae)
                  names(metric) = c("rmse","mae")
                  return(metric)
                             }
    )
    colMeans(metric_tangga) # menghitung rata-rata untuk 10 folds
  }
  )
  best_tangga = cbind(breaks=breaks,best_tangga) # menampilkan hasil all breaks
  best_tangga = arrange(best_tangga, best_tangga$rmse)
  return(best_tangga)
}
tangga = ftangga(Auto)
tangga
  breaks     rmse      mae
1      9 4.237568 3.093370
2     10 4.277541 3.132205
3      8 4.393760 3.236799
4      7 4.614666 3.384960
5      6 4.641657 3.409467
6      5 4.715635 3.508383
7      4 4.854343 3.630170
8      3 4.941093 3.679058
9      2 5.932837 4.636662

Berdasarkan rmse dan mae terkecil banyaknya interval yang terbaik adalah 9.

Natural Cubic Spline

ncspline = function(dataset){
  set.seed(1501211036)
  cross.val = vfold_cv(dataset,v=10,strata = "mpg")
  df = 3:20
  best.spline3 = map_dfr(df, function(i){
    metric.spline3 = map_dfr(cross.val$splits,
                              function(x){
                                mod = lm(mpg ~ splines::ns(displacement,df=i),data=dataset[x$in_id,])
                                pred = predict(mod,newdata=dataset[-x$in_id,])
                                truth = dataset[-x$in_id,]$mpg
                                rmse = mlr3measures::rmse(truth = truth,
                                                          response = pred)
                                mae = mlr3measures::mae(truth = truth,
                                                        response = pred)
                                metric = c(rmse,mae)
                                names(metric) = c("rmse","mae")
                                return(metric)
                              }
    )
    metric.spline3
    mean.metric.spline3 = colMeans(metric.spline3) #rata-rata untuk 10 folds
    mean.metric.spline3
  }
  )
  best.spline3 = cbind(df=df,best.spline3)
  basis=data.frame(rbind(best.spline3 %>% slice_min(rmse), #berdasarkan rmse
                          best.spline3 %>% slice_min(mae))) #berdasarkan mae

  df=basis[,"df"]
  knot1 = attr(ns(dataset$displacement, df=df[1]),"knots")
  knot2 = attr(ns(dataset$displacement, df=df[2]),"knots")
  return(list(basis=basis,knot1=knot1,knot2=knot2))
}
basisAuto = ncspline(Auto)
basisAuto
$basis
  df    rmse      mae
1 12 4.12221 3.064286
2 12 4.12221 3.064286

$knot1
8.333333% 16.66667%       25% 33.33333% 41.66667%       50% 58.33333% 66.66667% 
  89.0000   97.0000  105.0000  119.0000  130.9167  151.0000  200.0000  232.0000 
      75% 83.33333% 91.66667% 
 275.7500  318.0000  351.0000 

$knot2
8.333333% 16.66667%       25% 33.33333% 41.66667%       50% 58.33333% 66.66667% 
  89.0000   97.0000  105.0000  119.0000  130.9167  151.0000  200.0000  232.0000 
      75% 83.33333% 91.66667% 
 275.7500  318.0000  351.0000 

Berdasarkan hasil di atas model natural spline terbaik memiliki 12 fungsi basis atau 11 knots

Perbandingan Model

a = as.matrix(bestmpoly[6,])
b = as.matrix(tangga[1,])
c = as.matrix(basisAuto$basis[1,])
d = t(as.matrix(mean_metric_linear))
perbandingan.model = rbind(a[,2:3],b[,2:3],c[,2:3],d)
rownames(perbandingan.model) = c("Polinomial Derajat 2","Fungsi Tangga 9 Interval",
                                  "Natural Cubic Spline 11 Knot", "Model Linier")
as.data.frame(perbandingan.model)
                                 rmse      mae
Polinomial Derajat 2         4.356654 3.167928
Fungsi Tangga 9 Interval     4.237568 3.093370
Natural Cubic Spline 11 Knot 4.122210 3.064286
Model Linier                 4.624689 3.512835

Berdasarkan data empiris perbandingan di atas, model terbaik yang dapat digunakan untuk memodelkan mpg Vs displacement adalah Natural Cubic Spline 11 Knot

mpg Vs weight

Berdasarkan scatter Plot, sebaran data tidak linier, secara visual terdapat bukti untuk hubungan non-linier. Lebih lanjut, hubungan ini ada diuji secara empiris melalui model regresi non-linier, yaitu polinomial, fungsi tangga dan natural cubic spline.

Scatter Plot mpg Vs weight

head(Auto)
  mpg cylinders displacement horsepower weight acceleration year origin
1  18         8          307        130   3504         12.0   70      1
2  15         8          350        165   3693         11.5   70      1
3  18         8          318        150   3436         11.0   70      1
4  16         8          304        150   3433         12.0   70      1
5  17         8          302        140   3449         10.5   70      1
6  15         8          429        198   4341         10.0   70      1
                       name
1 chevrolet chevelle malibu
2         buick skylark 320
3        plymouth satellite
4             amc rebel sst
5               ford torino
6          ford galaxie 500
ggplot(Auto,aes(x=weight, y=mpg)) +
                 geom_point(alpha=0.55, color="blue") + 
                 theme_bw() 

Regresi Linier

set.seed(1501211036)
cross_val <- vfold_cv(Auto,v=10,strata = "mpg")
metric_linear <- map_dfr(cross_val$splits,
    function(x){
mod <- lm(mpg ~ weight,
         data=Auto[x$in_id,])
pred <- predict(mod,
               newdata=Auto[-x$in_id,])
truth <- Auto[-x$in_id,]$mpg

rmse <- mlr3measures::rmse(truth = truth,
                           response = pred
                           )
mae <- mlr3measures::mae(truth = truth,
                           response = pred
                           )
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
metric_linear
# A tibble: 10 x 2
    rmse   mae
   <dbl> <dbl>
 1  4.48  3.56
 2  4.46  3.21
 3  4.25  3.29
 4  3.49  2.69
 5  4.60  3.34
 6  4.74  3.65
 7  4.27  3.48
 8  4.14  3.33
 9  4.46  3.39
10  4.37  2.98
mean_metric_linear <- colMeans(metric_linear)
mean_metric_linear
    rmse      mae 
4.326813 3.293198 

Regresi Polinomial

polinomial = function(dataset){
  modelterbaik = c()
  derajat = 1:10
  for (i in derajat){
    set.seed(1501211036)
    cross_val = vfold_cv(dataset,v=10,strata = "mpg") #rsample
    metric_poly = map_dfr(cross_val$splits, #tidiverse Iterasi pada suatu fungsi (Output data.frame) 
                           function(x){  
                             mod = lm(mpg ~ poly(weight,i,raw = T), 
                                       data=dataset[x$in_id,]) #in_id a/ data training
                             pred = predict(mod, 
                                             newdata=dataset[-x$in_id,])
                             # nilai prediksi untuk data testing
                             truth = dataset[-x$in_id,]$mpg
                             rmse = mlr3measures::rmse(truth = truth,
                                                        response = pred)
                             mae = mlr3measures::mae(truth = truth,
                                                      response = pred)
                             metric = c(rmse,mae) 
                             names(metric) = c("rmse","mae")
                             return(metric)
                           }
    )
    rata_rata = colMeans(metric_poly)
    modelterbaik = c(modelterbaik,rata_rata)
  }
  M = matrix(modelterbaik, byrow = T,ncol = 2)
  colnames(M) = c("rmse","mae")
  bestmpoly = as.data.frame(cbind(derajat,M))
  bestmpoly = arrange(bestmpoly, bestmpoly$rmse)
  return(bestmpoly)
}
bestmpoly = polinomial(Auto)
bestmpoly
   derajat     rmse      mae
1        2 4.171196 3.073073
2        3 4.178035 3.081339
3        4 4.185834 3.078166
4        5 4.186407 3.085107
5        6 4.191242 3.106216
6        7 4.203473 3.118155
7        8 4.217516 3.122607
8        9 4.258438 3.160485
9       10 4.292933 3.159856
10       1 4.326813 3.293198

Secara empiris regresi polinomial dengan derajat 72 memiliki rmse maupun mae terkecil.

Fungsi Tangga (Piecewise Constant)

ftangga = function(dataset){
  set.seed(1501211036)
  cross_val = vfold_cv(dataset,v=10,strata = "mpg")
  breaks = 2:10
  best_tangga = map_dfr(breaks, function(i){
    metric_tangga = map_dfr(cross_val$splits,
          function(x){
                  training = dataset[x$in_id,]
                  training$weight = cut(training$weight,i) #Transformasi jadi factor
          
                  mod = lm(mpg ~ weight,data=training)
                  
                  #Convert ke numeric lagi
                  labs_x = levels(mod$model[,2])
                               
                  labs_x_breaks = cbind(lower = as.numeric( sub("\\((.+),.*", 
                                                              "\\1", labs_x) ),
                                    upper = as.numeric( sub("[^,]*,([^]]*)\\]", 
                                                       "\\1", labs_x) ))
                  #Testing
                  testing = dataset[-x$in_id,]
                  weight_new = cut(testing$weight,c(labs_x_breaks[1,1],
                                                             labs_x_breaks[,2]))
                  pred = predict(mod,
                          newdata=list(weight=weight_new))
                  truth = testing$mpg
                               
                  data_eval = na.omit(data.frame(truth,pred))
                               
                  rmse = mlr3measures::rmse(truth = data_eval$truth,
                                           response = data_eval$pred)
                  mae = mlr3measures::mae(truth = data_eval$truth,
                                             response = data_eval$pred)
                  metric = c(rmse,mae)
                  names(metric) = c("rmse","mae")
                  return(metric)
                             }
    )
    colMeans(metric_tangga) # menghitung rata-rata untuk 10 folds
  }
  )
  best_tangga = cbind(breaks=breaks,best_tangga) # menampilkan hasil all breaks
  best_tangga = arrange(best_tangga, best_tangga$rmse)
  return(best_tangga)
}
tangga = ftangga(Auto)
tangga
  breaks     rmse      mae
1      6 4.225604 3.126638
2      7 4.312179 3.174580
3      9 4.317814 3.153454
4     10 4.330555 3.176004
5      8 4.402451 3.226718
6      5 4.415531 3.261200
7      4 4.648455 3.561651
8      3 4.876442 3.661495
9      2 5.540988 4.297736

Berdasarkan rmse dan mae terkecil banyaknya interval yang terbaik adalah 6.

Natural Cubic Spline

ncspline = function(dataset){
  set.seed(1501211036)
  cross.val = vfold_cv(dataset,v=10,strata = "mpg")
  df = 3:20
  best.spline3 = map_dfr(df, function(i){
    metric.spline3 = map_dfr(cross.val$splits,
                              function(x){
                                mod = lm(mpg ~ splines::ns(weight,df=i),data=dataset[x$in_id,])
                                pred = predict(mod,newdata=dataset[-x$in_id,])
                                truth = dataset[-x$in_id,]$mpg
                                rmse = mlr3measures::rmse(truth = truth,
                                                          response = pred)
                                mae = mlr3measures::mae(truth = truth,
                                                        response = pred)
                                metric = c(rmse,mae)
                                names(metric) = c("rmse","mae")
                                return(metric)
                              }
    )
    metric.spline3
    mean.metric.spline3 = colMeans(metric.spline3) #rata-rata untuk 10 folds
    mean.metric.spline3
  }
  )
  best.spline3 = cbind(df=df,best.spline3)
  basis=data.frame(rbind(best.spline3 %>% slice_min(rmse), #berdasarkan rmse
                          best.spline3 %>% slice_min(mae))) #berdasarkan mae

  df=basis[,"df"]
  knot1 = attr(ns(dataset$weight, df=df[1]),"knots")
  knot2 = attr(ns(dataset$weight, df=df[2]),"knots")
  return(list(basis=basis,knot1=knot1,knot2=knot2))
}
basisAuto = ncspline(Auto)
basisAuto
$basis
  df     rmse      mae
1 11 4.177159 3.071718
2 11 4.177159 3.071718

$knot1
9.090909% 18.18182% 27.27273% 36.36364% 45.45455% 54.54545% 63.63636% 72.72727% 
 1985.000  2130.000  2264.636  2491.818  2676.364  2936.273  3228.364  3521.818 
81.81818% 90.90909% 
 3896.545  4317.909 

$knot2
9.090909% 18.18182% 27.27273% 36.36364% 45.45455% 54.54545% 63.63636% 72.72727% 
 1985.000  2130.000  2264.636  2491.818  2676.364  2936.273  3228.364  3521.818 
81.81818% 90.90909% 
 3896.545  4317.909 

Berdasarkan hasil di atas model natural spline terbaik memiliki 11 fungsi basis atau 10 knots

Perbandingan Model

a = as.matrix(bestmpoly[1,])
b = as.matrix(tangga[1,])
c = as.matrix(basisAuto$basis[1,])
d = t(as.matrix(mean_metric_linear))
perbandingan.model = rbind(a[,2:3],b[,2:3],c[,2:3],d)
rownames(perbandingan.model) = c("Polinomial Derajat 2","Fungsi Tangga 6 Interval",
                                  "Natural Cubic Spline 10 Knot", "Model Linier")
as.data.frame(perbandingan.model)
                                 rmse      mae
Polinomial Derajat 2         4.171196 3.073073
Fungsi Tangga 6 Interval     4.225604 3.126638
Natural Cubic Spline 10 Knot 4.177159 3.071718
Model Linier                 4.326813 3.293198

Berdasarkan data empiris perbandingan di atas, model terbaik yang dapat digunakan untuk memodelkan mpg Vs weight adalah Natural Cubic Spline 10 Knot