Tugas Kuliah 8

Deri Siswara G1501211036

2021-10-26

Soal

Lakukan cross validation untuk menghasilkan pemodelan mpg vs horsepower optimal menggunakan 3 metode berikut, kemudian bandingkan hasilnya:

Reg polinomial

Piecewise constant

Natural cubil splines

Lakukan hal di atas untuk masing-masing sub-set data berdasarkan asal negara produsennya origin (Amerika, Eropa, Jepang).

Plot model terbaik dalam satu frame.

Berikan ulasan terhadap hasil Anda.

library(tidyverse)
library(splines)
library(ISLR)
library(ggplot2)
library(rsample)
head(Auto)
  mpg cylinders displacement horsepower weight acceleration year origin
1  18         8          307        130   3504         12.0   70      1
2  15         8          350        165   3693         11.5   70      1
3  18         8          318        150   3436         11.0   70      1
4  16         8          304        150   3433         12.0   70      1
5  17         8          302        140   3449         10.5   70      1
6  15         8          429        198   4341         10.0   70      1
                       name
1 chevrolet chevelle malibu
2         buick skylark 320
3        plymouth satellite
4             amc rebel sst
5               ford torino
6          ford galaxie 500

Pada tugas ini hanya digunakan 3 variabel yaitu mpg, horsepower, dan origin.

mpg : miles per gallon

horsepower: Engine horsepower

origin: Origin of car (1. American, 2. European, 3. Japanese)

Scatter Plot mpg Vs horsepower

ggplot(Auto,aes(x=horsepower, y=mpg)) +
                 geom_point(alpha=0.55, color="blue") + 
                 theme_bw() 

Regresi Linier

model_linear=lm(mpg~horsepower,data=Auto)
ggplot(Auto,aes(x=horsepower, y=mpg)) +
                 geom_point(alpha=0.50, color="blue") + 
        
   stat_smooth(method = "lm", 
               formula = y~x,lty = 1,
               col = "red",se = F)+
  theme_bw() 

Hubungan linear tidak dapat merepresentrasikan sebaran data dengan baik. Nilai horsepower yang lebih tinggi lagi dapat berakibat mpg bernilai nol atau negatif. Hal Ini tidak mungkin. Oleh karena itu, beberapa model yang dapat digunakan untuk masalah di atas yaitu Regresi polinomial, Piecewise constant, atau Natural cubic splines.

Regresi Polinomial

Empiris

polinomial = function(dataset){
  modelterbaik = c()
  derajat = 1:10
  for (i in derajat){
    set.seed(1501211036)
    cross_val = vfold_cv(dataset,v=10,strata = "mpg") #rsample
    metric_poly = map_dfr(cross_val$splits, #tidiverse Iterasi pada suatu fungsi (Output data.frame) 
                           function(x){  
                             mod = lm(mpg ~ poly(horsepower,i,raw = T), 
                                       data=dataset[x$in_id,]) #in_id a/ data training
                             pred = predict(mod, 
                                             newdata=dataset[-x$in_id,])
                             # nilai prediksi untuk data testing
                             truth = dataset[-x$in_id,]$mpg
                             rmse = mlr3measures::rmse(truth = truth,
                                                        response = pred)
                             mae = mlr3measures::mae(truth = truth,
                                                      response = pred)
                             metric = c(rmse,mae) 
                             names(metric) = c("rmse","mae")
                             return(metric)
                           }
    )
    rata_rata = colMeans(metric_poly)
    modelterbaik = c(modelterbaik,rata_rata)
  }
  M = matrix(modelterbaik, byrow = T,ncol = 2)
  colnames(M) = c("rmse","mae")
  bestmpoly = as.data.frame(cbind(derajat,M))
  bestmpoly = arrange(bestmpoly, bestmpoly$rmse)
  return(bestmpoly)
}
bestmpoly = polinomial(Auto)
bestmpoly
   derajat     rmse      mae
1        7 4.328586 3.239488
2        5 4.343273 3.248301
3        6 4.345171 3.262537
4        2 4.347103 3.263367
5        8 4.351402 3.260481
6        3 4.354304 3.271791
7        4 4.374685 3.290646
8        9 4.380581 3.269677
9       10 4.415139 3.287987
10       1 4.898189 3.834290

Secara empiris regresi polinomial dengan derajat 7 memiliki rmse maupun mae terkecil. Namun, di sini dipilih regresi polinomial dengan derajat 2 karena jika lebih dariu dejarat 4, akan ada indikasi maalaahoverfitting.

Hal ini karena meskipun memiliki rmse atau mae yang kecil, polinomial dengan derajat lebih besar dari 4 cenderung memiliki pola yang tidak teratur, khususnya pada ujung-ujung kurvanya. Untuk lebih jelas, dapat dilihat kurva berikut.

Visual

ggplot(Auto,aes(x=horsepower, y=mpg)) + 
  geom_point(alpha=0.55, color="blue") +
  stat_smooth(method = "lm", 
               formula = y~poly(x,2,raw=T), 
               lty = 1, col = "red",se = F)+
  ggtitle("Derajat 2") +
  theme_bw()

ggplot(Auto,aes(x=horsepower, y=mpg)) + 
  geom_point(alpha=0.55, color="blue") +
  stat_smooth(method = "lm", 
               formula = y~poly(x,3,raw=T), 
               lty = 1, col = "red",se = F)+
  ggtitle("Derajat 3") +
  theme_bw()

ggplot(Auto,aes(x=horsepower, y=mpg)) + 
  geom_point(alpha=0.55, color="blue") +
  stat_smooth(method = "lm", 
               formula = y~poly(x,7,raw=T), 
               lty = 1, col = "red",se = F)+
  ggtitle("Derajat 7") +
  theme_bw()

ggplot(Auto,aes(x=horsepower, y=mpg)) + 
  geom_point(alpha=0.55, color="blue") +
  stat_smooth(method = "lm", 
               formula = y~poly(x,8,raw=T), 
               lty = 1, col = "red",se = F)+
  ggtitle("Derajat 8") +
  theme_bw()

Fungsi Tangga (Piecewise Constant)

Empiris

ftangga = function(dataset){
  set.seed(1501211036)
  cross_val = vfold_cv(dataset,v=10,strata = "mpg")
  breaks = 2:15
  best_tangga = map_dfr(breaks, function(i){
    metric_tangga = map_dfr(cross_val$splits,
          function(x){
                  training = dataset[x$in_id,]
                  training$horsepower = cut(training$horsepower,i) #Transformasi jadi factor
          
                  mod = lm(mpg ~ horsepower,data=training)
                  
                  #Convert ke numeric lagi
                  labs_x = levels(mod$model[,2])
                               
                  labs_x_breaks = cbind(lower = as.numeric( sub("\\((.+),.*", 
                                                              "\\1", labs_x) ),
                                    upper = as.numeric( sub("[^,]*,([^]]*)\\]", 
                                                       "\\1", labs_x) ))
                  #Testing
                  testing = dataset[-x$in_id,]
                  horsepower_new = cut(testing$horsepower,c(labs_x_breaks[1,1],
                                                             labs_x_breaks[,2]))
                  pred = predict(mod,
                          newdata=list(horsepower=horsepower_new))
                  truth = testing$mpg
                               
                  data_eval = na.omit(data.frame(truth,pred))
                               
                  rmse = mlr3measures::rmse(truth = data_eval$truth,
                                           response = data_eval$pred)
                  mae = mlr3measures::mae(truth = data_eval$truth,
                                             response = data_eval$pred)
                  metric = c(rmse,mae)
                  names(metric) = c("rmse","mae")
                  return(metric)
                             }
    )
    colMeans(metric_tangga) # menghitung rata-rata untuk 10 folds
  }
  )
  best_tangga = cbind(breaks=breaks,best_tangga) # menampilkan hasil all breaks
  best_tangga = arrange(best_tangga, best_tangga$rmse)
  return(best_tangga)
}
tangga = ftangga(Auto)
tangga
   breaks     rmse      mae
1      15 4.343245 3.291338
2      14 4.401883 3.257563
3       8 4.436932 3.405170
4      12 4.451700 3.314559
5       7 4.499764 3.359511
6      13 4.511050 3.357282
7       9 4.526533 3.447286
8      11 4.538542 3.350069
9      10 4.578024 3.435559
10      6 4.655029 3.532977
11      5 4.705711 3.568395
12      4 4.990599 3.790641
13      3 5.771438 4.520390
14      2 6.151124 4.792086

Berdasarkan rmse terkecil banyaknya interval yang terbaik adalah 15 sedangkan berdasarkan mae terkecil banyaknya interval terbaik adalah 14.

visual

ggplot(Auto,aes(x=horsepower, y=mpg)) +
                 geom_point(alpha=0.55, color="blue") +
  stat_smooth(method = "lm", 
               formula = y~cut(x,8), 
               lty = 1, col = "red",se = F)+
  ggtitle("Cut = 8") +
  theme_bw()

ggplot(Auto,aes(x=horsepower, y=mpg)) +
                 geom_point(alpha=0.55, color="blue") +
  stat_smooth(method = "lm", 
               formula = y~cut(x,12), 
               lty = 1, col = "red",se = F)+
  ggtitle("Cut = 12") +
  theme_bw()

ggplot(Auto,aes(x=horsepower, y=mpg)) +
                 geom_point(alpha=0.55, color="blue") +
  stat_smooth(method = "lm", 
               formula = y~cut(x,14), 
               lty = 1, col = "red",se = F)+
  ggtitle("Cut = 14") +
  theme_bw()

ggplot(Auto,aes(x=horsepower, y=mpg)) +
                 geom_point(alpha=0.55, color="blue") +
  stat_smooth(method = "lm", 
               formula = y~cut(x,15), 
               lty = 1, col = "red",se = F)+
  ggtitle("Cut = 15") +
  theme_bw()

Berdasarkan empiris dan visual, diputuskan model terbaik dengan interbal sebanyak 15.

Natural Cubic Spline

Empiris

Menentukan Banyaknya Fungsi Basis

ncspline = function(dataset){
  set.seed(1501211036)
  cross.val = vfold_cv(dataset,v=10,strata = "mpg")
  df = 3:20
  best.spline3 = map_dfr(df, function(i){
    metric.spline3 = map_dfr(cross.val$splits,
                              function(x){
                                mod = lm(mpg ~ splines::ns(horsepower,df=i),data=dataset[x$in_id,])
                                pred = predict(mod,newdata=dataset[-x$in_id,])
                                truth = dataset[-x$in_id,]$mpg
                                rmse = mlr3measures::rmse(truth = truth,
                                                          response = pred)
                                mae = mlr3measures::mae(truth = truth,
                                                        response = pred)
                                metric = c(rmse,mae)
                                names(metric) = c("rmse","mae")
                                return(metric)
                              }
    )
    metric.spline3
    mean.metric.spline3 = colMeans(metric.spline3) #rata-rata untuk 10 folds
    mean.metric.spline3
  }
  )
  best.spline3 = cbind(df=df,best.spline3)
  basis=data.frame(rbind(best.spline3 %>% slice_min(rmse), #berdasarkan rmse
                          best.spline3 %>% slice_min(mae))) #berdasarkan mae

  df=basis[,"df"]
  knot1 = attr(ns(dataset$horsepower, df=df[1]),"knots")
  knot2 = attr(ns(dataset$horsepower, df=df[2]),"knots")
  return(list(basis=basis,knot1=knot1,knot2=knot2))
}

basisAuto = ncspline(Auto)
basisAuto
$basis
  df     rmse      mae
1  7 4.320925 3.230518
2 10 4.323890 3.217920

$knot1
14.28571% 28.57143% 42.85714% 57.14286% 71.42857% 85.71429% 
 68.85714  79.71429  89.57143  98.00000 113.57143 150.00000 

$knot2
  10%   20%   30%   40%   50%   60%   70%   80%   90% 
 67.0  72.0  80.0  88.0  93.5 100.0 110.0 140.0 157.7 

Berdasarkan hasil di atas model natural spline terbaik memiliki 7 fungsi basis atau 6 knots

Visual

ggplot(Auto,aes(x=horsepower, y=mpg)) +
                 geom_point(alpha=0.55, color="blue")+
    stat_smooth(method = "lm", 
               formula = y~ns(x, knots = c(68.8, 79.7, 89.5, 98, 
                                           113.5, 150)), 
               lty = 1,col = "red",se=F)+
  ggtitle("knot = 6")+
  theme_bw()

ggplot(Auto,aes(x=horsepower, y=mpg)) +
                 geom_point(alpha=0.55, color="blue")+
    stat_smooth(method = "lm", 
               formula = y~ns(x, knots = c(67.0, 72.0,80.0, 88.0, 93.5, 
                                           100.0,110.0,140.0,157.7)), 
               lty = 1,col = "red",se=F)+
  ggtitle("knot = 9")+
  theme_bw()

Perbandingan Model

Empiris

a = as.matrix(bestmpoly[4,])
b = as.matrix(tangga[1,])
c = as.matrix(basisAuto$basis[1,])
perbandingan.model = rbind(a,b,c)
rownames(perbandingan.model) = c("Polinomial Derajat 2","Fungsi Tangga 15 Interval",
                                  "Natural Cubic Spline 6 Knot")
as.data.frame(perbandingan.model[,-1])
                                rmse      mae
Polinomial Derajat 2        4.347103 3.263367
Fungsi Tangga 15 Interval   4.343245 3.291338
Natural Cubic Spline 6 Knot 4.320925 3.230518

Visual

ggplot(Auto,aes(x=horsepower, y=mpg)) + 
  geom_point(alpha=0.55, color="blue") +
  stat_smooth(method = "lm", 
               formula = y~poly(x,2,raw=T), 
               lty = 1, col = "red",se = F)+
  ggtitle("Polinomial Derajat 2") +
  theme_bw()

ggplot(Auto,aes(x=horsepower, y=mpg)) +
                 geom_point(alpha=0.55, color="blue") +
  stat_smooth(method = "lm", 
               formula = y~cut(x,15), 
               lty = 1, col = "red",se = F)+
  ggtitle("Fungsi Tangga 15 Interval") +
  theme_bw()

ggplot(Auto,aes(x=horsepower, y=mpg)) +
                 geom_point(alpha=0.55, color="blue")+
    stat_smooth(method = "lm", 
               formula = y~ns(x, knots = c(68.8, 79.7, 89.5, 98, 
                                           113.5, 150)), 
               lty = 1,col = "red",se=F)+
  ggtitle("Natural Cubic Spline 6 Knot")+
  theme_bw()

Berdasarkan grafik Regresi Polinomial Derajat 2 terlihat lebih baik dibandingkan Natural Cubic Spline yang mana di ujungnya cenderung tidak stabil meskipun memiliki rmse dan mea yang lebih kecil. Sehingga dipilih Regresi Polinomial Derajat 2.

Subset-data: Amerika

Regresi Polinomial

polyUS = polinomial(Auto[Auto$origin==1,]) 
polyUS
   derajat     rmse      mae
1        3 3.772860 2.897686
2        7 3.773650 2.870675
3        2 3.776399 2.889935
4        5 3.776736 2.887701
5        4 3.788496 2.902062
6        8 3.788557 2.882283
7        6 3.802001 2.913044
8        9 3.864852 2.932634
9       10 3.902782 2.946496
10       1 4.197004 3.332211

Fungsi Tangga

ftUS = ftangga(Auto[Auto$origin==1,]) 
ftUS
   breaks     rmse      mae
1       9 3.796899 2.880849
2      12 3.814466 2.868998
3      11 3.861857 2.957272
4      13 3.865669 2.922589
5      14 3.909457 2.960533
6      10 3.920842 2.985516
7      15 3.945686 2.998896
8       8 3.958049 2.957988
9       5 4.056622 3.134471
10      7 4.095050 3.208085
11      4 4.118081 3.040479
12      6 4.131736 3.112986
13      3 4.732121 3.666719
14      2 4.895683 3.764649

Natural Cubic Spline

ncsUS = ncspline(Auto[Auto$origin==1,]) 
ncsUS
$basis
  df     rmse      mae
1 20 3.658698 2.762393
2 20 3.658698 2.762393

$knot1
   5%   10%   15%   20%   25%   30%   35%   40%   45%   50%   55%   60%   65% 
 70.0  78.4  83.6  85.0  88.0  90.0  92.0  99.2 100.0 105.0 110.0 122.0 138.6 
  70%   75%   80%   85%   90%   95% 
145.0 150.0 150.0 162.0 175.0 197.0 

$knot2
   5%   10%   15%   20%   25%   30%   35%   40%   45%   50%   55%   60%   65% 
 70.0  78.4  83.6  85.0  88.0  90.0  92.0  99.2 100.0 105.0 110.0 122.0 138.6 
  70%   75%   80%   85%   90%   95% 
145.0 150.0 150.0 162.0 175.0 197.0 

Perbandingan

a = as.matrix(polyUS[1,])
b = as.matrix(ftUS[1,])
c = as.matrix(ncsUS$basis[1,])
perbandingan.model = rbind(a,b,c)
rownames(perbandingan.model) = c("Polinomial Derajat 3","Fungsi Tangga 9 Interval",
                                  "Natural Cubic Spline 19 Knot")
as.data.frame(perbandingan.model[,-1])
                                 rmse      mae
Polinomial Derajat 3         3.772860 2.897686
Fungsi Tangga 9 Interval     3.796899 2.880849
Natural Cubic Spline 19 Knot 3.658698 2.762393

Visual

ggplot(Auto[Auto$origin==1,],aes(x=horsepower, y=mpg)) + 
  geom_point(alpha=0.55, color="blue") +
  stat_smooth(method = "lm", 
               formula = y~poly(x,3,raw=T), 
               lty = 1, col = "red",se = F)+
  ggtitle("Polinomial Derajat 3") +
  theme_bw()

ggplot(Auto[Auto$origin==1,],aes(x=horsepower, y=mpg)) +
                 geom_point(alpha=0.55, color="blue") +
  stat_smooth(method = "lm", 
               formula = y~cut(x,9), 
               lty = 1, col = "red",se = F)+
  ggtitle("Fungsi Tangga 9 Interval") +
  theme_bw()

ggplot(Auto[Auto$origin==1,],aes(x=horsepower, y=mpg)) +
                 geom_point(alpha=0.55, color="blue")+
    stat_smooth(method = "lm", 
               formula = y~ns(x, knots = c(70.0, 78.4, 83.6, 85.0, 88.0, 90.0, 
                                           92.0, 99.2, 100.0, 105.0, 110.0, 
                                           122.0, 138.6, 145.0, 150.0, 150.0,
                                           162.0, 175.0, 197.0 )), 
               lty = 1,col = "red",se=F)+
  ggtitle("Natural Cubic Spline 19 Knot")+
  theme_bw()

Subset-data: Eropa

Regresi Polinomial

polyEU = polinomial(Auto[Auto$origin==2,]) 
polyEU
   derajat      rmse      mae
1        1  4.844209 3.815299
2        2  4.943533 3.875699
3        3  4.989606 3.908970
4        4  5.061847 3.980574
5        5  5.427366 4.303481
6        8  6.620034 4.674904
7        7  6.640237 4.743828
8        6  6.688415 4.900119
9        9 16.371862 8.625718
10      10 16.371862 8.625718

Fungsi Tangga

ftangga2 = function(dataset){
  set.seed(1501211036)
  cross_val = vfold_cv(dataset,v=10,strata = "mpg")
  breaks = 2:8
  best_tangga = map_dfr(breaks, function(i){
    metric_tangga = map_dfr(cross_val$splits,
          function(x){
                  training = dataset[x$in_id,]
                  training$horsepower = cut(training$horsepower,i)
          
                  mod = lm(mpg ~ horsepower,data=training)
                  labs_x = levels(mod$model[,2])
                               
                  labs_x_breaks = cbind(lower = as.numeric( sub("\\((.+),.*", 
                                                              "\\1", labs_x) ),
                                    upper = as.numeric( sub("[^,]*,([^]]*)\\]", 
                                                       "\\1", labs_x) ))
                  testing = dataset[-x$in_id,]
                  horsepower_new = cut(testing$horsepower,c(labs_x_breaks[1,1],
                                                             labs_x_breaks[,2]))
                  pred = predict(mod,
                          newdata=list(horsepower=horsepower_new))
                  truth = testing$mpg
                               
                  data_eval = na.omit(data.frame(truth,pred))
                               
                  rmse = mlr3measures::rmse(truth = data_eval$truth,
                                           response = data_eval$pred)
                  mae = mlr3measures::mae(truth = data_eval$truth,
                                             response = data_eval$pred)
                  metric = c(rmse,mae)
                  names(metric) = c("rmse","mae")
                  return(metric)
                             }
    )
    colMeans(metric_tangga) # menghitung rata-rata untuk 10 folds
  }
  )
  best_tangga = cbind(breaks=breaks,best_tangga) # menampilkan hasil all breaks
  best_tangga = arrange(best_tangga, best_tangga$rmse)
  return(best_tangga)
}

ftEU = ftangga2(Auto[Auto$origin==2,]) 
ftEU
  breaks     rmse      mae
1      5 5.055263 3.970937
2      7 5.092098 3.963517
3      8 5.236106 4.159720
4      6 5.246888 4.092438
5      2 5.374675 4.264126
6      4 5.380581 4.335569
7      3 5.439791 4.390227

Natural Cubic Spline

ncsEU = ncspline(Auto[Auto$origin==2,]) 
ncsEU
$basis
  df     rmse      mae
1  3 5.001043 3.937553
2  3 5.001043 3.937553

$knot1
33.33333% 66.66667% 
       71        87 

$knot2
33.33333% 66.66667% 
       71        87 

Perbandingan

a = as.matrix(polyEU[1,])
b = as.matrix(ftEU[1,])
c = as.matrix(ncsEU$basis[1,])
perbandingan.model = rbind(a,b,c)
rownames(perbandingan.model) = c("Polinomial Derajat 2","Fungsi Tangga 5 Interval",
                                  "Natural Cubic Spline 2 Knot")
as.data.frame(perbandingan.model[,-1])
                                rmse      mae
Polinomial Derajat 2        4.844209 3.815299
Fungsi Tangga 5 Interval    5.055263 3.970937
Natural Cubic Spline 2 Knot 5.001043 3.937553

Visual

ggplot(Auto[Auto$origin==2,],aes(x=horsepower, y=mpg)) + 
  geom_point(alpha=0.55, color="blue") +
  stat_smooth(method = "lm", 
               formula = y~poly(x,3,raw=T), 
               lty = 1, col = "red",se = F)+
  ggtitle("Polinomial Derajat 2") +
  theme_bw()

ggplot(Auto[Auto$origin==2,],aes(x=horsepower, y=mpg)) +
                 geom_point(alpha=0.55, color="blue") +
  stat_smooth(method = "lm", 
               formula = y~cut(x,5), 
               lty = 1, col = "red",se = F)+
  ggtitle("Fungsi Tangga 5 Interval") +
  theme_bw()

ggplot(Auto[Auto$origin==2,],aes(x=horsepower, y=mpg)) +
                 geom_point(alpha=0.55, color="blue")+
    stat_smooth(method = "lm", 
               formula = y~ns(x, knots = c(71, 87  )), 
               lty = 1,col = "red",se=F)+
  ggtitle("Natural Cubic Spline 2 Knot")+
  theme_bw()

Subset-data: Jepang

Regresi Polinomial

polyJP = polinomial(Auto[Auto$origin==3,]) 
polyJP
   derajat      rmse      mae
1        3  3.955656 2.998039
2        6  3.987467 3.049899
3        4  4.069171 3.097866
4        1  4.247876 3.400810
5        2  4.352083 3.396524
6        5  4.598146 3.378687
7        7  5.009655 3.508123
8        8 13.618007 6.364834
9        9 22.733143 9.415607
10      10 22.733143 9.415607

Fungsi Tangga

ftangga3 = function(dataset){
  set.seed(1501211036)
  cross_val = vfold_cv(dataset,v=10,strata = "mpg")
  breaks = 2:5
  best_tangga = map_dfr(breaks, function(i){
    metric_tangga = map_dfr(cross_val$splits,
          function(x){
                  training = dataset[x$in_id,]
                  training$horsepower = cut(training$horsepower,i)
          
                  mod = lm(mpg ~ horsepower,data=training)
                  labs_x = levels(mod$model[,2])
                               
                  labs_x_breaks = cbind(lower = as.numeric( sub("\\((.+),.*", 
                                                              "\\1", labs_x) ),
                                    upper = as.numeric( sub("[^,]*,([^]]*)\\]", 
                                                       "\\1", labs_x) ))
                  testing = dataset[-x$in_id,]
                  horsepower_new = cut(testing$horsepower,c(labs_x_breaks[1,1],
                                                             labs_x_breaks[,2]))
                  pred = predict(mod,
                          newdata=list(horsepower=horsepower_new))
                  truth = testing$mpg
                               
                  data_eval = na.omit(data.frame(truth,pred))
                               
                  rmse = mlr3measures::rmse(truth = data_eval$truth,
                                           response = data_eval$pred)
                  mae = mlr3measures::mae(truth = data_eval$truth,
                                             response = data_eval$pred)
                  metric = c(rmse,mae)
                  names(metric) = c("rmse","mae")
                  return(metric)
                             }
    )
    colMeans(metric_tangga) # menghitung rata-rata untuk 10 folds
  }
  )
  best_tangga = cbind(breaks=breaks,best_tangga) # menampilkan hasil all breaks
  best_tangga = arrange(best_tangga, best_tangga$rmse)
  return(best_tangga)
}
ftJP = ftangga3(Auto[Auto$origin==3,]) 
ftJP
  breaks     rmse      mae
1      4 4.118326 3.267480
2      3 4.134643 3.314040
3      5 4.172699 3.365922
4      2 4.416488 3.322464

Natural Cubic Spline

ncsJP = ncspline(Auto[Auto$origin==3,]) 
ncsJP
$basis
  df     rmse      mae
1  7 3.993262 3.010772
2  7 3.993262 3.010772

$knot1
14.28571% 28.57143% 42.85714% 57.14286% 71.42857% 85.71429% 
 65.00000  67.00000  70.00000  75.00000  93.71429  97.00000 

$knot2
14.28571% 28.57143% 42.85714% 57.14286% 71.42857% 85.71429% 
 65.00000  67.00000  70.00000  75.00000  93.71429  97.00000 

Perbandingan

a = as.matrix(polyJP[1,])
b = as.matrix(ftJP[1,])
c = as.matrix(ncsJP$basis[1,])
perbandingan.model = rbind(a,b,c)
rownames(perbandingan.model) = c("Polinomial Derajat 3","Fungsi Tangga 4 Interval",
                                  "Natural Cubic Spline 6 Knot")
as.data.frame(perbandingan.model[,-1])
                                rmse      mae
Polinomial Derajat 3        3.955656 2.998039
Fungsi Tangga 4 Interval    4.118326 3.267480
Natural Cubic Spline 6 Knot 3.993262 3.010772

Visual

ggplot(Auto[Auto$origin==3,],aes(x=horsepower, y=mpg)) + 
  geom_point(alpha=0.55, color="blue") +
  stat_smooth(method = "lm", 
               formula = y~poly(x,3,raw=T), 
               lty = 1, col = "red",se = F)+
  ggtitle("Polinomial Derajat 3") +
  theme_bw()

ggplot(Auto[Auto$origin==3,],aes(x=horsepower, y=mpg)) +
                 geom_point(alpha=0.55, color="blue") +
  stat_smooth(method = "lm", 
               formula = y~cut(x,4), 
               lty = 1, col = "red",se = F)+
  ggtitle("Fungsi Tangga 4 Interval") +
  theme_bw()

ggplot(Auto[Auto$origin==3,],aes(x=horsepower, y=mpg)) +
                 geom_point(alpha=0.55, color="blue")+
    stat_smooth(method = "lm", 
               formula = y~ns(x, knots = c( 65,  67, 70, 75,  
                                            93.7, 97 )), 
               lty = 1,col = "red",se=F)+
  ggtitle("Natural Cubic Spline 6 Knot")+
  theme_bw()