Soal

Gunakan model-model non-linier yang sudah dipelajari pada data Auto dengan peubah respon mpg dan pilih tiga kolom untuk dijadikan peubah prediktor.

Library

library(ISLR)
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5     v purrr   0.3.4
## v tibble  3.1.4     v dplyr   1.0.7
## v tidyr   1.1.3     v stringr 1.4.0
## v readr   2.0.1     v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(splines)
library(rsample)
library(DT)
library(cowplot)

Eksplorasi Data

Data Auto

Data yang digunakan adalah data Auto dari package ISLR, terdiri dari 392 observasi dan 9 variabel. Namun, variabel yang akan digunakan kali ini hanya 4, yaitu:

Auto = Auto %>%  select(mpg, displacement, weight, horsepower)
datatable(Auto)

Visualisasi

ggplot(Auto,aes(x=displacement, y=mpg)) + 
  geom_point(alpha=0.55, color="black") +
  theme_bw()

Scatter plot ini diukur berdasarkan miles per gallon sebagai peubah respon dan displacement sebagai peubah prediktor.

ggplot(Auto,aes(x=displacement, y=mpg)) + 
  geom_point(alpha=0.55, color="black") +
  stat_smooth(method = "lm", 
              formula = y~x, 
              lty = 1, col = "blue",se = F)+
  theme_bw()

Jika kita amati scatter plotnya, pola data ini cenderung tidak linear.

ggplot(Auto,aes(x=horsepower, y=mpg)) +
  geom_point(alpha=0.55, color="black") + 
  theme_bw() 

Scatter plot ini diukur berdasarkan miles per gallon sebagai peubah respon dan horsepower sebagai peubah prediktor.

ggplot(Auto,aes(x=horsepower, y=mpg)) +
  geom_point(alpha=0.55, color="black") + 
  stat_smooth(method = "lm", 
              formula = y~x, 
              lty = 1, col = "blue",se = F)+
  theme_bw() 

Jika kita amati scatter plotnya, pola data ini cenderung tidak linear.

ggplot(Auto,aes(x=weight, y=mpg)) +
  geom_point(alpha=0.55, color="black") + 
  theme_bw() 

Scatter plot ini diukur berdasarkan miles per gallon sebagai peubah respon dan weight sebagai peubah prediktor.

ggplot(Auto,aes(x=weight, y=mpg)) +
  geom_point(alpha=0.55, color="black") + 
  stat_smooth(method = "lm", 
              formula = y~x, 
              lty = 1, col = "blue",se = F)+
  theme_bw() 

Jika kita amati scatter plotnya, pola data ini cenderung tidak linear.

Dalam ketiga kasus ini, akan digunakan 3 pendekatan yaitu regresi polinomial, fungsi tangga (piecewise constant), dan natural cubic splines.

mpg vs displacement

Polynomial

set.seed(1233)
cross_val <- vfold_cv(Auto,v=10)
degree <- 1:8

best_poly <- map_dfr(degree,function(i){
  metric_poly <- map_dfr(cross_val$splits,
                         function(x){
                           mod <- lm(mpg ~ poly(displacement,i,raw = T),
                                     data=Auto[x$in_id,])
                           pred <- predict(mod,
                                           newdata=Auto[-x$in_id,])
                           truth <- Auto[-x$in_id,]$mpg
                           
                           rmse <- mlr3measures::rmse(truth = truth,
                                                      response = pred
                           )
                           mae <- mlr3measures::mae(truth = truth,
                                                    response = pred
                           )
                           metric <- c(rmse,mae)
                           names(metric) <- c("rmse","mae")
                           return(metric)
                         }
  )
  metric_poly
  
  # menghitung rata-rata untuk 10 folds
  mean_metric_poly <- colMeans(metric_poly)
  
  mean_metric_poly
})

best_poly<- cbind(degree=degree,best_poly)

best_poly
##   degree     rmse      mae
## 1      1 4.637488 3.527734
## 2      2 4.350811 3.177790
## 3      3 4.349441 3.183971
## 4      4 4.355753 3.183893
## 5      5 4.359009 3.191090
## 6      6 4.331887 3.211607
## 7      7 4.281216 3.211554
## 8      8 4.231226 3.210366
#berdasarkan rmse
best_poly %>% slice_min(rmse)
##   degree     rmse      mae
## 1      8 4.231226 3.210366
#berdasarkan mae
best_poly %>% slice_min(mae)
##   degree     rmse     mae
## 1      2 4.350811 3.17779

Secara empiris, jika dilihat dari rmse terlihat bahwa polinomial ordo 8 memberikan nilai rmse terkecil. Sedangkan jika dilihat berdasarkan mae, polinomial ordo 2 memberikan nilai mae terkecil.

#degree=8
ggplot(Auto,aes(x=displacement, y=mpg)) + 
  geom_point(alpha=0.55, color="black") +
  stat_smooth(method = "lm", 
              formula = y~poly(x,8,raw=T), 
              lty = 1, col = "blue",se = F)+ ggtitle("Regresi Polinomial Ordo 8")+
  theme_bw()

Pada scatter plot regresi polinomial ordo 8, ujung-ujung regresinya liar.

#degree=2
ggplot(Auto,aes(x=displacement, y=mpg)) + 
  geom_point(alpha=0.55, color="black") +
  stat_smooth(method = "lm", 
              formula = y~poly(x,2,raw=T), 
              lty = 1, col = "blue",se = F)+ ggtitle("Regresi Polinomial Ordo 2")+
  theme_bw()

Piecewise Constant

set.seed(12433)
cross_val <- vfold_cv(Auto,v=10)

breaks <- 2:13

best_tangga <- map_dfr(breaks, function(i){
  
  metric_tangga <- map_dfr(cross_val$splits,
                           function(x){
                             training <- Auto[x$in_id,]
                             training$displacement <- cut(training$displacement,i)
                             
                             mod <- lm(mpg ~ displacement,
                                       data=training)
                             
                             labs_x <- levels(mod$model[,2])
                             mod$model
                             labs_x
                             labs_x_breaks <- cbind(lower = as.numeric( sub("\\((.+),.*", "\\1", labs_x) ),
                                                    upper = as.numeric( sub("[^,]*,([^]]*)\\]", "\\1", labs_x) ))
                             
                             labs_x_breaks
                             testing <- Auto[-x$in_id,]
                             
                             displacement_new <- cut(testing$displacement,c(labs_x_breaks[1,1],labs_x_breaks[,2]))
                             
                             pred <- predict(mod,
                                             newdata=list(displacement=displacement_new))
                             truth <- testing$mpg
                             
                             data_eval <- na.omit(data.frame(truth,pred))
                             
                             rmse <- mlr3measures::rmse(truth = data_eval$truth,
                                                        response = data_eval$pred
                             )
                             mae <- mlr3measures::mae(truth = data_eval$truth,
                                                      response = data_eval$pred
                             )
                             metric <- c(rmse,mae)
                             names(metric) <- c("rmse","mae")
                             return(metric)
                           }
  )
  
  metric_tangga
  
  # menghitung rata-rata untuk 10 folds
  mean_metric_tangga <- colMeans(metric_tangga)
  
  mean_metric_tangga
})
best_tangga <- cbind(breaks=breaks,best_tangga)
# menampilkan hasil all breaks
best_tangga
##    breaks     rmse      mae
## 1       2 5.950709 4.662388
## 2       3 4.927452 3.703130
## 3       4 4.829414 3.635500
## 4       5 4.676728 3.506854
## 5       6 4.612150 3.412363
## 6       7 4.577732 3.369414
## 7       8 4.380313 3.245803
## 8       9 4.217946 3.095100
## 9      10 4.259715 3.139944
## 10     11 4.332656 3.201723
## 11     12 4.416357 3.280542
## 12     13 4.447195 3.305765
#berdasarkan rmse
best_tangga %>% slice_min(rmse)
##   breaks     rmse    mae
## 1      9 4.217946 3.0951
#berdasarkan mae
best_tangga %>% slice_min(mae)
##   breaks     rmse    mae
## 1      9 4.217946 3.0951

Secara empiris, jika dilihat dari rmse dan mae terlihat bahwa regresi fungsi tangga dengan 8 knots memberikan nilai rmse dan mae terkecil.

#breaks 9
ggplot(Auto,aes(x=displacement, y=mpg)) +
  geom_point(alpha=0.55, color="black") +
  stat_smooth(method = "lm", 
              formula = y~cut(x,9), 
              lty = 1, col = "blue",se = F)+ ggtitle("Fungsi Tangga dengan 8 knots") +
  theme_bw()

Natural Cubic Splines

set.seed(123321)
cross_val <- vfold_cv(Auto,v=10)

df <- 1:16

best_spline <- map_dfr(df, function(i){
  metric_spline <- map_dfr(cross_val$splits,
                           function(x){
                             mod <- lm(mpg ~ ns(displacement,df=i),
                                       data=Auto[x$in_id,])
                             pred <- predict(mod,
                                             newdata=Auto[-x$in_id,])
                             truth <- Auto[-x$in_id,]$mpg
                             
                             rmse <- mlr3measures::rmse(truth = truth,
                                                        response = pred
                             )
                             mae <- mlr3measures::mae(truth = truth,
                                                      response = pred
                             )
                             metric <- c(rmse,mae)
                             names(metric) <- c("rmse","mae")
                             return(metric)
                           }
  )
  
  metric_spline
  
  # menghitung rata-rata untuk 10 folds
  mean_metric_spline <- colMeans(metric_spline)
  
  mean_metric_spline
}
)

best_spline <- cbind(df=df,best_spline)
# menampilkan hasil all breaks
best_spline
##    df     rmse      mae
## 1   1 4.635647 3.525799
## 2   2 4.353718 3.182011
## 3   3 4.360948 3.188627
## 4   4 4.380593 3.204923
## 5   5 4.327287 3.206944
## 6   6 4.242236 3.138420
## 7   7 4.212878 3.129519
## 8   8 4.206441 3.138094
## 9   9 4.185692 3.141431
## 10 10 4.169330 3.129803
## 11 11 4.116343 3.082791
## 12 12 4.093939 3.052332
## 13 13 4.110197 3.063677
## 14 14 4.131419 3.085322
## 15 15 4.132794 3.082297
## 16 16 4.103554 3.064158
#berdasarkan rmse
best_spline %>% slice_min(rmse)
##   df     rmse      mae
## 1 12 4.093939 3.052332
#berdasarkan mae
best_spline %>% slice_min(mae)
##   df     rmse      mae
## 1 12 4.093939 3.052332

Secara empiris, jika dilihat dari rmse dan mae terlihat bahwa natural cubic splines dengan 11 knots memberikan nilai rmse dan mae terkecil.

#df=12

ggplot(Auto,aes(x=displacement, y=mpg)) +
  geom_point(alpha=0.55, color="black")+ ggtitle("Natural Cubic Splines dengan 11 knots") +
  stat_smooth(method = "lm", 
              formula = y~ns(x, df=12), 
              lty = 1,se=F)+
  theme_bw()

Pada scatter plot regresi natural cubic splines dengan 11 knots, ujung-ujung regresinya liar.

poly8<-ggplot(Auto,aes(x=displacement, y=mpg)) + 
  geom_point(alpha=0.55, color="black") +
  stat_smooth(method = "lm", 
              formula = y~poly(x,8,raw=T), 
              lty = 1, col = "blue",se = F)+ ggtitle("Regresi Polinomial Ordo 8")+
  theme_bw()

poly2<-ggplot(Auto,aes(x=displacement, y=mpg)) + 
  geom_point(alpha=0.55, color="black") +
  stat_smooth(method = "lm", 
              formula = y~poly(x,2,raw=T), 
              lty = 1, col = "blue",se = F)+ ggtitle("Regresi Polinomial Ordo 2")+
  theme_bw()

pc8<-ggplot(Auto,aes(x=displacement, y=mpg)) +
  geom_point(alpha=0.55, color="black") +
  stat_smooth(method = "lm", 
              formula = y~cut(x,9), 
              lty = 1, col = "blue",se = F)+ ggtitle("Fungsi Tangga dengan 8 knots") +
  theme_bw()

nc11<-ggplot(Auto,aes(x=displacement, y=mpg)) +
  geom_point(alpha=0.55, color="black")+ ggtitle("Natural Cubic Splines dengan 11 knots") +
  stat_smooth(method = "lm", 
              formula = y~ns(x, df=12), 
              lty = 1,se=F)+
  theme_bw()

plot_grid(poly8,poly2,pc8,nc11,labels=c("poly8","poly2","pc8","nc11"), label_size = 6)

Secara rmse dan mae, natural cubic splines dengan 11 knots memiliki nilai terkecil. Tetapi pada ujung-ujungnya liar, maka secara visual regresi polinomial ordo 2 lebih baik.

mpg vs weight

Polynomial

#POLY#
set.seed(12333)
cross_val <- vfold_cv(Auto,v=10)
degree <- 1:8

best_poly <- map_dfr(degree,function(i){
  metric_poly <- map_dfr(cross_val$splits,
                         function(x){
                           mod <- lm(mpg ~ poly(weight,i,raw = T),
                                     data=Auto[x$in_id,])
                           pred <- predict(mod,
                                           newdata=Auto[-x$in_id,])
                           truth <- Auto[-x$in_id,]$mpg
                           
                           rmse <- mlr3measures::rmse(truth = truth,
                                                      response = pred
                           )
                           mae <- mlr3measures::mae(truth = truth,
                                                    response = pred
                           )
                           metric <- c(rmse,mae)
                           names(metric) <- c("rmse","mae")
                           return(metric)
                         }
  )
  metric_poly
  
  # menghitung rata-rata untuk 10 folds
  mean_metric_poly <- colMeans(metric_poly)
  
  mean_metric_poly
})

best_poly<- cbind(degree=degree,best_poly)

best_poly
##   degree     rmse      mae
## 1      1 4.297314 3.291385
## 2      2 4.131395 3.071654
## 3      3 4.136525 3.079694
## 4      4 4.143885 3.084393
## 5      5 4.146502 3.091196
## 6      6 4.157107 3.111192
## 7      7 4.171308 3.127232
## 8      8 4.191974 3.142416
#berdasarkan rmse
best_poly %>% slice_min(rmse)
##   degree     rmse      mae
## 1      2 4.131395 3.071654
#berdasarkan mae
best_poly %>% slice_min(mae)
##   degree     rmse      mae
## 1      2 4.131395 3.071654

Secara empiris, jika dilihat dari rmse dan mae terlihat bahwa polinomial ordo 2 memberikan nilai rmse dan mae terkecil.

#degree=2
ggplot(Auto,aes(x=weight, y=mpg)) + 
  geom_point(alpha=0.55, color="black") +
  stat_smooth(method = "lm", 
              formula = y~poly(x,2,raw=T), 
              lty = 1, col = "blue",se = F)+ ggtitle("Regresi Polinomial Ordo 2")+
  theme_bw()

Piecewise Constant

#Tangga#
set.seed(32331)
cross_val <- vfold_cv(Auto,v=10)

breaks <- 2:13

best_tangga <- map_dfr(breaks, function(i){
  
  metric_tangga <- map_dfr(cross_val$splits,
                           function(x){
                             training <- Auto[x$in_id,]
                             training$weight <- cut(training$weight,i)
                             
                             mod <- lm(mpg ~ weight,
                                       data=training)
                             
                             labs_x <- levels(mod$model[,2])
                             mod$model
                             labs_x
                             labs_x_breaks <- cbind(lower = as.numeric( sub("\\((.+),.*", "\\1", labs_x) ),
                                                    upper = as.numeric( sub("[^,]*,([^]]*)\\]", "\\1", labs_x) ))
                             
                             labs_x_breaks
                             testing <- Auto[-x$in_id,]
                             
                             weight_new <- cut(testing$weight,c(labs_x_breaks[1,1],labs_x_breaks[,2]))
                             
                             pred <- predict(mod,
                                             newdata=list(weight=weight_new))
                             truth <- testing$mpg
                             
                             data_eval <- na.omit(data.frame(truth,pred))
                             
                             rmse <- mlr3measures::rmse(truth = data_eval$truth,
                                                        response = data_eval$pred
                             )
                             mae <- mlr3measures::mae(truth = data_eval$truth,
                                                      response = data_eval$pred
                             )
                             metric <- c(rmse,mae)
                             names(metric) <- c("rmse","mae")
                             return(metric)
                           }
  )
  
  metric_tangga
  
  
  
  # menghitung rata-rata untuk 10 folds
  mean_metric_tangga <- colMeans(metric_tangga)
  
  mean_metric_tangga
})
best_tangga <- cbind(breaks=breaks,best_tangga)
# menampilkan hasil all breaks
best_tangga
##    breaks     rmse      mae
## 1       2 5.567536 4.349647
## 2       3 4.839053 3.637560
## 3       4 4.713463 3.636524
## 4       5 4.338959 3.216653
## 5       6 4.211518 3.124298
## 6       7 4.336175 3.192620
## 7       8 4.406843 3.270115
## 8       9 4.306461 3.168009
## 9      10 4.279202 3.155884
## 10     11 4.308636 3.188655
## 11     12 4.168457 3.061116
## 12     13 4.206414 3.145069
#berdasarkan rmse
best_tangga %>% slice_min(rmse)
##   breaks     rmse      mae
## 1     12 4.168457 3.061116
#berdasarkan mae
best_tangga %>% slice_min(mae)
##   breaks     rmse      mae
## 1     12 4.168457 3.061116

Secara empiris, jika dilihat dari rmse dan mae terlihat bahwa regresi fungsi tangga dengan 11 knots memberikan nilai rmse dan mae terkecil.

#breaks 12
ggplot(Auto,aes(x=weight, y=mpg)) +
  geom_point(alpha=0.55, color="black") +
  stat_smooth(method = "lm", 
              formula = y~cut(x,12), 
              lty = 1, col = "blue",se = F)+ ggtitle("Fungsi Tangga dengan 11 knots") +
  theme_bw()

Natural Cubic Splines

set.seed(12332233)
cross_val <- vfold_cv(Auto,v=10)

df <- 1:16

best_spline <- map_dfr(df, function(i){
  metric_spline <- map_dfr(cross_val$splits,
                           function(x){
                             mod <- lm(mpg ~ ns(weight,df=i),
                                       data=Auto[x$in_id,])
                             pred <- predict(mod,
                                             newdata=Auto[-x$in_id,])
                             truth <- Auto[-x$in_id,]$mpg
                             
                             rmse <- mlr3measures::rmse(truth = truth,
                                                        response = pred
                             )
                             mae <- mlr3measures::mae(truth = truth,
                                                      response = pred
                             )
                             metric <- c(rmse,mae)
                             names(metric) <- c("rmse","mae")
                             return(metric)
                           }
  )
  
  metric_spline
  
  # menghitung rata-rata untuk 10 folds
  mean_metric_spline <- colMeans(metric_spline)
  
  mean_metric_spline
}
)





best_spline <- cbind(df=df,best_spline)
# menampilkan hasil all breaks
best_spline
##    df     rmse      mae
## 1   1 4.307677 3.291863
## 2   2 4.137597 3.075156
## 3   3 4.143246 3.076418
## 4   4 4.156762 3.083247
## 5   5 4.167362 3.099487
## 6   6 4.144889 3.093469
## 7   7 4.136167 3.068934
## 8   8 4.147935 3.089412
## 9   9 4.150859 3.078581
## 10 10 4.152071 3.082931
## 11 11 4.148494 3.082637
## 12 12 4.152773 3.092686
## 13 13 4.183303 3.110217
## 14 14 4.165294 3.100449
## 15 15 4.194780 3.119925
## 16 16 4.190865 3.121880
#berdasarkan rmse
best_spline %>% slice_min(rmse)
##   df     rmse      mae
## 1  7 4.136167 3.068934
#berdasarkan mae
best_spline %>% slice_min(mae)
##   df     rmse      mae
## 1  7 4.136167 3.068934

Secara empiris, jika dilihat dari rmse dan mae terlihat bahwa regresi natural cubic splines dengan 6 knots memberikan nilai rmse dan mae terkecil.

#df=7

ggplot(Auto,aes(x=weight, y=mpg)) +
  geom_point(alpha=0.55, color="black")+ ggtitle("Natural Cubic Splines dengan 6 knots") +
  stat_smooth(method = "lm", 
              formula = y~ns(x, df=7), 
              lty = 1,se=F)+
  theme_bw()

Pada scatter plot regresi natural cubic splines dengan 6 knots, ujung-ujung regresinya liar.

poly2<-ggplot(Auto,aes(x=weight, y=mpg)) + 
  geom_point(alpha=0.55, color="black") +
  stat_smooth(method = "lm", 
              formula = y~poly(x,2,raw=T), 
              lty = 1, col = "blue",se = F)+ ggtitle("Regresi Polinomial Ordo 2")+
  theme_bw()

pc11<-ggplot(Auto,aes(x=weight, y=mpg)) +
  geom_point(alpha=0.55, color="black") +
  stat_smooth(method = "lm", 
              formula = y~cut(x,12), 
              lty = 1, col = "blue",se = F)+ ggtitle("Fungsi Tangga dengan 11 knots") +
  theme_bw()

nc6<-ggplot(Auto,aes(x=weight, y=mpg)) +
  geom_point(alpha=0.55, color="black")+ ggtitle("Natural Cubic Splines dengan 6 knots") +
  stat_smooth(method = "lm", 
              formula = y~ns(x, df=7), 
              lty = 1,se=F)+
  theme_bw()

plot_grid(poly2,pc11,nc6, labels=c("poly2","pc11","nc6"),label_size = 6)

Jika dilihat dari rmse, polinomial ordo 2 memiliki nilai terkecil. Sedangkan berdasarkan mae, fungsi tangga dengan 11 knots memiliki nilai terkecil.

mpg vs horsepower

Polynomial

set.seed(333)
cross_val <- vfold_cv(Auto,v=10)
degree <- 1:10

best_poly <- map_dfr(degree,function(i){
  metric_poly <- map_dfr(cross_val$splits,
                         function(x){
                           mod <- lm(mpg ~ poly(horsepower,i,raw = T),
                                     data=Auto[x$in_id,])
                           pred <- predict(mod,
                                           newdata=Auto[-x$in_id,])
                           truth <- Auto[-x$in_id,]$mpg
                           
                           rmse <- mlr3measures::rmse(truth = truth,
                                                      response = pred
                           )
                           mae <- mlr3measures::mae(truth = truth,
                                                    response = pred
                           )
                           metric <- c(rmse,mae)
                           names(metric) <- c("rmse","mae")
                           return(metric)
                         }
  )
  metric_poly
  
  # menghitung rata-rata untuk 10 folds
  mean_metric_poly <- colMeans(metric_poly)
  
  mean_metric_poly
})

best_poly<- cbind(degree=degree,best_poly)

best_poly
##    degree     rmse      mae
## 1       1 4.888687 3.848017
## 2       2 4.325017 3.269488
## 3       3 4.323753 3.263298
## 4       4 4.330573 3.278704
## 5       5 4.292058 3.246026
## 6       6 4.300889 3.258456
## 7       7 4.293910 3.237582
## 8       8 4.318637 3.256122
## 9       9 4.330224 3.252814
## 10     10 4.390391 3.287209
#berdasarkan rmse
best_poly %>% slice_min(rmse)
##   degree     rmse      mae
## 1      5 4.292058 3.246026
#berdasarkan mae
best_poly %>% slice_min(mae)
##   degree    rmse      mae
## 1      7 4.29391 3.237582

Secara empiris, jika dilihat dari rmse terlihat bahwa polinomial ordo 5 memberikan nilai rmse terkecil. Sedangkan jika dilihat berdasarkan mae, polinomial ordo 7 memberikan nilai mae terkecil.

#degree=5
ggplot(Auto,aes(x=horsepower, y=mpg)) + 
  geom_point(alpha=0.55, color="black") +
  stat_smooth(method = "lm", 
              formula = y~poly(x,5,raw=T), 
              lty = 1, col = "blue",se = F)+ ggtitle("Regresi Polinomial Ordo 5")+
  theme_bw()

#degree 7
ggplot(Auto,aes(x=horsepower, y=mpg)) + 
  geom_point(alpha=0.55, color="black") +
  stat_smooth(method = "lm", 
              formula = y~poly(x,7,raw=T), 
              lty = 1, col = "blue",se = F)+ ggtitle("Regresi Polinomial Ordo 7")+
  theme_bw()

Piecewise Constant

set.seed(13323)
cross_val <- vfold_cv(Auto,v=10)

breaks <- 2:15

best_tangga <- map_dfr(breaks, function(i){
  
  metric_tangga <- map_dfr(cross_val$splits,
                           function(x){
                             training <- Auto[x$in_id,]
                             training$horsepower <- cut(training$horsepower,i)
                             
                             mod <- lm(mpg ~ horsepower,
                                       data=training)
                             
                             labs_x <- levels(mod$model[,2])
                             mod$model
                             labs_x
                             labs_x_breaks <- cbind(lower = as.numeric( sub("\\((.+),.*", "\\1", labs_x) ),
                                                    upper = as.numeric( sub("[^,]*,([^]]*)\\]", "\\1", labs_x) ))
                             
                             labs_x_breaks
                             testing <- Auto[-x$in_id,]
                             
                             horsepower_new <- cut(testing$horsepower,c(labs_x_breaks[1,1],labs_x_breaks[,2]))
                             
                             pred <- predict(mod,
                                             newdata=list(horsepower=horsepower_new))
                             truth <- testing$mpg
                             
                             data_eval <- na.omit(data.frame(truth,pred))
                             
                             rmse <- mlr3measures::rmse(truth = data_eval$truth,
                                                        response = data_eval$pred
                             )
                             mae <- mlr3measures::mae(truth = data_eval$truth,
                                                      response = data_eval$pred
                             )
                             metric <- c(rmse,mae)
                             names(metric) <- c("rmse","mae")
                             return(metric)
                           }
  )
  
  metric_tangga
  
  
  
  # menghitung rata-rata untuk 10 folds
  mean_metric_tangga <- colMeans(metric_tangga)
  
  mean_metric_tangga
})
best_tangga <- cbind(breaks=breaks,best_tangga)
# menampilkan hasil all breaks
best_tangga
##    breaks     rmse      mae
## 1       2 6.137728 4.811516
## 2       3 5.773404 4.540866
## 3       4 4.993003 3.811826
## 4       5 4.688686 3.590960
## 5       6 4.609176 3.541391
## 6       7 4.563583 3.388485
## 7       8 4.496025 3.454513
## 8       9 4.560964 3.493910
## 9      10 4.530724 3.437346
## 10     11 4.519584 3.366985
## 11     12 4.435169 3.329911
## 12     13 4.513995 3.379557
## 13     14 4.433294 3.274231
## 14     15 4.337768 3.294444
#berdasarkan rmse
best_tangga %>% slice_min(rmse)
##   breaks     rmse      mae
## 1     15 4.337768 3.294444
#berdasarkan mae
best_tangga %>% slice_min(mae)
##   breaks     rmse      mae
## 1     14 4.433294 3.274231

Secara empiris, jika dilihat dari rmse terlihat bahwa regresi fungsi tangga dengan 14 knots memberikan nilai rmse terkecil. Sedangkan jika dilihat berdasarkan mae, regresi fungsi tangga dengan 14 knots memberikan nilai mae terkecil.

#breaks 14
ggplot(Auto,aes(x=horsepower, y=mpg)) +
  geom_point(alpha=0.55, color="black") +
  stat_smooth(method = "lm", 
              formula = y~cut(x,14), 
              lty = 1, col = "blue",se = F)+ ggtitle("Fungsi Tangga dengan 13 knots") +
  theme_bw()

#breaks 15
ggplot(Auto,aes(x=horsepower, y=mpg)) +
  geom_point(alpha=0.55, color="black") +
  stat_smooth(method = "lm", 
              formula = y~cut(x,15), 
              lty = 1, col = "blue",se = F)+ ggtitle("Fungsi Tangga dengan 14 knots") +
  theme_bw()

Natural Cubic Splines

set.seed(3234)
cross_val <- vfold_cv(Auto,v=10)

df <- 1:16

best_spline <- map_dfr(df, function(i){
  metric_spline <- map_dfr(cross_val$splits,
                           function(x){
                             mod <- lm(mpg ~ ns(horsepower,df=i),
                                       data=Auto[x$in_id,])
                             pred <- predict(mod,
                                             newdata=Auto[-x$in_id,])
                             truth <- Auto[-x$in_id,]$mpg
                             
                             rmse <- mlr3measures::rmse(truth = truth,
                                                        response = pred
                             )
                             mae <- mlr3measures::mae(truth = truth,
                                                      response = pred
                             )
                             metric <- c(rmse,mae)
                             names(metric) <- c("rmse","mae")
                             return(metric)
                           }
  )
  
  metric_spline
  
  # menghitung rata-rata untuk 10 folds
  mean_metric_spline <- colMeans(metric_spline)
  
  mean_metric_spline
}
)





best_spline <- cbind(df=df,best_spline)
# menampilkan hasil all breaks
best_spline
##    df     rmse      mae
## 1   1 4.908532 3.861726
## 2   2 4.335138 3.270106
## 3   3 4.370296 3.280222
## 4   4 4.356550 3.272160
## 5   5 4.338281 3.265823
## 6   6 4.325218 3.252178
## 7   7 4.314007 3.234587
## 8   8 4.338375 3.263733
## 9   9 4.326461 3.233684
## 10 10 4.318222 3.218378
## 11 11 4.336301 3.245917
## 12 12 4.317929 3.193365
## 13 13 4.336178 3.233167
## 14 14 4.311247 3.225107
## 15 15 4.330979 3.260707
## 16 16 4.341010 3.269776
#berdasarkan rmse
best_spline %>% slice_min(rmse)
##   df     rmse      mae
## 1 14 4.311247 3.225107
#berdasarkan mae
best_spline %>% slice_min(mae)
##   df     rmse      mae
## 1 12 4.317929 3.193365

Secara empiris, jika dilihat dari rmse terlihat bahwa regresi natural cubic splines dengan 13 knots memberikan nilai rmse terkecil. Sedangkan jika dilihat berdasarkan mae, regresi natural cubic splines dengan 11 knots memberikan nilai mae terkecil.

#df=12
ggplot(Auto,aes(x=horsepower, y=mpg)) +
  geom_point(alpha=0.55, color="black")+ ggtitle("Natural Cubic Splines dengan 11 knots") +
  stat_smooth(method = "lm", 
              formula = y~ns(x, df=12), 
              lty = 1,se=F)+
  theme_bw()

#df=14
ggplot(Auto,aes(x=horsepower, y=mpg)) +
  geom_point(alpha=0.55, color="black")+ ggtitle("Natural Cubic Splines dengan 13 knots") +
  stat_smooth(method = "lm", 
              formula = y~ns(x, df=14), 
              lty = 1,se=F)+
  theme_bw()

poly5<-ggplot(Auto,aes(x=horsepower, y=mpg)) + 
  geom_point(alpha=0.55, color="black") +
  stat_smooth(method = "lm", 
              formula = y~poly(x,5,raw=T), 
              lty = 1, col = "blue",se = F)+ ggtitle("Regresi Polinomial Ordo 5")+
  theme_bw()

poly7<-ggplot(Auto,aes(x=horsepower, y=mpg)) + 
  geom_point(alpha=0.55, color="black") +
  stat_smooth(method = "lm", 
              formula = y~poly(x,7,raw=T), 
              lty = 1, col = "blue",se = F)+ ggtitle("Regresi Polinomial Ordo 7")+
  theme_bw()

pc13<-ggplot(Auto,aes(x=horsepower, y=mpg)) +
  geom_point(alpha=0.55, color="black") +
  stat_smooth(method = "lm", 
              formula = y~cut(x,14), 
              lty = 1, col = "blue",se = F)+ ggtitle("Fungsi Tangga dengan 13 knots") +
  theme_bw()

pc14<-ggplot(Auto,aes(x=horsepower, y=mpg)) +
  geom_point(alpha=0.55, color="black") +
  stat_smooth(method = "lm", 
              formula = y~cut(x,15), 
              lty = 1, col = "blue",se = F)+ ggtitle("Fungsi Tangga dengan 14 knots") +
  theme_bw()

nc11<-ggplot(Auto,aes(x=horsepower, y=mpg)) +
  geom_point(alpha=0.55, color="black")+ ggtitle("Natural Cubic Splines dengan 11 knots") +
  stat_smooth(method = "lm", 
              formula = y~ns(x, df=12), 
              lty = 1,se=F)+
  theme_bw()

nc13<-ggplot(Auto,aes(x=horsepower, y=mpg)) +
  geom_point(alpha=0.55, color="black")+ ggtitle("Natural Cubic Splines dengan 13 knots") +
  stat_smooth(method = "lm", 
              formula = y~ns(x, df=14), 
              lty = 1,se=F)+
  theme_bw()

plot_grid(poly5,poly7, pc13,pc14,nc11,nc13,labels=c("poly5","poly7", "pc13","pc14","nc11","nc13"), label_size = 6)

Jika dilihat dari nilai rmse, polinomial ordo 5 memiliki nilai terkecil. Sedangkan berdasarkan nilai mae, natural cubic dengan 11 knots memiliki nilai terkecil.