Tugas Praktikum STA 581-8 | Model Non Linear


Soal

Data untuk tugas dapat diperoleh di package ISLR. Silahkan install dulu packagenya.

Gunakan model-model non linear yang sudah dipelajari pada data Auto dengan peubah respon mpg dan pilih tiga kolom untuk dijadikan peubah prediktor.

  1. Apakah ada bukti untuk hubungan non linear untuk peubah-peubah yang anda pilih? Buat visualisasi untuk mendukung jawaban Anda.

  2. Tentukan model non-linear terbaik untuk masing pasangan peubah (secara visual dan/atau secara empiris).


Data

Dataset Auto terdiri dari 392 observasi dengan 9 variabel yaitu :

  • mpg (miles per gallon) : penggunaan bahan bakar yang dihitung dalam mpg
  • cylinders : jumlah silinder yang digunakan pada kendaraan
  • displacement : besarnya perpindahan mesin (cu.inches)
  • horsepower : tenaga kuda dari kendaraan
  • weight : berat kendaraan (lbs.)
  • acceleration : percepatan dari 0 ke 60 mph (detik)
  • year : tahun model (modulo 100)
  • origin : asal negara produsen (1 : Amerika, 2 : Eropa, 3 : Jepang)
  • name : nama kendaraan

Peubah respon : mpg

Peubah prediktor : horsepower, displacement, dan weight

library(ISLR)
library(tidyverse)
library(splines)
library(ggplot2)
library(ggpubr)
library(rsample)
library(caret)
library(DT)
head(Auto)
##   mpg cylinders displacement horsepower weight acceleration year origin
## 1  18         8          307        130   3504         12.0   70      1
## 2  15         8          350        165   3693         11.5   70      1
## 3  18         8          318        150   3436         11.0   70      1
## 4  16         8          304        150   3433         12.0   70      1
## 5  17         8          302        140   3449         10.5   70      1
## 6  15         8          429        198   4341         10.0   70      1
##                        name
## 1 chevrolet chevelle malibu
## 2         buick skylark 320
## 3        plymouth satellite
## 4             amc rebel sst
## 5               ford torino
## 6          ford galaxie 500

Plot Awal

A <-ggplot(Auto,aes(x=horsepower, y=mpg)) +
  geom_point(alpha=0.3, color="orange") +
  theme_bw()+
  ggtitle("Plot horsepower vs mpg") +
  xlab("horsepower") + 
  ylab("mpg")

B <-ggplot(Auto,aes(x=displacement, y=mpg)) +
  geom_point(alpha=0.3, color="purple") +
  theme_bw()+
  ggtitle("Plot displacement vs mpg") +
  xlab("displacement") + 
  ylab("mpg")

C <- ggplot(Auto,aes(x=weight, y=mpg)) + 
  geom_point(alpha=1, color="pink") +
  theme_bw()+
  ggtitle("Plot weight vs mpg") +
  xlab("weight") + 
  ylab("mpg")

ggarrange(A, B, C, ncol = 2, nrow = 2)

Berdasarkan scatter plot di atas, terlihat bahwa sebaran data tidak membentuk pola linear. Hubungan ini akan diuji dengan model regresi non linear, yaitu polinomial, fungsi tangga dan natural cubic spline dan akan ditentukan model terbaik dari masing-masing pasangan peubahnya.


Peubah mpg vs horsepower

Regresi Linier

set.seed(123)
cross_val_h_lin <- vfold_cv(Auto,v=5,strata = "mpg")
metric_linear_h <- map_dfr(cross_val_h_lin$splits,
    function(x){
mod <- lm(mpg ~ horsepower,data=Auto[x$in_id,])
pred <- predict(mod,newdata=Auto[-x$in_id,])
truth <- Auto[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth,response = pred)
mae <- mlr3measures::mae(truth = truth,response = pred)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
metric_linear_h
## # A tibble: 5 x 2
##    rmse   mae
##   <dbl> <dbl>
## 1  4.45  3.49
## 2  5.42  4.22
## 3  4.80  3.64
## 4  4.71  3.89
## 5  5.11  3.94
# menghitung rata-rata untuk 5 folds
mean_metric_linear_h <- colMeans(metric_linear_h)
mean_metric_linear_h
##     rmse      mae 
## 4.898592 3.835773
Lh <- ggplot(Auto,aes(x=horsepower, y=mpg)) +
  geom_point(alpha=0.3, color="orange") +
  stat_smooth(method = "lm", 
              formula = y~x,lty = 1,
              col = "black",se = F)+
  theme_bw()+
  ggtitle("Linear") +
  xlab("horsepower") + 
  ylab("mpg")
Lh

Regresi Polinomial

# Cross Validation dengan 5 lipatan
set.seed(123)
cross_val_hrspwr <- vfold_cv(Auto,v=5, strata = "mpg") #rsample

ordo <- 2:5
best_poly_hrspwr <- map_dfr(ordo, function(i) { #tidiverse iterasi pada suatu fungsi (output data.frame) 
    Polinomial_hrspwr <- map_dfr(cross_val_hrspwr$splits, 
                                 function(x){
    
    mod <- lm(mpg ~ poly(horsepower,ordo=i),data=Auto[x$in_id,]) #in_id adalah data training
    pred <- predict(mod,newdata=Auto[-x$in_id,]) #nilai prediksi untuk data testing
    truth <- Auto[-x$in_id,]$mpg
    rmse <- mlr3measures::rmse(truth = truth, response = pred)
    mae <- mlr3measures::mae(truth = truth, response = pred)
    metric <- c(rmse,mae)
    names(metric) <- c("rmse","mae")
    return(metric)
    }
    )

Polinomial_hrspwr

# menghitung rata-rata untuk 5 folds
mean_Polinomial_hrspwr <- colMeans(Polinomial_hrspwr)
mean_Polinomial_hrspwr
  
}
)

# menampilkan hasil all ordo
best_poly_hrspwr <- cbind(ordo=ordo,best_poly_hrspwr)
best_poly_hrspwr
##   ordo     rmse      mae
## 1    2 4.370631 3.284748
## 2    3 4.372623 3.280150
## 3    4 4.373708 3.288706
## 4    5 4.338150 3.271598
# berdasarkan rmse
best_poly_hrspwr %>% slice_min(rmse)
##   ordo    rmse      mae
## 1    5 4.33815 3.271598
# berdasarkan mae
best_poly_hrspwr %>% slice_min(mae)
##   ordo    rmse      mae
## 1    5 4.33815 3.271598

Plot CV

best_poly_hrspwr$ordo <- as.factor(best_poly_hrspwr$ordo)
ggplot(data=best_poly_hrspwr, aes(x=ordo, y=rmse, group=1)) +
  geom_line(linetype = "dashed")+
  geom_point(cex=3,col="red")+
  ggtitle("K-Fold CV") +
  xlab("Ordo") + 
  ylab("RMSE")+
  theme_bw()

Secara empiris regresi polinomial dengan ordo 5 memiliki RMSE dan MAE terkecil. Namun, dipilih regresi polinomial dengan ordo 2 karena untuk ordo 5 pada bagian ujung-ujung plot cenderung tidak stabil.

Ph2 <- ggplot(Auto,aes(x=horsepower, y=mpg)) + 
  geom_point(alpha=0.3, color="orange") +
  stat_smooth(method = "lm", 
              formula = y~poly(x,2,raw=T), 
              lty = 1, col = "green3",se = F)+
  theme_bw()+
  ggtitle("Polinomial Ordo 2") +
  xlab("horsepower") + 
  ylab("mpg") 

Ph5 <- ggplot(Auto,aes(x=horsepower, y=mpg)) + 
  geom_point(alpha=0.3, color="orange") +
  stat_smooth(method = "lm", 
              formula = y~poly(x,5,raw=T), 
              lty = 1, col = "green3",se = F)+
  theme_bw()+
  ggtitle("Polinomial Ordo 5") +
  xlab("horsepower") + 
  ylab("mpg")

ggarrange(Ph2, Ph5, ncol = 2)

set.seed(123)
cross_val_hrspwr <- vfold_cv(Auto,v=5, strata = "mpg") 

    Polinomial_hrspwr2 <- map_dfr(cross_val_hrspwr$splits, 
       function(x){
    mod <- lm(mpg ~ poly(horsepower,2,raw=T),data=Auto[x$in_id,])
    pred <- predict(mod,newdata=Auto[-x$in_id,])
    truth <- Auto[-x$in_id,]$mpg
    rmse <- mlr3measures::rmse(truth = truth, response = pred)
    mae <- mlr3measures::mae(truth = truth, response = pred)
    metric <- c(rmse,mae)
    names(metric) <- c("rmse","mae")
    return(metric)
    }
    )

Polinomial_hrspwr2
## # A tibble: 5 x 2
##    rmse   mae
##   <dbl> <dbl>
## 1  3.80  2.78
## 2  4.78  3.65
## 3  4.19  3.08
## 4  4.01  3.13
## 5  5.07  3.78
# menghitung rata-rata untuk 5 folds
mean_Polinomial_hrspwr2 <- colMeans(Polinomial_hrspwr2)
mean_Polinomial_hrspwr2
##     rmse      mae 
## 4.370631 3.284748

Fungsi Tangga (Piecewise Constant)

set.seed(123)
cross_val_hrspwr <- vfold_cv(Auto,v=5,strata = "mpg")

breaks <- 2:8  # pembagian interval 2 sampai 8

best_ftangga_hrspwr <- map_dfr(breaks, 
      function(i){
  
ftangga_hrspwr <- map_dfr(cross_val_hrspwr$splits, 
      function(x){
training <- Auto[x$in_id,]
training$horsepower <- cut(training$horsepower,i) #Transformasi jadi factor
mod <- lm(mpg ~ horsepower,data=training)

#Convert ke numeric lagi
labs_x <- levels(mod$model[,2])
labs_x_breaks <- cbind(lower = as.numeric( sub("\\((.+),.*", 
                              "\\1", labs_x) ),
                      upper = as.numeric( sub("[^,]*,([^]]*)\\]", 
                              "\\1", labs_x) ))

#Testing
testing <- Auto[-x$in_id,]

horsepower_new <- cut(testing$horsepower,c(labs_x_breaks[1,1],labs_x_breaks[,2]))
pred <- predict(mod,newdata=list(horsepower=horsepower_new))
truth <- testing$mpg
data_eval <- na.omit(data.frame(truth,pred))
rmse <- mlr3measures::rmse(truth = data_eval$truth, response = data_eval$pred)
mae <- mlr3measures::mae(truth = data_eval$truth, response = data_eval$pred)
                  
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}

)
ftangga_hrspwr
  
# menghitung rata-rata untuk 5 folds
ftangga_hrspwr <- colMeans(ftangga_hrspwr)
ftangga_hrspwr
  
}
)
best_ftangga_hrspwr <- cbind(breaks=breaks, best_ftangga_hrspwr)

# menampilkan hasil all breaks
best_ftangga_hrspwr
##   breaks     rmse      mae
## 1      2 6.119241 4.756365
## 2      3 5.788482 4.522081
## 3      4 5.026946 3.778564
## 4      5 4.737723 3.588164
## 5      6 4.728788 3.573371
## 6      7 4.590146 3.411292
## 7      8 4.492694 3.419407
#berdasarkan rmse
best_ftangga_hrspwr %>% slice_min(rmse)
##   breaks     rmse      mae
## 1      8 4.492694 3.419407
#berdasarkan mae
best_ftangga_hrspwr %>% slice_min(mae)
##   breaks     rmse      mae
## 1      7 4.590146 3.411292

Berdasarkan RMSE terkecil banyaknya interval yang terbaik adalah 8 sedangkan berdasarkan MAE terkecil banyaknya interval terbaik adalah 7.

Plot CV

T1 <- ggplot(data=best_ftangga_hrspwr, aes(x=breaks, y=rmse, group=1)) +
  geom_line(linetype = "dashed")+
  geom_point(cex=3,col="red")+
  ggtitle("K-Fold CV") +
  xlab("Breaks") + 
  ylab("RMSE")+
  theme_bw()

T2 <- ggplot(data=best_ftangga_hrspwr, aes(x=breaks, y=mae, group=1)) +
  geom_line(linetype = "dashed")+
  geom_point(cex=3,col="red")+
  ggtitle("K-Fold CV") +
  xlab("Breaks") + 
  ylab("MAE")+
  theme_bw()

ggarrange(T1, T2, ncol = 2, nrow = 1)

Th <- ggplot(Auto,aes(x=horsepower, y=mpg)) +
  geom_point(alpha=0.3, color="orange") +
  stat_smooth(method = "lm", 
              formula = y~cut(x,8),lty = 1,
              col = "red",se = F)+
  theme_bw()+
  ggtitle("Fungsi Tangga 8 knots") +
  xlab("horsepower") + 
  ylab("mpg") 
Th

Natural Cubic Spline

set.seed(123)
cross_val_h_ns <- vfold_cv(Auto,v=5,strata = "mpg")

df <- 2:15

best_nspline_h <- map_dfr(df, function(i){
  nspline_h <- map_dfr(cross_val_h_ns$splits, function(x){
    mod <- lm(mpg ~ ns(horsepower,df=i),data=Auto[x$in_id,])
    pred <- predict(mod, newdata=Auto[-x$in_id,])
    truth <- Auto[-x$in_id,]$mpg
    rmse <- mlr3measures::rmse(truth = truth, response = pred)
    mae <- mlr3measures::mae(truth = truth, response = pred)
    metric <- c(rmse,mae)
    names(metric) <- c("rmse","mae")
    return(metric)
    }
    )
  nspline_h

# menghitung rata-rata untuk 5 folds
mean_nspline_h <- colMeans(nspline_h)
mean_nspline_h
}
)
best_nspline_h <- cbind(df=df,best_nspline_h)

# menampilkan hasil all breaks
best_nspline_h
##    df     rmse      mae
## 1   2 4.350969 3.264158
## 2   3 4.358924 3.270071
## 3   4 4.349080 3.276399
## 4   5 4.334825 3.266389
## 5   6 4.335422 3.264572
## 6   7 4.315948 3.260216
## 7   8 4.326330 3.272321
## 8   9 4.305634 3.245872
## 9  10 4.291769 3.231180
## 10 11 4.320656 3.253788
## 11 12 4.320773 3.236359
## 12 13 4.346547 3.266437
## 13 14 4.378758 3.292366
## 14 15 4.373600 3.303746
#berdasarkan rmse
best_nspline_h %>% slice_min(rmse)
##   df     rmse     mae
## 1 10 4.291769 3.23118
#berdasarkan mae
best_nspline_h %>% slice_min(mae)
##   df     rmse     mae
## 1 10 4.291769 3.23118

Plot CV

best_nspline_h$df <- as.factor(best_nspline_h$df)
ggplot(data=best_nspline_h, aes(x=df, y=rmse, group=1)) +
  geom_line(linetype = "dashed")+
  geom_point(cex=3,col="red")+
  ggtitle("K-Fold CV") +
  xlab("DF") + 
  ylab("RMSE")+
  theme_bw()

Berdasarkan hasil di atas, model natural cubic spline terbaik memiliki 10 knots.

Secara Visual

Knots

#nilai knots yang ditentukan oleh komputer
attr(ns(Auto$horsepower, df=10),"knots")
##   10%   20%   30%   40%   50%   60%   70%   80%   90% 
##  67.0  72.0  80.0  88.0  93.5 100.0 110.0 140.0 157.7
NSh <- ggplot(Auto,aes(x=horsepower, y=mpg)) +
  geom_point(alpha=0.3, color="orange") +
stat_smooth(method = "lm", 
               formula = y~ns(x, df=10), 
               lty = 1,col = "blue", se=F)+
  theme_bw()+
  ggtitle("Natural Cubic Splines (df=10)") +
  xlab("horsepower") + 
  ylab("mpg")+
  geom_vline(xintercept = c(67, 72, 80, 88, 93.5, 100, 110, 140,157.7), col="grey50", lty=2)
NSh

Perbandingan Model

Secara Empiris berdasarkan nilai RMSE dan MAE

nilai_metric_h <- rbind(mean_metric_linear_h,
                  mean_Polinomial_hrspwr2,
                  best_ftangga_hrspwr %>% select(-1) %>% slice_min(rmse),
                  best_nspline_h %>% select(-1)%>% slice_min(rmse)
                  )

nama_model_regresi_h <- c("Linear",
                          "Polinomial ordo 2",
                          "Fungsi Tangga (breaks=8)",
                          "Natural Cubic Splines (df=10)"
                          )

komparasi <- data.frame(nama_model_regresi_h,nilai_metric_h)
komparasi %>%  datatable(komparasi, class = 'cell-border stripe')

Berdasarkan perbandingan secara empiris di atas, model terbaik yang dapat digunakan untuk memodelkan mpg vs horsepower adalah Natural Cubic Spline dengan 10 Knot.

Secara Visual berdasarkan Plot

ggarrange(Lh, Ph2, Th, NSh, ncol = 2, nrow = 2)

plot_all_horsepower <- ggplot(Auto,aes(x=horsepower, y=mpg)) +
  geom_point(alpha=0.3, color="orange") +
  
  stat_smooth(method = "lm", 
              formula = y~x,
              lty = 1, col = "black",se = F)+
  
  stat_smooth(method = "lm", 
               formula = y~poly(x,2,raw=T), 
               lty = 1, col = "green",se = F)+
  
  stat_smooth(method = "lm", 
               formula = y~cut(x,8), 
               lty = 1, col = "red",se = F)+
  
  stat_smooth(method = "lm", 
               formula = y~ns(x, df=10), 
               lty = 1,col = "blue", se = F)+
  
  labs(x = "horsepower" , y = "mpg") +
  ggtitle(expression(atop("Gabungan Model Terbaik", 
                          atop(italic("mpg vs horsepower"))))) +
  theme(plot.title = element_text(hjust = 0.5, vjust = -2))

plot_all_horsepower


Peubah mpg vs displacement

Regresi Linier

set.seed(123)
cross_val_lin_d <- vfold_cv(Auto,v=5,strata = "mpg")
metric_linear_d <- map_dfr(cross_val_lin_d$splits,
    function(x){
mod <- lm(mpg ~ displacement,data=Auto[x$in_id,])
pred <- predict(mod,newdata=Auto[-x$in_id,])
truth <- Auto[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth,response = pred)
mae <- mlr3measures::mae(truth = truth,response = pred)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
metric_linear_d
## # A tibble: 5 x 2
##    rmse   mae
##   <dbl> <dbl>
## 1  4.40  3.29
## 2  5.83  4.45
## 3  4.46  3.48
## 4  4.51  3.55
## 5  3.72  2.80
# menghitung rata-rata untuk 5 folds
mean_metric_linear_d <- colMeans(metric_linear_d)
mean_metric_linear_d
##     rmse      mae 
## 4.583160 3.516231
Ld <- ggplot(Auto,aes(x=displacement, y=mpg)) +
  geom_point(alpha=0.3, color="purple") +
  stat_smooth(method = "lm", 
              formula = y~x,lty = 1,
              col = "black",se = F)+
  theme_bw()+
  ggtitle("Linear") +
  xlab("displacement") + 
  ylab("mpg")
Ld

Regresi Polinomial

# Cross Validation dengan 5 lipatan
set.seed(123)
cross_val_dis <- vfold_cv(Auto,v=5, strata = "mpg") #rsample

ordo <- 2:5
best_poly_dis <- map_dfr(ordo, function(i) { #tidiverse iterasi pada suatu fungsi (output data.frame) 
    Polinomial_dis <- map_dfr(cross_val_dis$splits, 
                                 function(x){
    
    mod <- lm(mpg ~ poly(displacement,ordo=i),data=Auto[x$in_id,]) #in_id adalah data training
    pred <- predict(mod,newdata=Auto[-x$in_id,]) #nilai prediksi untuk data testing
    truth <- Auto[-x$in_id,]$mpg
    rmse <- mlr3measures::rmse(truth = truth, response = pred)
    mae <- mlr3measures::mae(truth = truth, response = pred)
    metric <- c(rmse,mae)
    names(metric) <- c("rmse","mae")
    return(metric)
    }
    )

Polinomial_dis

# menghitung rata-rata untuk 5 folds
mean_Polinomial_dis <- colMeans(Polinomial_dis)
mean_Polinomial_dis
  
}
)

# menampilkan hasil all ordo
best_poly_dis <- cbind(ordo=ordo,best_poly_dis)
best_poly_dis
##   ordo     rmse      mae
## 1    2 4.304907 3.159841
## 2    3 4.312431 3.170241
## 3    4 4.319214 3.164415
## 4    5 4.339744 3.183127
# berdasarkan rmse
best_poly_dis %>% slice_min(rmse)
##   ordo     rmse      mae
## 1    2 4.304907 3.159841
# berdasarkan mae
best_poly_dis %>% slice_min(mae)
##   ordo     rmse      mae
## 1    2 4.304907 3.159841

Plot CV

best_poly_dis$ordo <- as.factor(best_poly_dis$ordo)
ggplot(data=best_poly_dis, aes(x=ordo, y=rmse, group=1)) +
  geom_line(linetype = "dashed")+
  geom_point(cex=3,col="red")+
  ggtitle("K-Fold CV") +
  xlab("Ordo") + 
  ylab("RMSE")+
  theme_bw()

Secara empiris, regresi polinomial dengan ordo 2 memiliki RMSE dan MAE terkecil.

Pd2 <- ggplot(Auto,aes(x=displacement, y=mpg)) + 
  geom_point(alpha=0.3, color="purple") +
  stat_smooth(method = "lm", 
              formula = y~poly(x,2,raw=T), 
              lty = 1, col = "green3",se = F)+
  theme_bw()+
  ggtitle("Polinomial Ordo 2") +
  xlab("displacement") + 
  ylab("mpg") 

Pd2

Fungsi Tangga (Piecewise Constant)

set.seed(123)
cross_val_ft_dis <- vfold_cv(Auto,v=5,strata = "mpg")

breaks <-2:8  # pembagian interval 2 sampai 8

best_tangga_ft_dis <- map_dfr(breaks, function(i){
  metric_tangga_ft_dis <- map_dfr(cross_val_ft_dis$splits, function(x){
    
    training <- Auto[x$in_id,]
    training$displacement <- cut(training$displacement,i) #Transformasi jadi factor
mod <- lm(mpg ~ displacement, data=training)

#Convert ke numeric lagi
labs_x <- levels(mod$model[,2])
    
    labs_x_breaks <- cbind(lower = as.numeric( sub("\\((.+),.*", "\\1", labs_x) ),
                  upper = as.numeric( sub("[^,]*,([^]]*)\\]", "\\1", labs_x) ))

#Testing
testing <- Auto[-x$in_id,]
    
    displacement_new <- cut(testing$displacement,c(labs_x_breaks[1,1],labs_x_breaks[,2]))
    
    pred <- predict(mod, newdata=list(displacement=displacement_new))
    truth <- testing$mpg
    
    data_eval <- na.omit(data.frame(truth,pred))
    
    rmse <- mlr3measures::rmse(truth = data_eval$truth, response = data_eval$pred)
    
    mae <- mlr3measures::mae(truth = data_eval$truth, response = data_eval$pred)
    
    metric <- c(rmse,mae)
    names(metric) <- c("rmse","mae")
    return(metric)
    }
    )

  metric_tangga_ft_dis
  
  # menghitung rata-rata untuk 5 folds
  mean_metric_tangga_ft_dis <- colMeans(metric_tangga_ft_dis)
  
  mean_metric_tangga_ft_dis
  
  })
best_tangga_ft_dis <- cbind(breaks=breaks,best_tangga_ft_dis)

# menampilkan hasil all breaks
best_tangga_ft_dis
##   breaks     rmse      mae
## 1      2 5.925600 4.629407
## 2      3 4.893745 3.675196
## 3      4 4.824438 3.623216
## 4      5 4.666040 3.487272
## 5      6 4.603742 3.404050
## 6      7 4.588973 3.382947
## 7      8 4.405202 3.250663
#berdasarkan rmse
best_tangga_ft_dis %>% slice_min(rmse)
##   breaks     rmse      mae
## 1      8 4.405202 3.250663
#berdasarkan mae
best_tangga_ft_dis %>% slice_min(mae)
##   breaks     rmse      mae
## 1      8 4.405202 3.250663

Berdasarkan RMSE dan MAE terkecil, banyaknya interval yang terbaik adalah 8.

Plot CV

T1 <- ggplot(data=best_tangga_ft_dis, aes(x=breaks, y=rmse, group=1)) +
  geom_line(linetype = "dashed")+
  geom_point(cex=3,col="red")+
  ggtitle("K-Fold CV") +
  xlab("Breaks") + 
  ylab("RMSE")+
  theme_bw()
T1

Td <- ggplot(Auto,aes(x=displacement, y=mpg)) +
  geom_point(alpha=0.3, color="purple") +
  stat_smooth(method = "lm", 
              formula = y~cut(x,8),lty = 1,
              col = "red",se = F)+
  theme_bw()+
  ggtitle("Fungsi Tangga 8 knots") +
  xlab("displacement") + 
  ylab("mpg") 
Td

Natural Cubic Spline

set.seed(123)
cross_val_d_ns <- vfold_cv(Auto,v=5,strata = "mpg")

df <- 2:15

best_nspline_d <- map_dfr(df, function(i){
  nspline_d <- map_dfr(cross_val_d_ns$splits, function(x){
    mod <- lm(mpg ~ ns(displacement,df=i),data=Auto[x$in_id,])
    pred <- predict(mod, newdata=Auto[-x$in_id,])
    truth <- Auto[-x$in_id,]$mpg
    rmse <- mlr3measures::rmse(truth = truth, response = pred)
    mae <- mlr3measures::mae(truth = truth, response = pred)
    metric <- c(rmse,mae)
    names(metric) <- c("rmse","mae")
    return(metric)
    }
    )
  nspline_d

# menghitung rata-rata untuk 5 folds
mean_nspline_d <- colMeans(nspline_d)
mean_nspline_d
}
)
best_nspline_d <- cbind(df=df,best_nspline_d)

# menampilkan hasil all breaks
best_nspline_d
##    df     rmse      mae
## 1   2 4.301604 3.163136
## 2   3 4.313449 3.172620
## 3   4 4.351866 3.186249
## 4   5 4.323274 3.204052
## 5   6 4.248683 3.147266
## 6   7 4.233862 3.143808
## 7   8 4.214122 3.153459
## 8   9 4.202560 3.144546
## 9  10 4.182256 3.137854
## 10 11 4.153097 3.099830
## 11 12 4.096341 3.044499
## 12 13 4.154786 3.102888
## 13 14 4.192029 3.128335
## 14 15 4.235035 3.154566
#berdasarkan rmse
best_nspline_d %>% slice_min(rmse)
##   df     rmse      mae
## 1 12 4.096341 3.044499
#berdasarkan mae
best_nspline_d %>% slice_min(mae)
##   df     rmse      mae
## 1 12 4.096341 3.044499

Plot CV

best_nspline_d$df <- as.factor(best_nspline_d$df)
ggplot(data=best_nspline_d, aes(x=df, y=rmse, group=1)) +
  geom_line(linetype = "dashed")+
  geom_point(cex=3,col="red")+
  ggtitle("K-Fold CV") +
  xlab("DF") + 
  ylab("RMSE")+
  theme_bw()

Berdasarkan hasil di atas, model natural cubic spline terbaik memiliki 12 knots.

Secara Visual

Knots

#nilai knots yang ditentukan oleh komputer
attr(ns(Auto$displacement, df=12),"knots")
## 8.333333% 16.66667%       25% 33.33333% 41.66667%       50% 58.33333% 66.66667% 
##   89.0000   97.0000  105.0000  119.0000  130.9167  151.0000  200.0000  232.0000 
##       75% 83.33333% 91.66667% 
##  275.7500  318.0000  351.0000
NSd <- ggplot(Auto,aes(x=displacement, y=mpg)) +
  geom_point(alpha=0.3, color="purple") +
stat_smooth(method = "lm", 
               formula = y~ns(x, df=12), 
               lty = 1,col = "blue", se=F)+
  theme_bw()+
  ggtitle("Natural Cubic Splines (df=12)") +
  xlab("displacement") + 
  ylab("mpg")+
  geom_vline(xintercept = c(89, 97, 105, 119, 130.9, 151, 200, 232, 275.75, 318, 351), col="grey50", lty=2)
NSd

Perbandingan Model

Secara Empiris berdasarkan nilai RMSE dan MAE

nilai_metric_d <- rbind(mean_metric_linear_d,
                        best_poly_dis %>% select(-1) %>% slice_min(rmse),
                        best_tangga_ft_dis %>% select(-1) %>% slice_min(rmse),
                        best_nspline_d %>% select(-1) %>% slice_min(rmse)
                  )

nama_model_regresi_d <- c("Linear",
                          "Polinomial ordo 2",
                          "Fungsi Tangga (breaks=8)",
                          "Natural Cubic Splines (df=12)"
                          )

komparasi <- data.frame(nama_model_regresi_d,nilai_metric_d)
komparasi %>%  datatable(komparasi, class = 'cell-border stripe')

Berdasarkan perbandingan secara empiris di atas, model terbaik yang dapat digunakan untuk memodelkan mpg vs displacement adalah Natural Cubic Spline dengan 12 Knot.

Secara Visual berdasarkan Plot

ggarrange(Ld, Pd2, Td, NSd, ncol = 2, nrow = 2)

plot_all_displacement<- ggplot(Auto,aes(x=displacement, y=mpg)) +
  geom_point(alpha=0.3, color="purple") +
  
  stat_smooth(method = "lm", 
              formula = y~x,
              lty = 1, col = "black",se = F)+
  
  stat_smooth(method = "lm", 
               formula = y~poly(x,2,raw=T), 
               lty = 1, col = "green",se = F)+
  
  stat_smooth(method = "lm", 
               formula = y~cut(x,8), 
               lty = 1, col = "red",se = F)+
  
  stat_smooth(method = "lm", 
               formula = y~ns(x, df=12), 
               lty = 1,col = "blue", se = F)+
  
  labs(x = "displacement" , y = "mpg") +
  ggtitle(expression(atop("Gabungan Model Terbaik", 
                          atop(italic("mpg vs displacement"))))) +
  theme(plot.title = element_text(hjust = 0.5, vjust = -2))

plot_all_displacement


Peubah mpg vs weight

Regresi Linear

set.seed(123)
cross_val_lin_w <- vfold_cv(Auto,v=5,strata = "mpg")
metric_linear_w <- map_dfr(cross_val_lin_w$splits,
    function(x){
mod <- lm(mpg ~ weight,data=Auto[x$in_id,])
pred <- predict(mod,newdata=Auto[-x$in_id,])
truth <- Auto[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth,response = pred)
mae <- mlr3measures::mae(truth = truth,response = pred)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
metric_linear_w
## # A tibble: 5 x 2
##    rmse   mae
##   <dbl> <dbl>
## 1  4.15  3.08
## 2  4.90  3.76
## 3  4.39  3.31
## 4  4.17  3.08
## 5  3.93  3.17
# menghitung rata-rata untuk 5 folds
mean_metric_linear_w <- colMeans(metric_linear_w)
mean_metric_linear_w
##     rmse      mae 
## 4.310464 3.280982
Lw <- ggplot(Auto,aes(x=weight, y=mpg)) +
  geom_point(alpha=1, color="pink") +
  stat_smooth(method = "lm", 
              formula = y~x,lty = 1,
              col = "black",se = F)+
  theme_bw()+
  ggtitle("Linear") +
  xlab("weight") + 
  ylab("mpg")
Lw

Regresi Polinomial

# Cross Validation dengan 5 lipatan
set.seed(123)
cross_val_w <- vfold_cv(Auto,v=5, strata = "mpg") #rsample

ordo <- 2:5
best_poly_w <- map_dfr(ordo, function(i) { #tidiverse iterasi pada suatu fungsi (output data.frame) 
    Polinomial_w <- map_dfr(cross_val_w$splits, 
                                 function(x){
    
    mod <- lm(mpg ~ poly(weight,ordo=i),data=Auto[x$in_id,]) #in_id adalah data training
    pred <- predict(mod,newdata=Auto[-x$in_id,]) #nilai prediksi untuk data testing
    truth <- Auto[-x$in_id,]$mpg
    rmse <- mlr3measures::rmse(truth = truth, response = pred)
    mae <- mlr3measures::mae(truth = truth, response = pred)
    metric <- c(rmse,mae)
    names(metric) <- c("rmse","mae")
    return(metric)
    }
    )

Polinomial_w

# menghitung rata-rata untuk 5 folds
mean_Polinomial_w <- colMeans(Polinomial_w)
mean_Polinomial_w
  
}
)

# menampilkan hasil all ordo
best_poly_w <- cbind(ordo=ordo,best_poly_w)
best_poly_w
##   ordo     rmse      mae
## 1    2 4.150345 3.054677
## 2    3 4.161762 3.060332
## 3    4 4.172705 3.065323
## 4    5 4.179309 3.077552
# berdasarkan rmse
best_poly_w %>% slice_min(rmse)
##   ordo     rmse      mae
## 1    2 4.150345 3.054677
# berdasarkan mae
best_poly_w %>% slice_min(mae)
##   ordo     rmse      mae
## 1    2 4.150345 3.054677

Plot CV

best_poly_w$ordo <- as.factor(best_poly_w$ordo)
ggplot(data=best_poly_w, aes(x=ordo, y=rmse, group=1)) +
  geom_line(linetype = "dashed")+
  geom_point(cex=3,col="red")+
  ggtitle("K-Fold CV") +
  xlab("Ordo") + 
  ylab("RMSE")+
  theme_bw()

Secara empiris, regresi polinomial dengan ordo 2 memiliki RMSE dan MAE terkecil.

Pw2 <- ggplot(Auto,aes(x=weight, y=mpg)) + 
  geom_point(alpha=1, color="pink") +
  stat_smooth(method = "lm", 
              formula = y~poly(x,2,raw=T), 
              lty = 1, col = "green3",se = F)+
  theme_bw()+
  ggtitle("Polinomial Ordo 2") +
  xlab("weight") + 
  ylab("mpg") 

Pw2

Fungsi Tangga (Piecewise Constant)

set.seed(123)
cross_val_ft_w <- vfold_cv(Auto,v=5,strata = "mpg")

breaks <-2:8  # pembagian interval 2 sampai 8

best_tangga_ft_w <- map_dfr(breaks, function(i){
  metric_tangga_ft_w <- map_dfr(cross_val_ft_w$splits, function(x){
    
    training <- Auto[x$in_id,]
    training$weight <- cut(training$weight,i) #Transformasi jadi factor
mod <- lm(mpg ~ weight, data=training)

#Convert ke numeric lagi
labs_x <- levels(mod$model[,2])
    
    labs_x_breaks <- cbind(lower = as.numeric( sub("\\((.+),.*", "\\1", labs_x) ),
                  upper = as.numeric( sub("[^,]*,([^]]*)\\]", "\\1", labs_x) ))

#Testing
testing <- Auto[-x$in_id,]
    
    weight_new <- cut(testing$weight,c(labs_x_breaks[1,1],labs_x_breaks[,2]))
    
    pred <- predict(mod, newdata=list(weight=weight_new))
    truth <- testing$mpg
    
    data_eval <- na.omit(data.frame(truth,pred))
    
    rmse <- mlr3measures::rmse(truth = data_eval$truth, response = data_eval$pred)
    
    mae <- mlr3measures::mae(truth = data_eval$truth, response = data_eval$pred)
    
    metric <- c(rmse,mae)
    names(metric) <- c("rmse","mae")
    return(metric)
    }
    )

  metric_tangga_ft_w
  
  # menghitung rata-rata untuk 5 folds
  mean_metric_tangga_ft_w <- colMeans(metric_tangga_ft_w)
  
  mean_metric_tangga_ft_w
  
  })
best_tangga_ft_w <- cbind(breaks=breaks,best_tangga_ft_w)

# menampilkan hasil all breaks
best_tangga_ft_w
##   breaks     rmse      mae
## 1      2 5.606147 4.344818
## 2      3 4.881005 3.658741
## 3      4 4.717345 3.647426
## 4      5 4.478443 3.263110
## 5      6 4.248970 3.132424
## 6      7 4.293833 3.176184
## 7      8 4.326547 3.228837
#berdasarkan rmse
best_tangga_ft_w %>% slice_min(rmse)
##   breaks    rmse      mae
## 1      6 4.24897 3.132424
#berdasarkan mae
best_tangga_ft_w %>% slice_min(mae)
##   breaks    rmse      mae
## 1      6 4.24897 3.132424

Berdasarkan RMSE dan MAE terkecil, banyaknya interval yang terbaik adalah 6.

Plot CV

T1 <- ggplot(data=best_tangga_ft_w, aes(x=breaks, y=rmse, group=1)) +
  geom_line(linetype = "dashed")+
  geom_point(cex=3,col="red")+
  ggtitle("K-Fold CV") +
  xlab("Breaks") + 
  ylab("RMSE")+
  theme_bw()
T1

Tw <- ggplot(Auto,aes(x=weight, y=mpg)) +
  geom_point(alpha=1, color="pink") +
  stat_smooth(method = "lm", 
              formula = y~cut(x,6),lty = 1,
              col = "red",se = F)+
  theme_bw()+
  ggtitle("Fungsi Tangga 6 knots") +
  xlab("weight") + 
  ylab("mpg") 
Tw

Natural Cubic Spline

set.seed(123)
cross_val_w_ns <- vfold_cv(Auto,v=5,strata = "mpg")

df <- 2:15

best_nspline_w <- map_dfr(df, function(i){
  nspline_w <- map_dfr(cross_val_w_ns$splits, function(x){
    mod <- lm(mpg ~ ns(weight,df=i),data=Auto[x$in_id,])
    pred <- predict(mod, newdata=Auto[-x$in_id,])
    truth <- Auto[-x$in_id,]$mpg
    rmse <- mlr3measures::rmse(truth = truth, response = pred)
    mae <- mlr3measures::mae(truth = truth, response = pred)
    metric <- c(rmse,mae)
    names(metric) <- c("rmse","mae")
    return(metric)
    }
    )
  nspline_w

# menghitung rata-rata untuk 5 folds
mean_nspline_w <- colMeans(nspline_w)
mean_nspline_w
}
)
best_nspline_w <- cbind(df=df,best_nspline_w)

# menampilkan hasil all breaks
best_nspline_w
##    df     rmse      mae
## 1   2 4.149920 3.054651
## 2   3 4.172693 3.062742
## 3   4 4.179401 3.069074
## 4   5 4.192673 3.087995
## 5   6 4.199590 3.101134
## 6   7 4.158297 3.048696
## 7   8 4.189938 3.088689
## 8   9 4.206900 3.095412
## 9  10 4.220522 3.115617
## 10 11 4.210792 3.109537
## 11 12 4.204752 3.108552
## 12 13 4.229605 3.130726
## 13 14 4.244783 3.152284
## 14 15 4.237106 3.145465
#berdasarkan rmse
best_nspline_w %>% slice_min(rmse)
##   df    rmse      mae
## 1  2 4.14992 3.054651
#berdasarkan mae
best_nspline_w %>% slice_min(mae)
##   df     rmse      mae
## 1  7 4.158297 3.048696

Berdasarkan RMSE terkecil banyaknya knot yang terbaik adalah 2 sedangkan berdasarkan MAE terkecil banyaknya knot terbaik adalah 7.

Plot CV

w1 <- ggplot(data=best_nspline_w, aes(x=df, y=rmse, group=1)) +
  geom_line(linetype = "dashed")+
  geom_point(cex=3,col="red")+
  ggtitle("K-Fold CV") +
  xlab("DF") + 
  ylab("RMSE")+
  theme_bw()

w2 <- ggplot(data=best_nspline_w, aes(x=df, y=mae, group=1)) +
  geom_line(linetype = "dashed")+
  geom_point(cex=3,col="red")+
  ggtitle("K-Fold CV") +
  xlab("DF") + 
  ylab("MAE")+
  theme_bw()

ggarrange(w1, w2, ncol =2)

Secara Visual

Knots

#nilai knots yang ditentukan oleh komputer
attr(ns(Auto$weight, df=2),"knots")
##    50% 
## 2803.5
attr(ns(Auto$weight, df=7),"knots")
## 14.28571% 28.57143% 42.85714% 57.14286% 71.42857% 85.71429% 
##  2074.857  2285.429  2635.000  2986.571  3446.143  4096.286
NSw2 <- ggplot(Auto,aes(x=weight, y=mpg)) +
  geom_point(alpha=1, color="pink") +
stat_smooth(method = "lm", 
               formula = y~ns(x, df=2), 
               lty = 1,col = "blue", se=F)+
  theme_bw()+
  ggtitle("Natural Cubic Splines (df=2)") +
  xlab("weight") + 
  ylab("mpg")+
  geom_vline(xintercept = c(2803.5), col="grey50", lty=2)
NSw2

NSw7 <- ggplot(Auto,aes(x=weight, y=mpg)) +
  geom_point(alpha=1, color="pink") +
stat_smooth(method = "lm", 
               formula = y~ns(x, df=7), 
               lty = 1,col = "blue", se=F)+
  theme_bw()+
  ggtitle("Natural Cubic Splines (df=7)") +
  xlab("weight") + 
  ylab("mpg")+
  geom_vline(xintercept = c(2074.857, 2285.429,  2635.000,  2986.571,  3446.143,  4096.286 ), col="grey50", lty=2)
NSw7

Berdasarkan hasil di atas, model natural cubic spline terbaik memiliki 2 knots.

Perbandingan Model

Secara Empiris berdasarkan nilai RMSE dan MAE

nilai_metric_w <- rbind(mean_metric_linear_w,
                        best_poly_w %>% select(-1) %>% slice_min(rmse),
                        best_tangga_ft_w %>% select(-1) %>% slice_min(rmse),
                        best_nspline_w %>% select(-1) %>% slice_min(rmse)
                        )

nama_model_regresi_w <- c("Linear",
                          "Polinomial ordo 2",
                          "Fungsi Tangga (breaks=6)",
                          "Natural Cubic Splines (df=2)"
                          )

komparasi <- data.frame(nama_model_regresi_w,nilai_metric_w)
komparasi %>%  datatable(komparasi, class = 'cell-border stripe')

Berdasarkan perbandingan secara empiris di atas, model terbaik yang dapat digunakan untuk memodelkan mpg vs weight adalah Natural Cubic Spline dengan 2 Knot.

Secara Visual berdasarkan Plot

ggarrange(Lw, Pw2, Tw, NSw2, ncol = 2, nrow = 2)

plot_all_weight<- ggplot(Auto,aes(x=weight, y=mpg)) +
  geom_point(alpha=1, color="pink") +
  
  stat_smooth(method = "lm", 
              formula = y~x,
              lty = 1, col = "black",se = F)+
  
  stat_smooth(method = "lm", 
               formula = y~poly(x,2,raw=T), 
               lty = 1, col = "green",se = F)+
  
  stat_smooth(method = "lm", 
               formula = y~cut(x,8), 
               lty = 1, col = "red",se = F)+
  
  stat_smooth(method = "lm", 
               formula = y~ns(x, df=12), 
               lty = 1,col = "blue", se = F)+
  
  labs(x = "weight" , y = "mpg") +
  ggtitle(expression(atop("Gabungan Model Terbaik", 
                          atop(italic("mpg vs weight"))))) +
  theme(plot.title = element_text(hjust = 0.5, vjust = -2))

plot_all_weight


Kesimpulan

Plot model terbaik untuk masing-masing variabel prediktor:

ggarrange(NSh, NSd, NSw2, ncol = 2, nrow = 2)

Model regresi non linear terbaik untuk masing-masing pasangan peubah:

  • Variabel respon mpg dan variabel prediktor horsepower : Regresi Natural Cubic Spline (df = 10)
  • Variabel respon mpg dan variabel prediktor displacement : Regresi Natural Cubic Spline (df = 12)
  • Variabel respon mpg dan variabel prediktor weight : Regresi Natural Cubic Spline (df = 2)

Terima kasih.