Regresi Non Linier

Annebel D Clarissa

4/20/2021

Package yang digunakan

library(ISLR)         # Data Auto
library(ggplot2)      # Visualisasi data
library(DT)           # Tabel
library(rsample)      # cross validation
library(tidyverse)    # fs mapdfr
library(mlr3measures) # perhitungan rmse dan mae
library(splines)      # spline
library(gridExtra)    # Grafik
library(grid)         # Grafik

Data

Data yang digunakan adalah data Auto pada package ISLR. Data Auto adalah sebuah data frame dengan 392 observasi dan 9 variabel. Berikut adalah data Auto :

Selanjutnya, hanya akan digunakan 2 variabel yaitu mpg (miles per gallon) dan horse power untuk melihat hubungan dari kedua variabel tersebut.

Visualisasi Data

plot(horsepower, mpg, pch=19, col="coral",
     xlab="Horse Power", ylab="Miles per Gallon")

Terlihat data Auto tidak mengikuti pola linier. Sehingga jika dipaksakan menggunakan model linier akan menimbulkan beberapa masalah, terutama dalam hal prediksi.

Data Auto dengan Model Linier

ggplot(Auto,aes(x=horsepower, y=mpg)) +
                 geom_point(alpha=0.55, color="coral") +
   stat_smooth(method = "lm", 
               formula = y~x,lty = 1,
               col = "red",se = F)+
  theme_bw()


Sehingga untuk mengatasi data non linier dapat digunakan beberapa model seperti :

  • Regresi Polinomial
  • Piecewise Constant (Fungsi Tangga)
  • Natural Cubic Splines
  • dll

Untuk masing-masing model akan dicari parameter yang optimum dengan cross validation

Regresi Polinomial

Akan dicari pangkat yang optimal untuk regresi polinomial dengan Cross Validation

set.seed(123)
cross_val <- vfold_cv(Auto,v=10,strata = "mpg")

order <- 2:4

best_poly <- map_dfr(order, function(i){
metric_poly <- map_dfr(cross_val$splits,
    function(x){
mod <- lm(mpg ~ poly(horsepower,i),
         data=Auto[x$in_id,])
pred <- predict(mod,
               newdata=Auto[-x$in_id,])
truth <- Auto[-x$in_id,]$mpg

rmse <- mlr3measures::rmse(truth = truth,
                           response = pred
                           )
mae <- mlr3measures::mae(truth = truth,
                           response = pred
                           )
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)

metric_poly

# menghitung rata-rata untuk 10 folds
mean_metric_poly <- colMeans(metric_poly)

mean_metric_poly
}
)

best_poly <- cbind(order,best_poly)
# menampilkan hasil all breaks
datatable(best_poly)
#berdasarkan rmse
datatable(best_poly %>% slice_min(rmse))

Piecewise Constant

set.seed(123)

cross_val <- vfold_cv(Auto,v=10,strata = "mpg")

breaks <- 2:20

best_tangga <- map_dfr(breaks, function(i){

metric_tangga <- map_dfr(cross_val$splits,
    function(x){

training <- Auto[x$in_id,]
training$horsepower <- cut(training$horsepower,i)

mod <- lm(mpg ~ horsepower,
         data=training)

labs_x <- levels(mod$model[,2])

labs_x_breaks <- cbind(lower = as.numeric( sub("\\((.+),.*", "\\1", labs_x) ),
                  upper = as.numeric( sub("[^,]*,([^]]*)\\]", "\\1", labs_x) ))


testing <- Auto[-x$in_id,]

horsepower_new <- cut(testing$horsepower,c(labs_x_breaks[1,1],labs_x_breaks[,2]))

pred <- predict(mod,newdata=list(horsepower=horsepower_new))
truth <- testing$mpg

data_eval <- na.omit(data.frame(truth,pred))

rmse <- mlr3measures::rmse(truth = data_eval$truth,
                           response = data_eval$pred
                           )
mae <- mlr3measures::mae(truth = data_eval$truth,
                           response = data_eval$pred
                           )


metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)

metric_tangga



# menghitung rata-rata untuk 10 folds
mean_metric_tangga <- colMeans(metric_tangga)

mean_metric_tangga
})
best_tangga <- cbind(breaks=breaks,best_tangga)
# menampilkan hasil all breaks
datatable(best_tangga)
#berdasarkan rmse
datatable(best_tangga %>% slice_min(rmse))

Natural Cubic Splines

set.seed(123)
cross_val <- vfold_cv(Auto,v=10,strata = "mpg")

df <- 2:20

best_spline3 <- map_dfr(df, function(i){
metric_spline3 <- map_dfr(cross_val$splits,
    function(x){
mod <- lm(mpg ~ ns(horsepower,df=i),
         data=Auto[x$in_id,])
pred <- predict(mod,
               newdata=Auto[-x$in_id,])
truth <- Auto[-x$in_id,]$mpg

rmse <- mlr3measures::rmse(truth = truth,
                           response = pred
                           )
mae <- mlr3measures::mae(truth = truth,
                           response = pred
                           )
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)

metric_spline3

# menghitung rata-rata untuk 10 folds
mean_metric_spline3 <- colMeans(metric_spline3)

mean_metric_spline3
}
)

best_spline3 <- cbind(df=df,best_spline3)
# menampilkan hasil all breaks
datatable(best_spline3)
#berdasarkan rmse
datatable(best_spline3 %>% slice_min(rmse))

Model Terbaik

Dari ketiga model yang sudah ditentukan parameter optimalnya, akan ditentukan mana yang terbaik bagi data Auto yang berdasarkan pada nilai RMSE. Model terbaik adalah model dengan RMSE terendah.

nilai_metric <- rbind(best_poly %>% select(-1) %>% slice_min(rmse),
                      best_tangga %>% select(-1) %>% slice_min(rmse),
                      best_spline3 %>%  select(-1)%>% slice_min(rmse)
                      )

nama_model <- c(" Reg Polinomial (degree=2) ",
"Piecewise Constant (knot=15)",
                "Natural Cubic Splines (df=10)"
)

best_model <- data.frame(nama_model,nilai_metric)

datatable(best_model)

Sehingga, model yang memiliki RMSE terendah adalah Model Natural Cubic Splines dengan df=10.

#berdasarkan rmse
datatable(best_model %>% slice_min(rmse))

Model Final - Data Auto

Mencari letak knot dengan df=10

attr(ns(Auto$horsepower,df=10), "knots")
##   10%   20%   30%   40%   50%   60%   70%   80%   90% 
##  67.0  72.0  80.0  88.0  93.5 100.0 110.0 140.0 157.7
ggplot(Auto,aes(x=horsepower, y=mpg)) +
                 geom_point(alpha=0.55, color="coral") +
  stat_smooth(method = "lm", 
               formula = y~ns(x, df=10), 
               lty = 1,se = F) +
  ggtitle("Natural Cubic Splines (df=10) - Data Auto")

regspline = lm(mpg ~ ns(horsepower, knots=c(67,72,80,88,93.5,100,110,140,157.7)))
prediksi = predict(regspline , data.frame(horsepower), interval="predict" )
plot(horsepower, mpg, pch=19, col="coral",
     xlab="Horse Power", ylab="mile per gallon",
     main="Natural Cubic Splines dengan df=10 - Data Auto")
ix <- sort(horsepower,index.return=T)$ix
lines(horsepower[ix], prediksi[ix], 
      main = "Polynomial Regression", col="blue",
      type="l", lwd=2)
abline(v=c(67,72,80,88,93.5,100,110,140,157.7), col="grey50", lty=2)

Subsetting Data

Data Auto akan dibagi berdasarkan asal negara produsen yang ada pada kolom origin. Dengan pembagian :

Kode 1 : Amerika Kode 2 : Eropa Kode 3 : Jepang

Berikut adalah pembagian masing-masing subset data :

dataAmerika <- Auto %>% filter(origin ==1)
dataEropa   <- Auto %>% filter(origin ==2)
dataJepang  <- Auto %>% filter(origin ==3)

Untuk masing-masing subset data akan dipilih model terbaik diantara Model regresi polinomial, Piecewise Constant, dan Natural Cubic Spline dengan parameter optimumnya.

Data Amerika

Regresi Polinomial

set.seed(123)
cross_val <- vfold_cv(dataAmerika,v=10,strata = "mpg")

order <- 2:4

best_poly_amerika <- map_dfr(order, function(i){
metric_poly_amerika <- map_dfr(cross_val$splits,
    function(x){
mod <- lm(mpg ~ poly(horsepower,i),
         data=dataAmerika[x$in_id,])
pred <- predict(mod,
               newdata=dataAmerika[-x$in_id,])
truth <- dataAmerika[-x$in_id,]$mpg

rmse <- mlr3measures::rmse(truth = truth,
                           response = pred
                           )
mae <- mlr3measures::mae(truth = truth,
                           response = pred
                           )
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)

metric_poly_amerika

# menghitung rata-rata untuk 10 folds
mean_metric_poly_amerika <- colMeans(metric_poly_amerika)

mean_metric_poly_amerika
}
)

best_poly_amerika <- cbind(order,best_poly_amerika)
# menampilkan hasil all breaks

#berdasarkan rmse
datatable(best_poly_amerika %>% slice_min(rmse))

Piecewise Constant

set.seed(123)

cross_val <- vfold_cv(dataAmerika,v=10,strata = "mpg")

breaks <- 2:20

best_tangga_amerika <- map_dfr(breaks, function(i){

metric_tangga_amerika <- map_dfr(cross_val$splits,
    function(x){

training <- dataAmerika[x$in_id,]
training$horsepower <- cut(training$horsepower,i)

mod <- lm(mpg ~ horsepower,
         data=training)

labs_x <- levels(mod$model[,2])

labs_x_breaks <- cbind(lower = as.numeric( sub("\\((.+),.*", "\\1", labs_x) ),
                  upper = as.numeric( sub("[^,]*,([^]]*)\\]", "\\1", labs_x) ))


testing <- dataAmerika[-x$in_id,]

horsepower_new <- cut(testing$horsepower,c(labs_x_breaks[1,1],labs_x_breaks[,2]))

pred <- predict(mod,newdata=list(horsepower=horsepower_new))
truth <- testing$mpg

data_eval <- na.omit(data.frame(truth,pred))

rmse <- mlr3measures::rmse(truth = data_eval$truth,
                           response = data_eval$pred
                           )
mae <- mlr3measures::mae(truth = data_eval$truth,
                           response = data_eval$pred
                           )


metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)

metric_tangga_amerika

# menghitung rata-rata untuk 10 folds
mean_metric_tangga_amerika <- colMeans(metric_tangga_amerika)

mean_metric_tangga_amerika
})

best_tangga_amerika <- cbind(breaks=breaks,best_tangga_amerika)
# menampilkan hasil all breaks

#berdasarkan rmse
datatable(best_tangga_amerika %>% slice_min(rmse))

Natural Cubic Spline

set.seed(123)
cross_val <- vfold_cv(dataAmerika,v=10,strata = "mpg")

df <- 2:10

best_spline_amerika <- map_dfr(df, function(i){
metric_spline_amerika <- map_dfr(cross_val$splits,
    function(x){
mod <- lm(mpg ~ ns(horsepower,df=i),
         data=dataAmerika[x$in_id,])
pred <- predict(mod,
               newdata=dataAmerika[-x$in_id,])
truth <- dataAmerika[-x$in_id,]$mpg

rmse <- mlr3measures::rmse(truth = truth,
                           response = pred
                           )
mae <- mlr3measures::mae(truth = truth,
                           response = pred
                           )
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)

metric_spline_amerika

# menghitung rata-rata untuk 10 folds
mean_metric_spline_amerika <- colMeans(metric_spline_amerika)

mean_metric_spline_amerika
}
)

best_spline_amerika <- cbind(df=df,best_spline_amerika)
# menampilkan hasil all breaks

#berdasarkan rmse
datatable(best_spline_amerika %>% slice_min(rmse))

Model Terbaik - Data Amerika

Berikut adalah model-model terbaik dari hasil CV untuk data Amerika

ggplot(dataAmerika,aes(x=horsepower, y=mpg)) +
  
  geom_point(alpha=0.55, color="coral") +
  
  stat_smooth(aes(colour="Polinomial d=2"),method = "lm", 
               formula = y~poly(x,2,raw=T), 
               lty = 1,se = F)+
  
  stat_smooth(aes(colour="Fungsi Tangga k=11"),method = "lm", 
               formula = y~cut(x,11), 
               lty = 1,se = F)+
  
  stat_smooth(aes(colour="Natural Cubic Spline df=7"),method = "lm", 
               formula = y~ns(x, df=7), 
               lty = 1,se = F) +
  
  
  theme_bw() +
  labs(title="Perbandingan Model Terbaik - Data Amerika") +
  ylab("Miles per Gallon") +
  xlab("Horsepower") 

nilai_metric_amerika <- rbind(best_poly_amerika %>% select(-1) %>% slice_min(rmse),
                      best_tangga_amerika %>% select(-1) %>% slice_min(rmse),
                      best_spline_amerika %>%  select(-1)%>% slice_min(rmse)
                      )

nama_model <- c("Reg Polinomial(degree=2)",
"Piecewise Constant (knot=11)",
                "Natural Cubic Splines(df=7)"
)

best_model_amerika <- data.frame(nama_model,nilai_metric_amerika)

datatable(best_model_amerika)
#berdasarkan rmse
datatable(best_model_amerika %>% slice_min(rmse))

Dari output diatas dapat diambil kesimpulan bahwa :

  • Jika menggunakan model polinomial, degree optimal ada pada degree=2

  • Jika menggunakan model piecewise constant, knot optimal ada pada knot=11

  • Jika menggunakan model Natural Cubic Splines, df optimal ada pada df=7

Dan model terbaik dari ketiganya untuk data Amerika adalah dengan Model Piecewise Constant dengan knot 11

Model Final - Data Amerika

ggplot(dataAmerika,aes(x=horsepower, y=mpg)) +
                 geom_point(alpha=0.55, color="coral") +
  stat_smooth(method = "lm", 
               formula = y~cut(x,11), 
               lty = 1, col = "blue",se = F)+
  theme_bw() +
  ggtitle("Piecewise Constant (11 Knot) - Data Amerika") +
  ylab("Miles per Gallon") +
  xlab("Horsepower") + 
  theme(plot.title = element_text(hjust = 0.5))

Data Eropa

Regresi Polinomial

set.seed(123)
cross_val <- vfold_cv(dataEropa,v=5,strata = "mpg")
## Warning: The number of observations in each quantile is below the recommended
## threshold of 20. Stratification will be done with 3 breaks instead.
order <- 2:4

best_poly_eropa <- map_dfr(order, function(i){
metric_poly_eropa <- map_dfr(cross_val$splits,
    function(x){
mod <- lm(mpg ~ poly(horsepower,i),
         data=dataEropa[x$in_id,])
pred <- predict(mod,
               newdata=dataEropa[-x$in_id,])
truth <- dataEropa[-x$in_id,]$mpg

rmse <- mlr3measures::rmse(truth = truth,
                           response = pred
                           )
mae <- mlr3measures::mae(truth = truth,
                           response = pred
                           )
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)

metric_poly_eropa

# menghitung rata-rata untuk 10 folds
mean_metric_poly_eropa <- colMeans(metric_poly_eropa)

mean_metric_poly_eropa
}
)

best_poly_eropa <- cbind(order,best_poly_eropa)

#berdasarkan rmse
datatable(best_poly_eropa %>% slice_min(rmse))

Piecewise Constant

set.seed(123)

cross_val <- vfold_cv(dataEropa,v=5,strata = "mpg")
## Warning: The number of observations in each quantile is below the recommended
## threshold of 20. Stratification will be done with 3 breaks instead.
breaks <- 2:9
best_tangga_eropa <- map_dfr(breaks, function(i){

metric_tangga_eropa <- map_dfr(cross_val$splits,
    function(x){

training <- dataEropa[x$in_id,]
training$horsepower <- cut(training$horsepower,i)

mod <- lm(mpg ~ horsepower,
         data=training)

labs_x <- levels(mod$model[,2])

labs_x_breaks <- cbind(lower = as.numeric( sub("\\((.+),.*", "\\1", labs_x) ),
                  upper = as.numeric( sub("[^,]*,([^]]*)\\]", "\\1", labs_x) ))


testing <- dataEropa[-x$in_id,]

horsepower_new <- cut(testing$horsepower,c(labs_x_breaks[1,1],labs_x_breaks[,2]))

pred <- predict(mod,newdata=list(horsepower=horsepower_new))
truth <- testing$mpg

data_eval <- na.omit(data.frame(truth,pred))

rmse <- mlr3measures::rmse(truth = data_eval$truth,
                           response = data_eval$pred
                           )
mae <- mlr3measures::mae(truth = data_eval$truth,
                           response = data_eval$pred
                           )


metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)

metric_tangga_eropa

# menghitung rata-rata untuk 10 folds
mean_metric_tangga_eropa <- colMeans(metric_tangga_eropa)

mean_metric_tangga_eropa
})
best_tangga_eropa <- cbind(breaks=breaks,best_tangga_eropa)


#berdasarkan rmse
datatable(best_tangga_eropa %>% slice_min(rmse))

Natural Cubic Spline

set.seed(123)
cross_val <- vfold_cv(dataEropa,v=5,strata = "mpg")
## Warning: The number of observations in each quantile is below the recommended
## threshold of 20. Stratification will be done with 3 breaks instead.
df <- 2:10

best_spline_eropa <- map_dfr(df, function(i){
metric_spline_eropa <- map_dfr(cross_val$splits,
    function(x){
mod <- lm(mpg ~ ns(horsepower,df=i),
         data=dataEropa[x$in_id,])
pred <- predict(mod,
               newdata=dataEropa[-x$in_id,])
truth <- dataEropa[-x$in_id,]$mpg

rmse <- mlr3measures::rmse(truth = truth,
                           response = pred
                           )
mae <- mlr3measures::mae(truth = truth,
                           response = pred
                           )
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)

metric_spline_eropa

# menghitung rata-rata untuk 10 folds
mean_metric_spline_eropa <- colMeans(metric_spline_eropa)

mean_metric_spline_eropa
}
)

best_spline_eropa <- cbind(df=df,best_spline_eropa)

#berdasarkan rmse
datatable(best_spline_eropa %>% slice_min(rmse))

Model Terbaik - Data Eropa

Berikut adalah model-model terbaik dari hasil CV untuk data Eropa

ggplot(dataEropa,aes(x=horsepower, y=mpg)) +
  
  geom_point(alpha=0.55, color="coral") +
  
  stat_smooth(aes(colour="Polinomial d=2"),method = "lm", 
               formula = y~poly(x,2,raw=T), 
               lty = 1,se = F)+
  
  stat_smooth(aes(colour="Fungsi Tangga k=7"),method = "lm", 
               formula = y~cut(x,7), 
               lty = 1,se = F)+
  
  stat_smooth(aes(colour="Natural Cubic Spline df=2"),method = "lm", 
               formula = y~ns(x, df=2), 
               lty = 1,se = F) +
  
  
  theme_bw() +
  labs(title="Perbandingan Model Terbaik - Data Eropa") +
  ylab("Miles per Gallon") +
  xlab("Horsepower") 

nilai_metric_eropa <- rbind(best_poly_eropa %>% select(-1) %>% slice_min(rmse),
                      best_tangga_eropa %>% select(-1) %>% slice_min(rmse),
                      best_spline_eropa %>%  select(-1)%>% slice_min(rmse)
                      )

nama_model <- c("Reg Polinomial (degree=2)",
"Pieceswise Constant (knot=7)",
                "Natural Cubic Splines (df=2)"
)

best_model_eropa <- data.frame(nama_model,nilai_metric_eropa)

datatable(best_model_eropa)
#berdasarkan rmse
datatable(best_model_eropa %>% slice_min(rmse))

Dari output diatas dapat diambil kesimpulan bahwa :

  • Jika menggunakan model polinomial, degree optimal ada pada degree=2

  • Jika menggunakan model piecewise constant, knot optimal ada pada knot=7

  • Jika menggunakan model Natural Cubic Splines, df optimal ada pada df=1

Dan model terbaik dari ketiganya untuk data Eropa adalah dengan Model Natural Cubic Splines dengan df 1

Model Final - Data Eropa

attr(ns(dataEropa$horsepower,df=2), "knots")
##  50% 
## 76.5
ggplot(dataEropa,aes(x=horsepower, y=mpg)) +
                 geom_point(alpha=0.55, color="coral") +
  stat_smooth(method = "lm", 
               formula = y~ns(x, df=2), 
               lty = 1,se = F) +
  ggtitle("Natural Cubic Splines (df=2) - Data Eropa") +
  ylab("Miles per Gallon") +
  xlab("Horsepower") + 
  theme(plot.title = element_text(hjust = 0.5)) 

Data Jepang

Regresi Polinomial

set.seed(123)
cross_val <- vfold_cv(dataJepang,v=5,strata = "mpg")
## Warning: The number of observations in each quantile is below the recommended
## threshold of 20. Stratification will be done with 3 breaks instead.
order <- 2:4

best_poly_jepang <- map_dfr(order, function(i){
metric_poly_jepang <- map_dfr(cross_val$splits,
    function(x){
mod <- lm(mpg ~ poly(horsepower,i),
         data=dataJepang[x$in_id,])
pred <- predict(mod,
               newdata=dataJepang[-x$in_id,])
truth <- dataJepang[-x$in_id,]$mpg

rmse <- mlr3measures::rmse(truth = truth,
                           response = pred
                           )
mae <- mlr3measures::mae(truth = truth,
                           response = pred
                           )
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)

metric_poly_jepang

# menghitung rata-rata untuk 10 folds
mean_metric_poly_jepang <- colMeans(metric_poly_jepang)

mean_metric_poly_jepang
}
)

best_poly_jepang <- cbind(order,best_poly_jepang)

#berdasarkan rmse
datatable(best_poly_jepang %>% slice_min(rmse))

Piecewise Constant

set.seed(123)

cross_val <- vfold_cv(dataJepang,v=5,strata = "mpg")
## Warning: The number of observations in each quantile is below the recommended
## threshold of 20. Stratification will be done with 3 breaks instead.
breaks <- 2:5

best_tangga_jepang <- map_dfr(breaks, function(i){

metric_tangga_jepang <- map_dfr(cross_val$splits,
    function(x){

training <- dataJepang[x$in_id,]
training$horsepower <- cut(training$horsepower,i)

mod <- lm(mpg ~ horsepower,
         data=training)

labs_x <- levels(mod$model[,2])

labs_x_breaks <- cbind(lower = as.numeric( sub("\\((.+),.*", "\\1", labs_x) ),
                  upper = as.numeric( sub("[^,]*,([^]]*)\\]", "\\1", labs_x) ))


testing <- dataJepang[-x$in_id,]

horsepower_new <- cut(testing$horsepower,c(labs_x_breaks[1,1],labs_x_breaks[,2]))

pred <- predict(mod,newdata=list(horsepower=horsepower_new))
truth <- testing$mpg

data_eval <- na.omit(data.frame(truth,pred))

rmse <- mlr3measures::rmse(truth = data_eval$truth,
                           response = data_eval$pred
                           )
mae <- mlr3measures::mae(truth = data_eval$truth,
                           response = data_eval$pred
                           )


metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)

metric_tangga_jepang

# menghitung rata-rata untuk 10 folds
mean_metric_tangga_jepang <- colMeans(metric_tangga_jepang)

mean_metric_tangga_jepang
})

best_tangga_jepang <- cbind(breaks=breaks,best_tangga_jepang)


#berdasarkan rmse
datatable(best_tangga_jepang %>% slice_min(rmse))

Natural Cubic Spline

set.seed(123)
cross_val <- vfold_cv(dataJepang,v=5,strata = "mpg")
## Warning: The number of observations in each quantile is below the recommended
## threshold of 20. Stratification will be done with 3 breaks instead.
df <- 2:10

best_spline_jepang <- map_dfr(df, function(i){
metric_spline_jepang <- map_dfr(cross_val$splits,
    function(x){
mod <- lm(mpg ~ ns(horsepower,df=i),
         data=dataJepang[x$in_id,])
pred <- predict(mod,
               newdata=dataJepang[-x$in_id,])
truth <- dataJepang[-x$in_id,]$mpg

rmse <- mlr3measures::rmse(truth = truth,
                           response = pred
                           )
mae <- mlr3measures::mae(truth = truth,
                           response = pred
                           )
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)

metric_spline_jepang

# menghitung rata-rata untuk 10 folds
mean_metric_spline_jepang <- colMeans(metric_spline_jepang)

mean_metric_spline_jepang
}
)

best_spline_jepang <- cbind(df=df,best_spline_jepang)

#berdasarkan rmse
datatable(best_spline_jepang %>% slice_min(rmse))

Model Terbaik - Data Jepang

Berikut adalah model-model terbaik dari hasil CV untuk data Amerika

ggplot(dataJepang,aes(x=horsepower, y=mpg)) +
  
  geom_point(alpha=0.55, color="coral") +
  
  stat_smooth(aes(colour="Polinomial d=3"),method = "lm", 
               formula = y~poly(x,3,raw=T), 
               lty = 1,se = F)+
  
  stat_smooth(aes(colour="Fungsi Tangga k=4"),method = "lm", 
               formula = y~cut(x,4), 
               lty = 1,se = F)+
  
  stat_smooth(aes(colour="Natural Cubic Spline df=3"),method = "lm", 
               formula = y~ns(x, df=3), 
               lty = 1,se = F) +
  
  
  theme_bw() +
  labs(title="Perbandingan Model Terbaik - Jepang") +
  ylab("Miles per Gallon") +
  xlab("Horsepower") 

nilai_metric_jepang <- rbind(best_poly_jepang %>% select(-1) %>% slice_min(rmse),
                      best_tangga_jepang %>% select(-1) %>% slice_min(rmse),
                      best_spline_jepang %>%  select(-1)%>% slice_min(rmse)
                      )

nama_model <- c("Reg Polinomial (degree=3)",
"Piecewise Constant (knot=4)",
                "Natural Cubic Splines (df=3)"
)

best_model_jepang <- data.frame(nama_model,nilai_metric_jepang)

datatable(best_model_jepang)
#berdasarkan rmse
datatable(best_model_jepang %>% slice_min(rmse))

Dari output diatas dapat diambil kesimpulan bahwa :

  • Jika menggunakan model polinomial, degree optimal ada pada degree=3

  • Jika menggunakan model piecewise constant, knot optimal ada pada knot=4

  • Jika menggunakan model Natural Cubic Splines, df optimal ada pada df=3

Dan model terbaik dari ketiganya untuk data Jepang adalah dengan Model Polynomial Regression dengan degree 3

Model Final - Data Jepang

 ggplot(dataJepang,aes(x=horsepower, y=mpg)) + 
  geom_point(alpha=0.55, color="coral") +
  stat_smooth(method = "lm", 
               formula = y~poly(x,3,raw=T), 
               lty = 1, col = "red",se = F)+
  theme_bw() +
  ggtitle("Regresi Polynomial (degree=3) - Data Jepang") +
  ylab("Miles per Gallon") +
  xlab("Horsepower") + 
  theme(plot.title = element_text(hjust = 0.5))

Model Terbaik

Sehingga, jika di plot model terbaik untuk masing-masing data adalah :

#plot 1 : Auto
plot1 <- ggplot(Auto,aes(x=horsepower, y=mpg)) +
                 geom_point(alpha=0.55, color="coral") +
  stat_smooth(method = "lm", 
               formula = y~ns(x, df=10), 
               lty = 1,se = F) +
  ggtitle("Auto - Natural Cubic Splines (df=10)") +
  ylab("Miles per Gallon") +
  xlab("Horsepower") 


#plot 2 : Amerika
plot2 <- ggplot(dataAmerika,aes(x=horsepower, y=mpg)) +
                 geom_point(alpha=0.55, color="coral") +
  stat_smooth(method = "lm", 
               formula = y~cut(x,11), 
               lty = 1, col = "blue",se = F)+
  theme_bw() +
  ggtitle("Amerika - Piecewise Constant (K=11)") +
  ylab("Miles per Gallon") +
  xlab("Horsepower") 


#plot 3 : Eropa
plot3 <- ggplot(dataEropa,aes(x=horsepower, y=mpg)) +
                 geom_point(alpha=0.55, color="coral") +
  stat_smooth(method = "lm", 
               formula = y~ns(x, df=2), 
               lty = 1,se = F) +
  ggtitle("Eropa - Natural Cubic Splines (df=2)") +
  ylab("Miles per Gallon") +
  xlab("Horsepower") 


#plot 4 : Jepang
plot4 <- ggplot(dataJepang,aes(x=horsepower, y=mpg)) + 
  geom_point(alpha=0.55, color="coral") +
  stat_smooth(method = "lm", 
               formula = y~poly(x,3,raw=T), 
               lty = 1, col = "red",se = F)+
  theme_bw() +
  ggtitle("Jepang - Polynomial Regression (d=3)") +
  ylab("Miles per Gallon") +
  xlab("Horsepower") 

grid.arrange(plot1, plot2, plot3, plot4, nrow=2, ncol=2, top = textGrob("Model Terbaik",gp=gpar(fontsize=20,font=3)))