Tugas Praktikum STA 581 Pertemuan 8

Pada tugas ini dilakukan prediksi peubah respon mpg dari data Auto menggunakan model-model non-linear. Data Auto diambil dari package ISLR. Peubah prediktor akan dipilih dari tiga kolom lainnya pada data Auto tersebut. Model yang dibuat dibatasi hanya memiliki satu peubah prediktor, sehingga model terbaik yang dicari adalah untuk setiap pasang.

library(ISLR)
library(caret)
library(splines)
library(rsample)
library(purrr)
library(dplyr)
library(gridExtra)

Peubah prediktor yang akan digunakan untuk memprediksi peubah respon mpg adalah :

  • horsepower
  • displacement
  • weight

Eksplorasi Data

ggplot(Auto,aes(x=horsepower, y=mpg)) +
  geom_point(alpha=0.55, color="black") +
  geom_smooth(se = F) +
  ggtitle("MPG & Horsepower") +
  theme(plot.title = element_text(hjust = 0.5))
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

ggplot(Auto,aes(x=displacement, y=mpg)) +
  geom_point(alpha=0.55, color="black") +
  geom_smooth(se = F) +
  ggtitle("MPG & Displacement") +
  theme(plot.title = element_text(hjust = 0.5))
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

ggplot(Auto,aes(x=weight, y=mpg)) +
  geom_point(alpha=0.55, color="black") +
  geom_smooth(se = F) +
  ggtitle("MPG & Weight") +
  theme(plot.title = element_text(hjust = 0.5))
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

Dari hasil plot ketiga peubah prediktor dengan peubah respon mpg dapat dilihat bahwa masing-masing pasangan memiliki hubungan non-linear.

Model Terbaik

Pendugaan mpg akan dilakukan dengan model non liniear, pada kasus ini metode yang akan digunakan adalah :

  • Regresi Polinomial
  • Regresi Tangga
  • Natural Cubic Splines
  • LOESS (locally estimated scatterplot smoothing)

Model terbaik untuk setiap pasangan akan dievaluasi secara empiris dan visual. Evaluasi Secara empiris akan dilakukan dengan cara validasi silang untuk beberapa parameter pada setiap metode, kemudian dipilih model terbaik untuk setiap metode yaitu model yang memberikan nilai RMSE terkecil. Kemudian model terbaik setiap metode dibuat plotnya dan dievaluasi secara visual untuk menentukan mana model yang terbaik.

MPG & Horsepower

Regresi Polinomial

set.seed(123)
cross_val <- vfold_cv(Auto,v=10,strata = "mpg")

d_poly <- 1:10

best_poly <- map_dfr(d_poly, function(i){
    metric_poly <- map_dfr(cross_val$splits,function(x){
        mod <- lm(mpg ~ poly(horsepower,i,raw = T),data=Auto[x$in_id,])
        pred <- predict(mod,newdata=Auto[-x$in_id,])
        truth <- Auto[-x$in_id,]$mpg
        rmse <- mlr3measures::rmse(truth = truth,response = pred)
        mae <- mlr3measures::mae(truth = truth,response = pred)
        metric <- c(rmse,mae)
        names(metric) <- c("rmse","mae")
        return(metric)
      }
    )
    metric_poly
    # menghitung rata-rata untuk 10 folds
    mean_metric_poly <- colMeans(metric_poly)
    mean_metric_poly
  }
)
best_poly_fin <- cbind(d=d_poly,best_poly)
rp <- best_poly_fin %>% slice_min(rmse)
best_poly_fin$lab <- paste(best_poly_fin$d,best_poly_fin$rmse, sep="\n")

ggplot(best_poly_fin,aes(x=d, y=rmse)) +
  geom_point(color="black", size=2) +
  geom_point(data = best_poly_fin[which.min(best_poly_fin$rmse), ], color="red",size=2) +
  geom_text(data = best_poly_fin[which.min(best_poly_fin$rmse), ], aes(d+0.25,rmse,label=lab),size=3) +
  ggtitle("Hasil Validasi Silang Regresi Polinomial") +
  xlab("d") + 
  ylab("RMSE") +
  theme(plot.title = element_text(hjust = 0.5))

Dari hasil validasi silang dapat disimpulkan untuk model terbaik dengan regresi polinomial yaitu dengan derajat 7, karena memiliki RMSE yang paling kecil.

Regresi Fungsi Tangga (Piecewise Constant)

set.seed(123)
cross_val <- vfold_cv(Auto,v=10,strata = "mpg")

breaks <- 2:21

best_tangga <- map_dfr(breaks, function(i){
    metric_tangga <- map_dfr(cross_val$splits,function(x){
      training <- Auto[x$in_id,]
      training$horsepower <- cut(training$horsepower,i)
      mod <- lm(mpg ~ horsepower,data=training)
      labs_x <- levels(mod$model[,2])
      labs_x_breaks <- cbind(lower = as.numeric( sub("\\((.+),.*", "\\1", labs_x) ),
                      upper = as.numeric( sub("[^,]*,([^]]*)\\]", "\\1", labs_x) ))
      testing <- Auto[-x$in_id,]
      horsepower_new <- cut(testing$horsepower,c(labs_x_breaks[1,1],labs_x_breaks[,2]))
      pred <- predict(mod,newdata=list(horsepower=horsepower_new))
      truth <- testing$mpg
      data_eval <- na.omit(data.frame(truth,pred))
      rmse <- mlr3measures::rmse(truth = data_eval$truth,response = data_eval$pred)
      mae <- mlr3measures::mae(truth = data_eval$truth,response = data_eval$pred)
      metric <- c(rmse,mae)
      names(metric) <- c("rmse","mae")
      return(metric)
    }
  )

metric_tangga
# menghitung rata-rata untuk 10 folds
mean_metric_tangga <- colMeans(metric_tangga)

mean_metric_tangga
})
best_tangga <- cbind(knots=breaks-1,best_tangga)
pc <- best_tangga %>% slice_min(rmse)
best_tangga$lab <- paste(best_tangga$knots,best_tangga$rmse, sep="\n")

ggplot(best_tangga,aes(x=knots, y=rmse)) +
  geom_point(color="black", size=2) +
  geom_point(data = best_tangga[which.min(best_tangga$rmse), ], color="red",size=2) +
  geom_text(data = best_tangga[which.min(best_tangga$rmse), ], aes(knots+0.25,rmse,label=lab),size=3) +
  ggtitle("Hasil Validasi Silang Piecewise Constant") +
  xlab("Knots") + 
  ylab("RMSE") +
  theme(plot.title = element_text(hjust = 0.5))

Dari hasil validasi silang dapat disimpulkan model terbaik untuk Regresi Tangga yaitu dengan jumlah knots sebanyak 14, karena memiliki RMSE yang paling kecil.

Natural Cubic Splines

set.seed(123)
cross_val <- vfold_cv(Auto,v=10,strata = "mpg")

df <- 1:20

best_spline3 <- map_dfr(df, function(i){
    metric_spline3 <- map_dfr(cross_val$splits,function(x){
        mod <- lm(mpg ~ ns(horsepower,df=i),data=Auto[x$in_id,])
        pred <- predict(mod,newdata=Auto[-x$in_id,])
        truth <- Auto[-x$in_id,]$mpg
        rmse <- mlr3measures::rmse(truth = truth,response = pred)
        mae <- mlr3measures::mae(truth = truth,response = pred)
        metric <- c(rmse,mae)
        names(metric) <- c("rmse","mae")
        return(metric)
      }
    )
    metric_spline3
    # menghitung rata-rata untuk 10 folds
    mean_metric_spline3 <- colMeans(metric_spline3)
    #mean_metric_spline3
  }
)

best_spline3 <- cbind(df=df,best_spline3)
ncs <- best_spline3 %>% slice_min(rmse)
best_spline3$lab <- paste(best_spline3$df,best_spline3$rmse, sep="\n")

ggplot(best_spline3,aes(x=df, y=rmse)) +
  geom_point(color="black", size=2) +
  geom_point(data = best_spline3[which.min(best_spline3$rmse), ], color="red",size=2) +
  geom_text(data = best_spline3[which.min(best_spline3$rmse), ], aes(df+0.25,rmse,label=lab),size=3) +
  ggtitle("Hasil Validasi Silang Natural Cubic Splines") +
  xlab("Degrees of Freedom") + 
  ylab("RMSE") +
  theme(plot.title = element_text(hjust = 0.5))

Dari hasil validasi silang dapat disimpulkan model terbaik untuk Natural Cubic Splines yaitu dengan derajat bebas sebanyak 10, karena memiliki RMSE yang paling kecil.

LOESS

set.seed(123)
cross_val <- vfold_cv(Auto,v=10,strata = "mpg")

span <- seq(0.1,1,length.out=50)

best_loess <- map_dfr(span, function(i){
metric_loess <- map_dfr(cross_val$splits,
    function(x){
mod <- loess(mpg ~ horsepower,span = i,
         data=Auto[x$in_id,])
pred <- predict(mod,
               newdata=Auto[-x$in_id,])
truth <- Auto[-x$in_id,]$mpg

data_eval <- na.omit(data.frame(pred=pred,
                                truth=truth))


rmse <- mlr3measures::rmse(truth = data_eval$truth,
                           response = data_eval$pred
                           )
mae <- mlr3measures::mae(truth = data_eval$truth,
                           response = data_eval$pred
                           )
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)

head(metric_loess,20)

# menghitung rata-rata untuk 10 folds
mean_metric_loess <- colMeans(metric_loess)

mean_metric_loess
}
)

best_loess <- cbind(span=span,best_loess)
loess1 <- best_loess %>% slice_min(rmse)
# menampilkan hasil all breaks
#best_loess
best_loess$lab <- paste(best_loess$span,best_loess$rmse, sep="\n")

ggplot(best_loess,aes(x=span, y=rmse)) +
  geom_point(color="black", size=2) +
  geom_point(data = best_loess[which.min(best_loess$rmse), ], color="red",size=2) +
  geom_text(data = best_loess[which.min(best_loess$rmse), ], aes(span+0.1,rmse,label=lab),size=3) +
  ggtitle("Hasil Validasi LOESS") +
  xlab("span") + 
  ylab("RMSE") +
  theme(plot.title = element_text(hjust = 0.5))

Dari hasil validasi silang dapat disimpulkan model terbaik untuk LOESS yaitu dengan span = 0.15510204, karena memiliki RMSE yang paling kecil.

Model Terbaik

data.frame(Metode=c('Polynomial Regression','Piecewise Constant','Natural Cubic Splines','LOESS')
                ,RMSE = rbind(rp$rmse,pc$rmse,ncs$rmse,loess1$rmse)
                ,Parameter_Optimum = c('d = 7','knots = 14','df = 10','span = 0.15510204'))
rp_plot <- ggplot(Auto,aes(x=horsepower, y=mpg)) +
  geom_point(alpha=0.55, color="black") +
  stat_smooth(method = "lm",formula = y~poly(x,7,raw = T),lty = 1,se=F) +
  ggtitle("Polynomial Regression (d=7)") +
  theme(legend.position="bottom",plot.title = element_text(hjust = 0.5))

rt_plot <- ggplot(Auto,aes(x=horsepower, y=mpg)) +
  geom_point(alpha=0.55, color="black") +
  stat_smooth(method = "lm",formula = y~cut(x,15),lty = 1,se=F) +
  ggtitle("Piecewise Constant (knots=14)") +
  theme(legend.position="bottom",plot.title = element_text(hjust = 0.5))

ncs_plot <- ggplot(Auto,aes(x=horsepower, y=mpg)) +
  geom_point(alpha=0.55, color="black") +
  stat_smooth(method = "lm",formula = y~ns(x, df = 10),lty = 1,se=F) +
  ggtitle("Natural Cubic Splines (df=10)") +
  theme(legend.position="bottom",plot.title = element_text(hjust = 0.5))

loess1_plot <- ggplot(Auto,aes(x=horsepower, y=mpg)) +
  geom_point(alpha=0.55, color="black") +
  stat_smooth(method = "loess",formula = y~x,span = 0.15510204,lty = 1,se=F) +
  ggtitle("LOESS (span=0.15510204)") +
  theme(legend.position="bottom",plot.title = element_text(hjust = 0.5))

grid.arrange(rp_plot,rt_plot,ncs_plot,loess1_plot, nrow = 2)

Baik secara empiris (memiliki nilai RMSE terkecil) maupun secara visual (garis plot sesuai dengan bentuk scatterplot data dan logika hubungan antara mpg dengan horsepower yaitu semakin besar horsepower maka akan semakin kecil nilai mpg), model non linear terbaik untuk pendugaan peubah respon mpg dengan peubah horsepower adalah metode Natural Cubic Splines dengan 10 derajat bebas.

MPG & Displacement

Regresi Polinomial

set.seed(123)
cross_val <- vfold_cv(Auto,v=10,strata = "mpg")

d_poly <- 1:10

best_poly_disp <- map_dfr(d_poly, function(i){
    metric_poly <- map_dfr(cross_val$splits,function(x){
        mod <- lm(mpg ~ poly(displacement,i,raw = T),data=Auto[x$in_id,])
        pred <- predict(mod,newdata=Auto[-x$in_id,])
        truth <- Auto[-x$in_id,]$mpg
        rmse <- mlr3measures::rmse(truth = truth,response = pred)
        mae <- mlr3measures::mae(truth = truth,response = pred)
        metric <- c(rmse,mae)
        names(metric) <- c("rmse","mae")
        return(metric)
      }
    )
    metric_poly
    # menghitung rata-rata untuk 10 folds
    mean_metric_poly <- colMeans(metric_poly)
    mean_metric_poly
  }
)
best_poly_fin_disp <- cbind(d=d_poly,best_poly_disp)
rp_disp <- best_poly_fin_disp %>% slice_min(rmse)
best_poly_fin_disp$lab <- paste(best_poly_fin_disp$d,best_poly_fin_disp$rmse, sep="\n")

ggplot(best_poly_fin_disp,aes(x=d, y=rmse)) +
  geom_point(color="black", size=2) +
  geom_point(data = best_poly_fin_disp[which.min(best_poly_fin_disp$rmse), ], color="red",size=2) +
  geom_text(data = best_poly_fin_disp[which.min(best_poly_fin_disp$rmse), ], aes(d+0.25,rmse,label=lab),size=3) +
  ggtitle("Hasil Validasi Silang Regresi Polinomial") +
  xlab("d") + 
  ylab("RMSE") +
  theme(plot.title = element_text(hjust = 0.5))

Dari hasil validasi silang dapat disimpulkan untuk model terbaik dengan regresi polinomial yaitu dengan derajat 10, karena memiliki RMSE yang paling kecil.

Regresi Fungsi Tangga (Piecewise Constant)

set.seed(123)
cross_val <- vfold_cv(Auto,v=10,strata = "mpg")

breaks <- 2:14

best_tangga_disp <- map_dfr(breaks, function(i){
    metric_tangga <- map_dfr(cross_val$splits,function(x){
      training <- Auto[x$in_id,]
      training$displacement <- cut(training$displacement,i)
      mod <- lm(mpg ~ displacement,data=training)
      labs_x <- levels(mod$model[,2])
      labs_x_breaks <- cbind(lower = as.numeric( sub("\\((.+),.*", "\\1", labs_x) ),
                      upper = as.numeric( sub("[^,]*,([^]]*)\\]", "\\1", labs_x) ))
      testing <- Auto[-x$in_id,]
      displacement_new <- cut(testing$displacement,c(labs_x_breaks[1,1],labs_x_breaks[,2]))
      pred <- predict(mod,newdata=list(displacement=displacement_new))
      truth <- testing$mpg
      data_eval <- na.omit(data.frame(truth,pred))
      rmse <- mlr3measures::rmse(truth = data_eval$truth,response = data_eval$pred)
      mae <- mlr3measures::mae(truth = data_eval$truth,response = data_eval$pred)
      metric <- c(rmse,mae)
      names(metric) <- c("rmse","mae")
      return(metric)
    }
  )

metric_tangga
# menghitung rata-rata untuk 10 folds
mean_metric_tangga <- colMeans(metric_tangga)

mean_metric_tangga
})
best_tangga_disp <- cbind(knots=breaks-1,best_tangga_disp)
pc_disp <- best_tangga_disp %>% slice_min(rmse)
best_tangga_disp$lab <- paste(best_tangga_disp$knots,best_tangga_disp$rmse, sep="\n")

ggplot(best_tangga_disp,aes(x=knots, y=rmse)) +
  geom_point(color="black", size=2) +
  geom_point(data = best_tangga_disp[which.min(best_tangga_disp$rmse), ], color="red",size=2) +
  geom_text(data = best_tangga_disp[which.min(best_tangga_disp$rmse), ], aes(knots+0.25,rmse,label=lab),size=3) +
  ggtitle("Hasil Validasi Silang Piecewise Constant") +
  xlab("Knots") + 
  ylab("RMSE") +
  theme(plot.title = element_text(hjust = 0.5))

Dari hasil validasi silang dapat disimpulkan model terbaik untuk Regresi Tangga yaitu dengan jumlah knots sebanyak 9, karena memiliki RMSE yang paling kecil.

Natural Cubic Splines

set.seed(123)
cross_val <- vfold_cv(Auto,v=10,strata = "mpg")

df <- 1:20

best_spline3_disp <- map_dfr(df, function(i){
    metric_spline3 <- map_dfr(cross_val$splits,function(x){
        mod <- lm(mpg ~ ns(displacement,df=i),data=Auto[x$in_id,])
        pred <- predict(mod,newdata=Auto[-x$in_id,])
        truth <- Auto[-x$in_id,]$mpg
        rmse <- mlr3measures::rmse(truth = truth,response = pred)
        mae <- mlr3measures::mae(truth = truth,response = pred)
        metric <- c(rmse,mae)
        names(metric) <- c("rmse","mae")
        return(metric)
      }
    )
    metric_spline3
    # menghitung rata-rata untuk 10 folds
    mean_metric_spline3 <- colMeans(metric_spline3)
    #mean_metric_spline3
  }
)

best_spline3_disp <- cbind(df=df,best_spline3_disp)
ncs_disp <- best_spline3_disp %>% slice_min(rmse)
best_spline3_disp$lab <- paste(best_spline3_disp$df,best_spline3_disp$rmse, sep="\n")

ggplot(best_spline3_disp,aes(x=df, y=rmse)) +
  geom_point(color="black", size=2) +
  geom_point(data = best_spline3_disp[which.min(best_spline3_disp$rmse), ], color="red",size=2) +
  geom_text(data = best_spline3_disp[which.min(best_spline3_disp$rmse), ], aes(df+0.25,rmse,label=lab),size=3) +
  ggtitle("Hasil Validasi Silang Natural Cubic Splines") +
  xlab("Degrees of Freedom") + 
  ylab("RMSE") +
  theme(plot.title = element_text(hjust = 0.5))

Dari hasil validasi silang dapat disimpulkan model terbaik untuk Natural Cubic Splines yaitu dengan derajat bebas sebanyak 12, karena memiliki RMSE yang paling kecil.

LOESS

set.seed(123)
cross_val <- vfold_cv(Auto,v=10,strata = "mpg")

span <- seq(0.1,1,length.out=50)

best_loess_disp <- map_dfr(span, function(i){
metric_loess <- map_dfr(cross_val$splits,
    function(x){
mod <- loess(mpg ~ displacement,span = i,
         data=Auto[x$in_id,])
pred <- predict(mod,
               newdata=Auto[-x$in_id,])
truth <- Auto[-x$in_id,]$mpg

data_eval <- na.omit(data.frame(pred=pred,
                                truth=truth))


rmse <- mlr3measures::rmse(truth = data_eval$truth,
                           response = data_eval$pred
                           )
mae <- mlr3measures::mae(truth = data_eval$truth,
                           response = data_eval$pred
                           )
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)

head(metric_loess,20)

# menghitung rata-rata untuk 10 folds
mean_metric_loess <- colMeans(metric_loess)

mean_metric_loess
}
)

best_loess_disp <- cbind(span=span,best_loess_disp)
loess1_disp <- best_loess_disp %>% slice_min(rmse)
# menampilkan hasil all breaks
#best_loess_disp
best_loess_disp$lab <- paste(best_loess_disp$span,best_loess_disp$rmse, sep="\n")

ggplot(best_loess_disp,aes(x=span, y=rmse)) +
  geom_point(color="black", size=2) +
  geom_point(data = best_loess_disp[which.min(best_loess_disp$rmse), ], color="red",size=2) +
  geom_text(data = best_loess_disp[which.min(best_loess_disp$rmse), ], aes(span+0.1,rmse,label=lab),size=3) +
  ggtitle("Hasil Validasi LOESS") +
  xlab("span") + 
  ylab("RMSE") +
  theme(plot.title = element_text(hjust = 0.5))

Dari hasil validasi silang dapat disimpulkan model terbaik untuk LOESS yaitu dengan span = 0.11836734, karena memiliki RMSE yang paling kecil.

Model Terbaik

data.frame(Metode=c('Polynomial Regression','Piecewise Constant','Natural Cubic Splines','LOESS')
                ,RMSE = rbind(rp_disp$rmse,pc_disp$rmse,ncs_disp$rmse,loess1_disp$rmse)
                ,Parameter_Optimum = c('d = 10','knots = 9','df = 12','span = 0.11836734'))
rp_disp_plot <- ggplot(Auto,aes(x=displacement, y=mpg)) +
  geom_point(alpha=0.55, color="black") +
  stat_smooth(method = "lm",formula = y~poly(x,10,raw = T),lty = 1,se=F) +
  ggtitle("Polynomial Regression (d=10)") +
  theme(legend.position="bottom",plot.title = element_text(hjust = 0.5))

rt_disp_plot <- ggplot(Auto,aes(x=displacement, y=mpg)) +
  geom_point(alpha=0.55, color="black") +
  stat_smooth(method = "lm",formula = y~cut(x,10),lty = 1,se=F) +
  ggtitle("Piecewise Constant (knots=9)") +
  theme(legend.position="bottom",plot.title = element_text(hjust = 0.5))

ncs_disp_plot <- ggplot(Auto,aes(x=displacement, y=mpg)) +
  geom_point(alpha=0.55, color="black") +
  stat_smooth(method = "lm",formula = y~ns(x, df = 12),lty = 1,se=F) +
  ggtitle("Natural Cubic Splines (df=12)") +
  theme(legend.position="bottom",plot.title = element_text(hjust = 0.5))

loess1_disp_plot <- ggplot(Auto,aes(x=displacement, y=mpg)) +
  geom_point(alpha=0.55, color="black") +
  stat_smooth(method = "loess",formula = y~x,span = 0.11836734,lty = 1,se=F) +
  ggtitle("LOESS (span=0.11836734)") +
  theme(legend.position="bottom",plot.title = element_text(hjust = 0.5))


grid.arrange(rp_disp_plot,rt_disp_plot,ncs_disp_plot,loess1_disp_plot, nrow = 2)

Baik secara empiris (memiliki nilai RMSE terkecil) maupun secara visual (garis plot sesuai dengan bentuk scatterplot data dan logika hubungan antara mpg dengan displacement yaitu semakin besar displacement maka akan semakin kecil nilai mpg), model non linear terbaik untuk pendugaan peubah respon mpg dengan peubah displacement adalah metode Natural Cubic Splines dengan 12 derajat bebas.

MPG & Weight

Regresi Polinomial

set.seed(123)
cross_val <- vfold_cv(Auto,v=10,strata = "mpg")

d_poly <- 1:10

best_poly_weight <- map_dfr(d_poly, function(i){
    metric_poly <- map_dfr(cross_val$splits,function(x){
        mod <- lm(mpg ~ poly(weight,i,raw = T),data=Auto[x$in_id,])
        pred <- predict(mod,newdata=Auto[-x$in_id,])
        truth <- Auto[-x$in_id,]$mpg
        rmse <- mlr3measures::rmse(truth = truth,response = pred)
        mae <- mlr3measures::mae(truth = truth,response = pred)
        metric <- c(rmse,mae)
        names(metric) <- c("rmse","mae")
        return(metric)
      }
    )
    metric_poly
    # menghitung rata-rata untuk 10 folds
    mean_metric_poly <- colMeans(metric_poly)
    mean_metric_poly
  }
)
best_poly_fin_weight <- cbind(d=d_poly,best_poly_weight)
rp_weight <- best_poly_fin_weight %>% slice_min(rmse)
best_poly_fin_weight$lab <- paste(best_poly_fin_weight$d,best_poly_fin_weight$rmse, sep="\n")

ggplot(best_poly_fin_weight,aes(x=d, y=rmse)) +
  geom_point(color="black", size=2) +
  geom_point(data = best_poly_fin_weight[which.min(best_poly_fin_weight$rmse), ], color="red",size=2) +
  geom_text(data = best_poly_fin_weight[which.min(best_poly_fin_weight$rmse), ], aes(d+0.25,rmse,label=lab),size=3) +
  ggtitle("Hasil Validasi Silang Regresi Polinomial") +
  xlab("d") + 
  ylab("RMSE") +
  theme(plot.title = element_text(hjust = 0.5))

Dari hasil validasi silang dapat disimpulkan untuk model terbaik dengan regresi polinomial yaitu dengan derajat 2, karena memiliki RMSE yang paling kecil.

Regresi Fungsi Tangga (Piecewise Constant)

set.seed(123)
cross_val <- vfold_cv(Auto,v=10,strata = "mpg")

breaks <- 2:17

best_tangga_weight <- map_dfr(breaks, function(i){
    metric_tangga <- map_dfr(cross_val$splits,function(x){
      training <- Auto[x$in_id,]
      training$weight <- cut(training$weight,i)
      mod <- lm(mpg ~ weight,data=training)
      labs_x <- levels(mod$model[,2])
      labs_x_breaks <- cbind(lower = as.numeric( sub("\\((.+),.*", "\\1", labs_x) ),
                      upper = as.numeric( sub("[^,]*,([^]]*)\\]", "\\1", labs_x) ))
      testing <- Auto[-x$in_id,]
      weight_new <- cut(testing$weight,c(labs_x_breaks[1,1],labs_x_breaks[,2]))
      pred <- predict(mod,newdata=list(weight=weight_new))
      truth <- testing$mpg
      data_eval <- na.omit(data.frame(truth,pred))
      rmse <- mlr3measures::rmse(truth = data_eval$truth,response = data_eval$pred)
      mae <- mlr3measures::mae(truth = data_eval$truth,response = data_eval$pred)
      metric <- c(rmse,mae)
      names(metric) <- c("rmse","mae")
      return(metric)
    }
  )

metric_tangga
# menghitung rata-rata untuk 10 folds
mean_metric_tangga <- colMeans(metric_tangga)

mean_metric_tangga
})
best_tangga_weight <- cbind(knots=breaks-1,best_tangga_weight)
pc_weight <- best_tangga_weight %>% slice_min(rmse)
best_tangga_weight$lab <- paste(best_tangga_weight$knots,best_tangga_weight$rmse, sep="\n")

ggplot(best_tangga_weight,aes(x=knots, y=rmse)) +
  geom_point(color="black", size=2) +
  geom_point(data = best_tangga_weight[which.min(best_tangga_weight$rmse), ], color="red",size=2) +
  geom_text(data = best_tangga_weight[which.min(best_tangga_weight$rmse), ], aes(knots+0.25,rmse,label=lab),size=3) +
  ggtitle("Hasil Validasi Silang Piecewise Constant") +
  xlab("Knots") + 
  ylab("RMSE") +
  theme(plot.title = element_text(hjust = 0.5))

Dari hasil validasi silang dapat disimpulkan model terbaik untuk Regresi Tangga yaitu dengan jumlah knots sebanyak 11, karena memiliki RMSE yang paling kecil.

Natural Cubic Splines

set.seed(123)
cross_val <- vfold_cv(Auto,v=10,strata = "mpg")

df <- 1:20

best_spline3_weight <- map_dfr(df, function(i){
    metric_spline3 <- map_dfr(cross_val$splits,function(x){
        mod <- lm(mpg ~ ns(weight,df=i),data=Auto[x$in_id,])
        pred <- predict(mod,newdata=Auto[-x$in_id,])
        truth <- Auto[-x$in_id,]$mpg
        rmse <- mlr3measures::rmse(truth = truth,response = pred)
        mae <- mlr3measures::mae(truth = truth,response = pred)
        metric <- c(rmse,mae)
        names(metric) <- c("rmse","mae")
        return(metric)
      }
    )
    metric_spline3
    # menghitung rata-rata untuk 10 folds
    mean_metric_spline3 <- colMeans(metric_spline3)
    #mean_metric_spline3
  }
)

best_spline3_weight <- cbind(df=df,best_spline3_weight)
ncs_weight <- best_spline3_weight %>% slice_min(rmse)
best_spline3_weight$lab <- paste(best_spline3_weight$df,best_spline3_weight$rmse, sep="\n")

ggplot(best_spline3_weight,aes(x=df, y=rmse)) +
  geom_point(color="black", size=2) +
  geom_point(data = best_spline3_weight[which.min(best_spline3_weight$rmse), ], color="red",size=2) +
  geom_text(data = best_spline3_weight[which.min(best_spline3_weight$rmse), ], aes(df+0.25,rmse,label=lab),size=3) +
  ggtitle("Hasil Validasi Silang Natural Cubic Splines") +
  xlab("Degrees of Freedom") + 
  ylab("RMSE") +
  theme(plot.title = element_text(hjust = 0.5))

Dari hasil validasi silang dapat disimpulkan model terbaik untuk Natural Cubic Splines yaitu dengan derajat bebas sebanyak 7, karena memiliki RMSE yang paling kecil.

LOESS

set.seed(123)
cross_val <- vfold_cv(Auto,v=10,strata = "mpg")

span <- seq(0.1,1,length.out=50)

best_loess_weight <- map_dfr(span, function(i){
metric_loess <- map_dfr(cross_val$splits,
    function(x){
mod <- loess(mpg ~ weight,span = i,
         data=Auto[x$in_id,])
pred <- predict(mod,
               newdata=Auto[-x$in_id,])
truth <- Auto[-x$in_id,]$mpg

data_eval <- na.omit(data.frame(pred=pred,
                                truth=truth))


rmse <- mlr3measures::rmse(truth = data_eval$truth,
                           response = data_eval$pred
                           )
mae <- mlr3measures::mae(truth = data_eval$truth,
                           response = data_eval$pred
                           )
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)

head(metric_loess,20)

# menghitung rata-rata untuk 10 folds
mean_metric_loess <- colMeans(metric_loess)

mean_metric_loess
}
)

best_loess_weight <- cbind(span=span,best_loess_weight)
loess1_weight <- best_loess_weight %>% slice_min(rmse)
# menampilkan hasil all breaks
#best_loess_weight
best_loess_weight$lab <- paste(best_loess_weight$span,best_loess_weight$rmse, sep="\n")

ggplot(best_loess_weight,aes(x=span, y=rmse)) +
  geom_point(color="black", size=2) +
  geom_point(data = best_loess_weight[which.min(best_loess_weight$rmse), ], color="red",size=2) +
  geom_text(data = best_loess_weight[which.min(best_loess_weight$rmse), ], aes(span+0.1,rmse,label=lab),size=3) +
  ggtitle("Hasil Validasi LOESS") +
  xlab("span") + 
  ylab("RMSE") +
  theme(plot.title = element_text(hjust = 0.5))

Dari hasil validasi silang dapat disimpulkan model terbaik untuk LOESS yaitu dengan span = 1, karena memiliki RMSE yang paling kecil.

Model Terbaik

data.frame(Metode=c('Polynomial Regression','Piecewise Constant','Natural Cubic Splines','LOESS')
                ,RMSE = rbind(rp_weight$rmse,pc_weight$rmse,ncs_weight$rmse,loess1_weight$rmse)
                ,Parameter_Optimum = c('d = 2','knots = 11','df = 7','span = 1'))
rp_weight_plot <- ggplot(Auto,aes(x=weight, y=mpg)) +
  geom_point(alpha=0.55, color="black") +
  stat_smooth(method = "lm",formula = y~poly(x,2,raw = T),lty = 1,se=F) +
  ggtitle("Polynomial Regression (d=2)") +
  theme(legend.position="bottom",plot.title = element_text(hjust = 0.5))

rt_weight_plot <- ggplot(Auto,aes(x=weight, y=mpg)) +
  geom_point(alpha=0.55, color="black") +
  stat_smooth(method = "lm",formula = y~cut(x,12),lty = 1,se=F) +
  ggtitle("Piecewise Constant (knots=11)") +
  theme(legend.position="bottom",plot.title = element_text(hjust = 0.5))

ncs_weight_plot <- ggplot(Auto,aes(x=weight, y=mpg)) +
  geom_point(alpha=0.55, color="black") +
  stat_smooth(method = "lm",formula = y~ns(x, df = 7),lty = 1,se=F) +
  ggtitle("Natural Cubic Splines (df=7)") +
  theme(legend.position="bottom",plot.title = element_text(hjust = 0.5))

loess1_disp_plot <- ggplot(Auto,aes(x=weight, y=mpg)) +
  geom_point(alpha=0.55, color="black") +
  stat_smooth(method = "loess",formula = y~x,span = 1,lty = 1,se=F) +
  ggtitle("LOESS (span=1)") +
  theme(legend.position="bottom",plot.title = element_text(hjust = 0.5))

grid.arrange(rp_weight_plot,rt_weight_plot,ncs_weight_plot,loess1_disp_plot, nrow = 2)

Baik secara empiris (memiliki nilai RMSE terkecil) maupun secara visual (garis plot sesuai dengan bentuk scatterplot data dan logika hubungan antara mpg dengan weight yaitu semakin besar weight maka akan semakin kecil nilai mpg), model non linear terbaik untuk pendugaan peubah respon mpg dengan peubah weight adalah metode Natural Cubic Splines dengan 7 derajat bebas.