Tugas Praktikum STA 581 Pertemuan 8
Pada tugas ini dilakukan prediksi peubah respon mpg dari data Auto menggunakan model-model non-linear. Data Auto diambil dari package ISLR. Peubah prediktor akan dipilih dari tiga kolom lainnya pada data Auto tersebut. Model yang dibuat dibatasi hanya memiliki satu peubah prediktor, sehingga model terbaik yang dicari adalah untuk setiap pasang.
library(ISLR)
library(caret)
library(splines)
library(rsample)
library(purrr)
library(dplyr)
library(gridExtra)Peubah prediktor yang akan digunakan untuk memprediksi peubah respon mpg adalah :
horsepowerdisplacementweight
Eksplorasi Data
ggplot(Auto,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="black") +
geom_smooth(se = F) +
ggtitle("MPG & Horsepower") +
theme(plot.title = element_text(hjust = 0.5))## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
ggplot(Auto,aes(x=displacement, y=mpg)) +
geom_point(alpha=0.55, color="black") +
geom_smooth(se = F) +
ggtitle("MPG & Displacement") +
theme(plot.title = element_text(hjust = 0.5))## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
ggplot(Auto,aes(x=weight, y=mpg)) +
geom_point(alpha=0.55, color="black") +
geom_smooth(se = F) +
ggtitle("MPG & Weight") +
theme(plot.title = element_text(hjust = 0.5))## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
Dari hasil plot ketiga peubah prediktor dengan peubah respon mpg dapat dilihat bahwa masing-masing pasangan memiliki hubungan non-linear.
Model Terbaik
Pendugaan mpg akan dilakukan dengan model non liniear, pada kasus ini metode yang akan digunakan adalah :
- Regresi Polinomial
- Regresi Tangga
- Natural Cubic Splines
- LOESS (locally estimated scatterplot smoothing)
Model terbaik untuk setiap pasangan akan dievaluasi secara empiris dan visual. Evaluasi Secara empiris akan dilakukan dengan cara validasi silang untuk beberapa parameter pada setiap metode, kemudian dipilih model terbaik untuk setiap metode yaitu model yang memberikan nilai RMSE terkecil. Kemudian model terbaik setiap metode dibuat plotnya dan dievaluasi secara visual untuk menentukan mana model yang terbaik.
MPG & Horsepower
Regresi Polinomial
set.seed(123)
cross_val <- vfold_cv(Auto,v=10,strata = "mpg")
d_poly <- 1:10
best_poly <- map_dfr(d_poly, function(i){
metric_poly <- map_dfr(cross_val$splits,function(x){
mod <- lm(mpg ~ poly(horsepower,i,raw = T),data=Auto[x$in_id,])
pred <- predict(mod,newdata=Auto[-x$in_id,])
truth <- Auto[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth,response = pred)
mae <- mlr3measures::mae(truth = truth,response = pred)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
metric_poly
# menghitung rata-rata untuk 10 folds
mean_metric_poly <- colMeans(metric_poly)
mean_metric_poly
}
)
best_poly_fin <- cbind(d=d_poly,best_poly)
rp <- best_poly_fin %>% slice_min(rmse)best_poly_fin$lab <- paste(best_poly_fin$d,best_poly_fin$rmse, sep="\n")
ggplot(best_poly_fin,aes(x=d, y=rmse)) +
geom_point(color="black", size=2) +
geom_point(data = best_poly_fin[which.min(best_poly_fin$rmse), ], color="red",size=2) +
geom_text(data = best_poly_fin[which.min(best_poly_fin$rmse), ], aes(d+0.25,rmse,label=lab),size=3) +
ggtitle("Hasil Validasi Silang Regresi Polinomial") +
xlab("d") +
ylab("RMSE") +
theme(plot.title = element_text(hjust = 0.5))Dari hasil validasi silang dapat disimpulkan untuk model terbaik dengan regresi polinomial yaitu dengan derajat 7, karena memiliki RMSE yang paling kecil.
Regresi Fungsi Tangga (Piecewise Constant)
set.seed(123)
cross_val <- vfold_cv(Auto,v=10,strata = "mpg")
breaks <- 2:21
best_tangga <- map_dfr(breaks, function(i){
metric_tangga <- map_dfr(cross_val$splits,function(x){
training <- Auto[x$in_id,]
training$horsepower <- cut(training$horsepower,i)
mod <- lm(mpg ~ horsepower,data=training)
labs_x <- levels(mod$model[,2])
labs_x_breaks <- cbind(lower = as.numeric( sub("\\((.+),.*", "\\1", labs_x) ),
upper = as.numeric( sub("[^,]*,([^]]*)\\]", "\\1", labs_x) ))
testing <- Auto[-x$in_id,]
horsepower_new <- cut(testing$horsepower,c(labs_x_breaks[1,1],labs_x_breaks[,2]))
pred <- predict(mod,newdata=list(horsepower=horsepower_new))
truth <- testing$mpg
data_eval <- na.omit(data.frame(truth,pred))
rmse <- mlr3measures::rmse(truth = data_eval$truth,response = data_eval$pred)
mae <- mlr3measures::mae(truth = data_eval$truth,response = data_eval$pred)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
metric_tangga
# menghitung rata-rata untuk 10 folds
mean_metric_tangga <- colMeans(metric_tangga)
mean_metric_tangga
})
best_tangga <- cbind(knots=breaks-1,best_tangga)
pc <- best_tangga %>% slice_min(rmse)best_tangga$lab <- paste(best_tangga$knots,best_tangga$rmse, sep="\n")
ggplot(best_tangga,aes(x=knots, y=rmse)) +
geom_point(color="black", size=2) +
geom_point(data = best_tangga[which.min(best_tangga$rmse), ], color="red",size=2) +
geom_text(data = best_tangga[which.min(best_tangga$rmse), ], aes(knots+0.25,rmse,label=lab),size=3) +
ggtitle("Hasil Validasi Silang Piecewise Constant") +
xlab("Knots") +
ylab("RMSE") +
theme(plot.title = element_text(hjust = 0.5))Dari hasil validasi silang dapat disimpulkan model terbaik untuk Regresi Tangga yaitu dengan jumlah knots sebanyak 14, karena memiliki RMSE yang paling kecil.
Natural Cubic Splines
set.seed(123)
cross_val <- vfold_cv(Auto,v=10,strata = "mpg")
df <- 1:20
best_spline3 <- map_dfr(df, function(i){
metric_spline3 <- map_dfr(cross_val$splits,function(x){
mod <- lm(mpg ~ ns(horsepower,df=i),data=Auto[x$in_id,])
pred <- predict(mod,newdata=Auto[-x$in_id,])
truth <- Auto[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth,response = pred)
mae <- mlr3measures::mae(truth = truth,response = pred)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
metric_spline3
# menghitung rata-rata untuk 10 folds
mean_metric_spline3 <- colMeans(metric_spline3)
#mean_metric_spline3
}
)
best_spline3 <- cbind(df=df,best_spline3)
ncs <- best_spline3 %>% slice_min(rmse)best_spline3$lab <- paste(best_spline3$df,best_spline3$rmse, sep="\n")
ggplot(best_spline3,aes(x=df, y=rmse)) +
geom_point(color="black", size=2) +
geom_point(data = best_spline3[which.min(best_spline3$rmse), ], color="red",size=2) +
geom_text(data = best_spline3[which.min(best_spline3$rmse), ], aes(df+0.25,rmse,label=lab),size=3) +
ggtitle("Hasil Validasi Silang Natural Cubic Splines") +
xlab("Degrees of Freedom") +
ylab("RMSE") +
theme(plot.title = element_text(hjust = 0.5)) Dari hasil validasi silang dapat disimpulkan model terbaik untuk Natural Cubic Splines yaitu dengan derajat bebas sebanyak 10, karena memiliki RMSE yang paling kecil.
LOESS
set.seed(123)
cross_val <- vfold_cv(Auto,v=10,strata = "mpg")
span <- seq(0.1,1,length.out=50)
best_loess <- map_dfr(span, function(i){
metric_loess <- map_dfr(cross_val$splits,
function(x){
mod <- loess(mpg ~ horsepower,span = i,
data=Auto[x$in_id,])
pred <- predict(mod,
newdata=Auto[-x$in_id,])
truth <- Auto[-x$in_id,]$mpg
data_eval <- na.omit(data.frame(pred=pred,
truth=truth))
rmse <- mlr3measures::rmse(truth = data_eval$truth,
response = data_eval$pred
)
mae <- mlr3measures::mae(truth = data_eval$truth,
response = data_eval$pred
)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
head(metric_loess,20)
# menghitung rata-rata untuk 10 folds
mean_metric_loess <- colMeans(metric_loess)
mean_metric_loess
}
)
best_loess <- cbind(span=span,best_loess)
loess1 <- best_loess %>% slice_min(rmse)
# menampilkan hasil all breaks
#best_loessbest_loess$lab <- paste(best_loess$span,best_loess$rmse, sep="\n")
ggplot(best_loess,aes(x=span, y=rmse)) +
geom_point(color="black", size=2) +
geom_point(data = best_loess[which.min(best_loess$rmse), ], color="red",size=2) +
geom_text(data = best_loess[which.min(best_loess$rmse), ], aes(span+0.1,rmse,label=lab),size=3) +
ggtitle("Hasil Validasi LOESS") +
xlab("span") +
ylab("RMSE") +
theme(plot.title = element_text(hjust = 0.5))Dari hasil validasi silang dapat disimpulkan model terbaik untuk LOESS yaitu dengan span = 0.15510204, karena memiliki RMSE yang paling kecil.
Model Terbaik
data.frame(Metode=c('Polynomial Regression','Piecewise Constant','Natural Cubic Splines','LOESS')
,RMSE = rbind(rp$rmse,pc$rmse,ncs$rmse,loess1$rmse)
,Parameter_Optimum = c('d = 7','knots = 14','df = 10','span = 0.15510204'))rp_plot <- ggplot(Auto,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="black") +
stat_smooth(method = "lm",formula = y~poly(x,7,raw = T),lty = 1,se=F) +
ggtitle("Polynomial Regression (d=7)") +
theme(legend.position="bottom",plot.title = element_text(hjust = 0.5))
rt_plot <- ggplot(Auto,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="black") +
stat_smooth(method = "lm",formula = y~cut(x,15),lty = 1,se=F) +
ggtitle("Piecewise Constant (knots=14)") +
theme(legend.position="bottom",plot.title = element_text(hjust = 0.5))
ncs_plot <- ggplot(Auto,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="black") +
stat_smooth(method = "lm",formula = y~ns(x, df = 10),lty = 1,se=F) +
ggtitle("Natural Cubic Splines (df=10)") +
theme(legend.position="bottom",plot.title = element_text(hjust = 0.5))
loess1_plot <- ggplot(Auto,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="black") +
stat_smooth(method = "loess",formula = y~x,span = 0.15510204,lty = 1,se=F) +
ggtitle("LOESS (span=0.15510204)") +
theme(legend.position="bottom",plot.title = element_text(hjust = 0.5))
grid.arrange(rp_plot,rt_plot,ncs_plot,loess1_plot, nrow = 2)Baik secara empiris (memiliki nilai RMSE terkecil) maupun secara visual (garis plot sesuai dengan bentuk scatterplot data dan logika hubungan antara mpg dengan horsepower yaitu semakin besar horsepower maka akan semakin kecil nilai mpg), model non linear terbaik untuk pendugaan peubah respon mpg dengan peubah horsepower adalah metode Natural Cubic Splines dengan 10 derajat bebas.
MPG & Displacement
Regresi Polinomial
set.seed(123)
cross_val <- vfold_cv(Auto,v=10,strata = "mpg")
d_poly <- 1:10
best_poly_disp <- map_dfr(d_poly, function(i){
metric_poly <- map_dfr(cross_val$splits,function(x){
mod <- lm(mpg ~ poly(displacement,i,raw = T),data=Auto[x$in_id,])
pred <- predict(mod,newdata=Auto[-x$in_id,])
truth <- Auto[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth,response = pred)
mae <- mlr3measures::mae(truth = truth,response = pred)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
metric_poly
# menghitung rata-rata untuk 10 folds
mean_metric_poly <- colMeans(metric_poly)
mean_metric_poly
}
)
best_poly_fin_disp <- cbind(d=d_poly,best_poly_disp)
rp_disp <- best_poly_fin_disp %>% slice_min(rmse)best_poly_fin_disp$lab <- paste(best_poly_fin_disp$d,best_poly_fin_disp$rmse, sep="\n")
ggplot(best_poly_fin_disp,aes(x=d, y=rmse)) +
geom_point(color="black", size=2) +
geom_point(data = best_poly_fin_disp[which.min(best_poly_fin_disp$rmse), ], color="red",size=2) +
geom_text(data = best_poly_fin_disp[which.min(best_poly_fin_disp$rmse), ], aes(d+0.25,rmse,label=lab),size=3) +
ggtitle("Hasil Validasi Silang Regresi Polinomial") +
xlab("d") +
ylab("RMSE") +
theme(plot.title = element_text(hjust = 0.5))Dari hasil validasi silang dapat disimpulkan untuk model terbaik dengan regresi polinomial yaitu dengan derajat 10, karena memiliki RMSE yang paling kecil.
Regresi Fungsi Tangga (Piecewise Constant)
set.seed(123)
cross_val <- vfold_cv(Auto,v=10,strata = "mpg")
breaks <- 2:14
best_tangga_disp <- map_dfr(breaks, function(i){
metric_tangga <- map_dfr(cross_val$splits,function(x){
training <- Auto[x$in_id,]
training$displacement <- cut(training$displacement,i)
mod <- lm(mpg ~ displacement,data=training)
labs_x <- levels(mod$model[,2])
labs_x_breaks <- cbind(lower = as.numeric( sub("\\((.+),.*", "\\1", labs_x) ),
upper = as.numeric( sub("[^,]*,([^]]*)\\]", "\\1", labs_x) ))
testing <- Auto[-x$in_id,]
displacement_new <- cut(testing$displacement,c(labs_x_breaks[1,1],labs_x_breaks[,2]))
pred <- predict(mod,newdata=list(displacement=displacement_new))
truth <- testing$mpg
data_eval <- na.omit(data.frame(truth,pred))
rmse <- mlr3measures::rmse(truth = data_eval$truth,response = data_eval$pred)
mae <- mlr3measures::mae(truth = data_eval$truth,response = data_eval$pred)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
metric_tangga
# menghitung rata-rata untuk 10 folds
mean_metric_tangga <- colMeans(metric_tangga)
mean_metric_tangga
})
best_tangga_disp <- cbind(knots=breaks-1,best_tangga_disp)
pc_disp <- best_tangga_disp %>% slice_min(rmse)best_tangga_disp$lab <- paste(best_tangga_disp$knots,best_tangga_disp$rmse, sep="\n")
ggplot(best_tangga_disp,aes(x=knots, y=rmse)) +
geom_point(color="black", size=2) +
geom_point(data = best_tangga_disp[which.min(best_tangga_disp$rmse), ], color="red",size=2) +
geom_text(data = best_tangga_disp[which.min(best_tangga_disp$rmse), ], aes(knots+0.25,rmse,label=lab),size=3) +
ggtitle("Hasil Validasi Silang Piecewise Constant") +
xlab("Knots") +
ylab("RMSE") +
theme(plot.title = element_text(hjust = 0.5))Dari hasil validasi silang dapat disimpulkan model terbaik untuk Regresi Tangga yaitu dengan jumlah knots sebanyak 9, karena memiliki RMSE yang paling kecil.
Natural Cubic Splines
set.seed(123)
cross_val <- vfold_cv(Auto,v=10,strata = "mpg")
df <- 1:20
best_spline3_disp <- map_dfr(df, function(i){
metric_spline3 <- map_dfr(cross_val$splits,function(x){
mod <- lm(mpg ~ ns(displacement,df=i),data=Auto[x$in_id,])
pred <- predict(mod,newdata=Auto[-x$in_id,])
truth <- Auto[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth,response = pred)
mae <- mlr3measures::mae(truth = truth,response = pred)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
metric_spline3
# menghitung rata-rata untuk 10 folds
mean_metric_spline3 <- colMeans(metric_spline3)
#mean_metric_spline3
}
)
best_spline3_disp <- cbind(df=df,best_spline3_disp)
ncs_disp <- best_spline3_disp %>% slice_min(rmse)best_spline3_disp$lab <- paste(best_spline3_disp$df,best_spline3_disp$rmse, sep="\n")
ggplot(best_spline3_disp,aes(x=df, y=rmse)) +
geom_point(color="black", size=2) +
geom_point(data = best_spline3_disp[which.min(best_spline3_disp$rmse), ], color="red",size=2) +
geom_text(data = best_spline3_disp[which.min(best_spline3_disp$rmse), ], aes(df+0.25,rmse,label=lab),size=3) +
ggtitle("Hasil Validasi Silang Natural Cubic Splines") +
xlab("Degrees of Freedom") +
ylab("RMSE") +
theme(plot.title = element_text(hjust = 0.5)) Dari hasil validasi silang dapat disimpulkan model terbaik untuk Natural Cubic Splines yaitu dengan derajat bebas sebanyak 12, karena memiliki RMSE yang paling kecil.
LOESS
set.seed(123)
cross_val <- vfold_cv(Auto,v=10,strata = "mpg")
span <- seq(0.1,1,length.out=50)
best_loess_disp <- map_dfr(span, function(i){
metric_loess <- map_dfr(cross_val$splits,
function(x){
mod <- loess(mpg ~ displacement,span = i,
data=Auto[x$in_id,])
pred <- predict(mod,
newdata=Auto[-x$in_id,])
truth <- Auto[-x$in_id,]$mpg
data_eval <- na.omit(data.frame(pred=pred,
truth=truth))
rmse <- mlr3measures::rmse(truth = data_eval$truth,
response = data_eval$pred
)
mae <- mlr3measures::mae(truth = data_eval$truth,
response = data_eval$pred
)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
head(metric_loess,20)
# menghitung rata-rata untuk 10 folds
mean_metric_loess <- colMeans(metric_loess)
mean_metric_loess
}
)
best_loess_disp <- cbind(span=span,best_loess_disp)
loess1_disp <- best_loess_disp %>% slice_min(rmse)
# menampilkan hasil all breaks
#best_loess_dispbest_loess_disp$lab <- paste(best_loess_disp$span,best_loess_disp$rmse, sep="\n")
ggplot(best_loess_disp,aes(x=span, y=rmse)) +
geom_point(color="black", size=2) +
geom_point(data = best_loess_disp[which.min(best_loess_disp$rmse), ], color="red",size=2) +
geom_text(data = best_loess_disp[which.min(best_loess_disp$rmse), ], aes(span+0.1,rmse,label=lab),size=3) +
ggtitle("Hasil Validasi LOESS") +
xlab("span") +
ylab("RMSE") +
theme(plot.title = element_text(hjust = 0.5))Dari hasil validasi silang dapat disimpulkan model terbaik untuk LOESS yaitu dengan span = 0.11836734, karena memiliki RMSE yang paling kecil.
Model Terbaik
data.frame(Metode=c('Polynomial Regression','Piecewise Constant','Natural Cubic Splines','LOESS')
,RMSE = rbind(rp_disp$rmse,pc_disp$rmse,ncs_disp$rmse,loess1_disp$rmse)
,Parameter_Optimum = c('d = 10','knots = 9','df = 12','span = 0.11836734'))rp_disp_plot <- ggplot(Auto,aes(x=displacement, y=mpg)) +
geom_point(alpha=0.55, color="black") +
stat_smooth(method = "lm",formula = y~poly(x,10,raw = T),lty = 1,se=F) +
ggtitle("Polynomial Regression (d=10)") +
theme(legend.position="bottom",plot.title = element_text(hjust = 0.5))
rt_disp_plot <- ggplot(Auto,aes(x=displacement, y=mpg)) +
geom_point(alpha=0.55, color="black") +
stat_smooth(method = "lm",formula = y~cut(x,10),lty = 1,se=F) +
ggtitle("Piecewise Constant (knots=9)") +
theme(legend.position="bottom",plot.title = element_text(hjust = 0.5))
ncs_disp_plot <- ggplot(Auto,aes(x=displacement, y=mpg)) +
geom_point(alpha=0.55, color="black") +
stat_smooth(method = "lm",formula = y~ns(x, df = 12),lty = 1,se=F) +
ggtitle("Natural Cubic Splines (df=12)") +
theme(legend.position="bottom",plot.title = element_text(hjust = 0.5))
loess1_disp_plot <- ggplot(Auto,aes(x=displacement, y=mpg)) +
geom_point(alpha=0.55, color="black") +
stat_smooth(method = "loess",formula = y~x,span = 0.11836734,lty = 1,se=F) +
ggtitle("LOESS (span=0.11836734)") +
theme(legend.position="bottom",plot.title = element_text(hjust = 0.5))
grid.arrange(rp_disp_plot,rt_disp_plot,ncs_disp_plot,loess1_disp_plot, nrow = 2) Baik secara empiris (memiliki nilai RMSE terkecil) maupun secara visual (garis plot sesuai dengan bentuk scatterplot data dan logika hubungan antara
mpg dengan displacement yaitu semakin besar displacement maka akan semakin kecil nilai mpg), model non linear terbaik untuk pendugaan peubah respon mpg dengan peubah displacement adalah metode Natural Cubic Splines dengan 12 derajat bebas.
MPG & Weight
Regresi Polinomial
set.seed(123)
cross_val <- vfold_cv(Auto,v=10,strata = "mpg")
d_poly <- 1:10
best_poly_weight <- map_dfr(d_poly, function(i){
metric_poly <- map_dfr(cross_val$splits,function(x){
mod <- lm(mpg ~ poly(weight,i,raw = T),data=Auto[x$in_id,])
pred <- predict(mod,newdata=Auto[-x$in_id,])
truth <- Auto[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth,response = pred)
mae <- mlr3measures::mae(truth = truth,response = pred)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
metric_poly
# menghitung rata-rata untuk 10 folds
mean_metric_poly <- colMeans(metric_poly)
mean_metric_poly
}
)
best_poly_fin_weight <- cbind(d=d_poly,best_poly_weight)
rp_weight <- best_poly_fin_weight %>% slice_min(rmse)best_poly_fin_weight$lab <- paste(best_poly_fin_weight$d,best_poly_fin_weight$rmse, sep="\n")
ggplot(best_poly_fin_weight,aes(x=d, y=rmse)) +
geom_point(color="black", size=2) +
geom_point(data = best_poly_fin_weight[which.min(best_poly_fin_weight$rmse), ], color="red",size=2) +
geom_text(data = best_poly_fin_weight[which.min(best_poly_fin_weight$rmse), ], aes(d+0.25,rmse,label=lab),size=3) +
ggtitle("Hasil Validasi Silang Regresi Polinomial") +
xlab("d") +
ylab("RMSE") +
theme(plot.title = element_text(hjust = 0.5))Dari hasil validasi silang dapat disimpulkan untuk model terbaik dengan regresi polinomial yaitu dengan derajat 2, karena memiliki RMSE yang paling kecil.
Regresi Fungsi Tangga (Piecewise Constant)
set.seed(123)
cross_val <- vfold_cv(Auto,v=10,strata = "mpg")
breaks <- 2:17
best_tangga_weight <- map_dfr(breaks, function(i){
metric_tangga <- map_dfr(cross_val$splits,function(x){
training <- Auto[x$in_id,]
training$weight <- cut(training$weight,i)
mod <- lm(mpg ~ weight,data=training)
labs_x <- levels(mod$model[,2])
labs_x_breaks <- cbind(lower = as.numeric( sub("\\((.+),.*", "\\1", labs_x) ),
upper = as.numeric( sub("[^,]*,([^]]*)\\]", "\\1", labs_x) ))
testing <- Auto[-x$in_id,]
weight_new <- cut(testing$weight,c(labs_x_breaks[1,1],labs_x_breaks[,2]))
pred <- predict(mod,newdata=list(weight=weight_new))
truth <- testing$mpg
data_eval <- na.omit(data.frame(truth,pred))
rmse <- mlr3measures::rmse(truth = data_eval$truth,response = data_eval$pred)
mae <- mlr3measures::mae(truth = data_eval$truth,response = data_eval$pred)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
metric_tangga
# menghitung rata-rata untuk 10 folds
mean_metric_tangga <- colMeans(metric_tangga)
mean_metric_tangga
})
best_tangga_weight <- cbind(knots=breaks-1,best_tangga_weight)
pc_weight <- best_tangga_weight %>% slice_min(rmse)best_tangga_weight$lab <- paste(best_tangga_weight$knots,best_tangga_weight$rmse, sep="\n")
ggplot(best_tangga_weight,aes(x=knots, y=rmse)) +
geom_point(color="black", size=2) +
geom_point(data = best_tangga_weight[which.min(best_tangga_weight$rmse), ], color="red",size=2) +
geom_text(data = best_tangga_weight[which.min(best_tangga_weight$rmse), ], aes(knots+0.25,rmse,label=lab),size=3) +
ggtitle("Hasil Validasi Silang Piecewise Constant") +
xlab("Knots") +
ylab("RMSE") +
theme(plot.title = element_text(hjust = 0.5))Dari hasil validasi silang dapat disimpulkan model terbaik untuk Regresi Tangga yaitu dengan jumlah knots sebanyak 11, karena memiliki RMSE yang paling kecil.
Natural Cubic Splines
set.seed(123)
cross_val <- vfold_cv(Auto,v=10,strata = "mpg")
df <- 1:20
best_spline3_weight <- map_dfr(df, function(i){
metric_spline3 <- map_dfr(cross_val$splits,function(x){
mod <- lm(mpg ~ ns(weight,df=i),data=Auto[x$in_id,])
pred <- predict(mod,newdata=Auto[-x$in_id,])
truth <- Auto[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth,response = pred)
mae <- mlr3measures::mae(truth = truth,response = pred)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
metric_spline3
# menghitung rata-rata untuk 10 folds
mean_metric_spline3 <- colMeans(metric_spline3)
#mean_metric_spline3
}
)
best_spline3_weight <- cbind(df=df,best_spline3_weight)
ncs_weight <- best_spline3_weight %>% slice_min(rmse)best_spline3_weight$lab <- paste(best_spline3_weight$df,best_spline3_weight$rmse, sep="\n")
ggplot(best_spline3_weight,aes(x=df, y=rmse)) +
geom_point(color="black", size=2) +
geom_point(data = best_spline3_weight[which.min(best_spline3_weight$rmse), ], color="red",size=2) +
geom_text(data = best_spline3_weight[which.min(best_spline3_weight$rmse), ], aes(df+0.25,rmse,label=lab),size=3) +
ggtitle("Hasil Validasi Silang Natural Cubic Splines") +
xlab("Degrees of Freedom") +
ylab("RMSE") +
theme(plot.title = element_text(hjust = 0.5)) Dari hasil validasi silang dapat disimpulkan model terbaik untuk Natural Cubic Splines yaitu dengan derajat bebas sebanyak 7, karena memiliki RMSE yang paling kecil.
LOESS
set.seed(123)
cross_val <- vfold_cv(Auto,v=10,strata = "mpg")
span <- seq(0.1,1,length.out=50)
best_loess_weight <- map_dfr(span, function(i){
metric_loess <- map_dfr(cross_val$splits,
function(x){
mod <- loess(mpg ~ weight,span = i,
data=Auto[x$in_id,])
pred <- predict(mod,
newdata=Auto[-x$in_id,])
truth <- Auto[-x$in_id,]$mpg
data_eval <- na.omit(data.frame(pred=pred,
truth=truth))
rmse <- mlr3measures::rmse(truth = data_eval$truth,
response = data_eval$pred
)
mae <- mlr3measures::mae(truth = data_eval$truth,
response = data_eval$pred
)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
head(metric_loess,20)
# menghitung rata-rata untuk 10 folds
mean_metric_loess <- colMeans(metric_loess)
mean_metric_loess
}
)
best_loess_weight <- cbind(span=span,best_loess_weight)
loess1_weight <- best_loess_weight %>% slice_min(rmse)
# menampilkan hasil all breaks
#best_loess_weightbest_loess_weight$lab <- paste(best_loess_weight$span,best_loess_weight$rmse, sep="\n")
ggplot(best_loess_weight,aes(x=span, y=rmse)) +
geom_point(color="black", size=2) +
geom_point(data = best_loess_weight[which.min(best_loess_weight$rmse), ], color="red",size=2) +
geom_text(data = best_loess_weight[which.min(best_loess_weight$rmse), ], aes(span+0.1,rmse,label=lab),size=3) +
ggtitle("Hasil Validasi LOESS") +
xlab("span") +
ylab("RMSE") +
theme(plot.title = element_text(hjust = 0.5))Dari hasil validasi silang dapat disimpulkan model terbaik untuk LOESS yaitu dengan span = 1, karena memiliki RMSE yang paling kecil.
Model Terbaik
data.frame(Metode=c('Polynomial Regression','Piecewise Constant','Natural Cubic Splines','LOESS')
,RMSE = rbind(rp_weight$rmse,pc_weight$rmse,ncs_weight$rmse,loess1_weight$rmse)
,Parameter_Optimum = c('d = 2','knots = 11','df = 7','span = 1'))rp_weight_plot <- ggplot(Auto,aes(x=weight, y=mpg)) +
geom_point(alpha=0.55, color="black") +
stat_smooth(method = "lm",formula = y~poly(x,2,raw = T),lty = 1,se=F) +
ggtitle("Polynomial Regression (d=2)") +
theme(legend.position="bottom",plot.title = element_text(hjust = 0.5))
rt_weight_plot <- ggplot(Auto,aes(x=weight, y=mpg)) +
geom_point(alpha=0.55, color="black") +
stat_smooth(method = "lm",formula = y~cut(x,12),lty = 1,se=F) +
ggtitle("Piecewise Constant (knots=11)") +
theme(legend.position="bottom",plot.title = element_text(hjust = 0.5))
ncs_weight_plot <- ggplot(Auto,aes(x=weight, y=mpg)) +
geom_point(alpha=0.55, color="black") +
stat_smooth(method = "lm",formula = y~ns(x, df = 7),lty = 1,se=F) +
ggtitle("Natural Cubic Splines (df=7)") +
theme(legend.position="bottom",plot.title = element_text(hjust = 0.5))
loess1_disp_plot <- ggplot(Auto,aes(x=weight, y=mpg)) +
geom_point(alpha=0.55, color="black") +
stat_smooth(method = "loess",formula = y~x,span = 1,lty = 1,se=F) +
ggtitle("LOESS (span=1)") +
theme(legend.position="bottom",plot.title = element_text(hjust = 0.5))
grid.arrange(rp_weight_plot,rt_weight_plot,ncs_weight_plot,loess1_disp_plot, nrow = 2) Baik secara empiris (memiliki nilai RMSE terkecil) maupun secara visual (garis plot sesuai dengan bentuk scatterplot data dan logika hubungan antara
mpg dengan weight yaitu semakin besar weight maka akan semakin kecil nilai mpg), model non linear terbaik untuk pendugaan peubah respon mpg dengan peubah weight adalah metode Natural Cubic Splines dengan 7 derajat bebas.