Gunakan model-model non-linier yang sudah dipelajari pada data Auto dengan peubah respon mpg dan pilih tiga kolom untuk dijadikan peubah prediktor.
Apakah ada bukti untuk hubungan non-linier untuk peubah-peubah yang anda pilih? Buat visualisasi untuk Mendukung jawaban Anda.
Tentukan model non-linear terbaik untuk masing pasangan peubah (Secara visual dan/atau secara empiris)
library(ISLR)
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.4 v dplyr 1.0.7
## v tidyr 1.1.3 v stringr 1.4.0
## v readr 2.0.1 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(splines)
library(rsample)
library(DT)
library(cowplot)
Data yang digunakan adalah data Auto dari package ISLR, terdiri dari 392 observasi dan 9 variabel. Namun, variabel yang akan digunakan kali ini hanya 4, yaitu:
mpg (miles per gallon)
displacement
weight
horsepower
Auto = Auto %>% select(mpg, displacement, weight, horsepower)
datatable(Auto)
ggplot(Auto,aes(x=displacement, y=mpg)) +
geom_point(alpha=0.55, color="black") +
theme_bw()
Scatter plot ini diukur berdasarkan miles per gallon sebagai peubah respon dan displacement sebagai peubah prediktor.
ggplot(Auto,aes(x=displacement, y=mpg)) +
geom_point(alpha=0.55, color="black") +
stat_smooth(method = "lm",
formula = y~x,
lty = 1, col = "blue",se = F)+
theme_bw()
Jika kita amati scatter plotnya, pola data ini cenderung tidak linear.
ggplot(Auto,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="black") +
theme_bw()
Scatter plot ini diukur berdasarkan miles per gallon sebagai peubah respon dan horsepower sebagai peubah prediktor.
ggplot(Auto,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="black") +
stat_smooth(method = "lm",
formula = y~x,
lty = 1, col = "blue",se = F)+
theme_bw()
Jika kita amati scatter plotnya, pola data ini cenderung tidak linear.
ggplot(Auto,aes(x=weight, y=mpg)) +
geom_point(alpha=0.55, color="black") +
theme_bw()
Scatter plot ini diukur berdasarkan miles per gallon sebagai peubah respon dan weight sebagai peubah prediktor.
ggplot(Auto,aes(x=weight, y=mpg)) +
geom_point(alpha=0.55, color="black") +
stat_smooth(method = "lm",
formula = y~x,
lty = 1, col = "blue",se = F)+
theme_bw()
Jika kita amati scatter plotnya, pola data ini cenderung tidak linear.
Dalam ketiga kasus ini, akan digunakan 3 pendekatan yaitu regresi polinomial, fungsi tangga (piecewise constant), dan natural cubic splines.
Polynomial
set.seed(1233)
cross_val <- vfold_cv(Auto,v=10)
degree <- 1:8
best_poly <- map_dfr(degree,function(i){
metric_poly <- map_dfr(cross_val$splits,
function(x){
mod <- lm(mpg ~ poly(displacement,i,raw = T),
data=Auto[x$in_id,])
pred <- predict(mod,
newdata=Auto[-x$in_id,])
truth <- Auto[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth,
response = pred
)
mae <- mlr3measures::mae(truth = truth,
response = pred
)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
metric_poly
# menghitung rata-rata untuk 10 folds
mean_metric_poly <- colMeans(metric_poly)
mean_metric_poly
})
best_poly<- cbind(degree=degree,best_poly)
best_poly
## degree rmse mae
## 1 1 4.637488 3.527734
## 2 2 4.350811 3.177790
## 3 3 4.349441 3.183971
## 4 4 4.355753 3.183893
## 5 5 4.359009 3.191090
## 6 6 4.331887 3.211607
## 7 7 4.281216 3.211554
## 8 8 4.231226 3.210366
#berdasarkan rmse
best_poly %>% slice_min(rmse)
## degree rmse mae
## 1 8 4.231226 3.210366
#berdasarkan mae
best_poly %>% slice_min(mae)
## degree rmse mae
## 1 2 4.350811 3.17779
Secara empiris, jika dilihat dari rmse terlihat bahwa polinomial ordo 8 memberikan nilai rmse terkecil. Sedangkan jika dilihat berdasarkan mae, polinomial ordo 2 memberikan nilai mae terkecil.
#degree=8
ggplot(Auto,aes(x=displacement, y=mpg)) +
geom_point(alpha=0.55, color="black") +
stat_smooth(method = "lm",
formula = y~poly(x,8,raw=T),
lty = 1, col = "blue",se = F)+ ggtitle("Regresi Polinomial Ordo 8")+
theme_bw()
Pada scatter plot regresi polinomial ordo 8, ujung-ujung regresinya liar.
#degree=2
ggplot(Auto,aes(x=displacement, y=mpg)) +
geom_point(alpha=0.55, color="black") +
stat_smooth(method = "lm",
formula = y~poly(x,2,raw=T),
lty = 1, col = "blue",se = F)+ ggtitle("Regresi Polinomial Ordo 2")+
theme_bw()
Piecewise Constant
set.seed(12433)
cross_val <- vfold_cv(Auto,v=10)
breaks <- 2:13
best_tangga <- map_dfr(breaks, function(i){
metric_tangga <- map_dfr(cross_val$splits,
function(x){
training <- Auto[x$in_id,]
training$displacement <- cut(training$displacement,i)
mod <- lm(mpg ~ displacement,
data=training)
labs_x <- levels(mod$model[,2])
mod$model
labs_x
labs_x_breaks <- cbind(lower = as.numeric( sub("\\((.+),.*", "\\1", labs_x) ),
upper = as.numeric( sub("[^,]*,([^]]*)\\]", "\\1", labs_x) ))
labs_x_breaks
testing <- Auto[-x$in_id,]
displacement_new <- cut(testing$displacement,c(labs_x_breaks[1,1],labs_x_breaks[,2]))
pred <- predict(mod,
newdata=list(displacement=displacement_new))
truth <- testing$mpg
data_eval <- na.omit(data.frame(truth,pred))
rmse <- mlr3measures::rmse(truth = data_eval$truth,
response = data_eval$pred
)
mae <- mlr3measures::mae(truth = data_eval$truth,
response = data_eval$pred
)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
metric_tangga
# menghitung rata-rata untuk 10 folds
mean_metric_tangga <- colMeans(metric_tangga)
mean_metric_tangga
})
best_tangga <- cbind(breaks=breaks,best_tangga)
# menampilkan hasil all breaks
best_tangga
## breaks rmse mae
## 1 2 5.950709 4.662388
## 2 3 4.927452 3.703130
## 3 4 4.829414 3.635500
## 4 5 4.676728 3.506854
## 5 6 4.612150 3.412363
## 6 7 4.577732 3.369414
## 7 8 4.380313 3.245803
## 8 9 4.217946 3.095100
## 9 10 4.259715 3.139944
## 10 11 4.332656 3.201723
## 11 12 4.416357 3.280542
## 12 13 4.447195 3.305765
#berdasarkan rmse
best_tangga %>% slice_min(rmse)
## breaks rmse mae
## 1 9 4.217946 3.0951
#berdasarkan mae
best_tangga %>% slice_min(mae)
## breaks rmse mae
## 1 9 4.217946 3.0951
Secara empiris, jika dilihat dari rmse dan mae terlihat bahwa regresi fungsi tangga dengan 8 knots memberikan nilai rmse dan mae terkecil.
#breaks 9
ggplot(Auto,aes(x=displacement, y=mpg)) +
geom_point(alpha=0.55, color="black") +
stat_smooth(method = "lm",
formula = y~cut(x,9),
lty = 1, col = "blue",se = F)+ ggtitle("Fungsi Tangga dengan 8 knots") +
theme_bw()
Natural Cubic Splines
set.seed(123321)
cross_val <- vfold_cv(Auto,v=10)
df <- 1:16
best_spline <- map_dfr(df, function(i){
metric_spline <- map_dfr(cross_val$splits,
function(x){
mod <- lm(mpg ~ ns(displacement,df=i),
data=Auto[x$in_id,])
pred <- predict(mod,
newdata=Auto[-x$in_id,])
truth <- Auto[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth,
response = pred
)
mae <- mlr3measures::mae(truth = truth,
response = pred
)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
metric_spline
# menghitung rata-rata untuk 10 folds
mean_metric_spline <- colMeans(metric_spline)
mean_metric_spline
}
)
best_spline <- cbind(df=df,best_spline)
# menampilkan hasil all breaks
best_spline
## df rmse mae
## 1 1 4.635647 3.525799
## 2 2 4.353718 3.182011
## 3 3 4.360948 3.188627
## 4 4 4.380593 3.204923
## 5 5 4.327287 3.206944
## 6 6 4.242236 3.138420
## 7 7 4.212878 3.129519
## 8 8 4.206441 3.138094
## 9 9 4.185692 3.141431
## 10 10 4.169330 3.129803
## 11 11 4.116343 3.082791
## 12 12 4.093939 3.052332
## 13 13 4.110197 3.063677
## 14 14 4.131419 3.085322
## 15 15 4.132794 3.082297
## 16 16 4.103554 3.064158
#berdasarkan rmse
best_spline %>% slice_min(rmse)
## df rmse mae
## 1 12 4.093939 3.052332
#berdasarkan mae
best_spline %>% slice_min(mae)
## df rmse mae
## 1 12 4.093939 3.052332
Secara empiris, jika dilihat dari rmse dan mae terlihat bahwa natural cubic splines dengan 11 knots memberikan nilai rmse dan mae terkecil.
#df=12
ggplot(Auto,aes(x=displacement, y=mpg)) +
geom_point(alpha=0.55, color="black")+ ggtitle("Natural Cubic Splines dengan 11 knots") +
stat_smooth(method = "lm",
formula = y~ns(x, df=12),
lty = 1,se=F)+
theme_bw()
Pada scatter plot regresi natural cubic splines dengan 11 knots, ujung-ujung regresinya liar.
poly8<-ggplot(Auto,aes(x=displacement, y=mpg)) +
geom_point(alpha=0.55, color="black") +
stat_smooth(method = "lm",
formula = y~poly(x,8,raw=T),
lty = 1, col = "blue",se = F)+ ggtitle("Regresi Polinomial Ordo 8")+
theme_bw()
poly2<-ggplot(Auto,aes(x=displacement, y=mpg)) +
geom_point(alpha=0.55, color="black") +
stat_smooth(method = "lm",
formula = y~poly(x,2,raw=T),
lty = 1, col = "blue",se = F)+ ggtitle("Regresi Polinomial Ordo 2")+
theme_bw()
pc8<-ggplot(Auto,aes(x=displacement, y=mpg)) +
geom_point(alpha=0.55, color="black") +
stat_smooth(method = "lm",
formula = y~cut(x,9),
lty = 1, col = "blue",se = F)+ ggtitle("Fungsi Tangga dengan 8 knots") +
theme_bw()
nc11<-ggplot(Auto,aes(x=displacement, y=mpg)) +
geom_point(alpha=0.55, color="black")+ ggtitle("Natural Cubic Splines dengan 11 knots") +
stat_smooth(method = "lm",
formula = y~ns(x, df=12),
lty = 1,se=F)+
theme_bw()
plot_grid(poly8,poly2,pc8,nc11,labels=c("poly8","poly2","pc8","nc11"), label_size = 6)
Secara rmse dan mae, natural cubic splines dengan 11 knots memiliki nilai terkecil. Tetapi pada ujung-ujungnya liar, maka secara visual regresi polinomial ordo 2 lebih baik.
Polynomial
#POLY#
set.seed(12333)
cross_val <- vfold_cv(Auto,v=10)
degree <- 1:8
best_poly <- map_dfr(degree,function(i){
metric_poly <- map_dfr(cross_val$splits,
function(x){
mod <- lm(mpg ~ poly(weight,i,raw = T),
data=Auto[x$in_id,])
pred <- predict(mod,
newdata=Auto[-x$in_id,])
truth <- Auto[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth,
response = pred
)
mae <- mlr3measures::mae(truth = truth,
response = pred
)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
metric_poly
# menghitung rata-rata untuk 10 folds
mean_metric_poly <- colMeans(metric_poly)
mean_metric_poly
})
best_poly<- cbind(degree=degree,best_poly)
best_poly
## degree rmse mae
## 1 1 4.297314 3.291385
## 2 2 4.131395 3.071654
## 3 3 4.136525 3.079694
## 4 4 4.143885 3.084393
## 5 5 4.146502 3.091196
## 6 6 4.157107 3.111192
## 7 7 4.171308 3.127232
## 8 8 4.191974 3.142416
#berdasarkan rmse
best_poly %>% slice_min(rmse)
## degree rmse mae
## 1 2 4.131395 3.071654
#berdasarkan mae
best_poly %>% slice_min(mae)
## degree rmse mae
## 1 2 4.131395 3.071654
Secara empiris, jika dilihat dari rmse dan mae terlihat bahwa polinomial ordo 2 memberikan nilai rmse dan mae terkecil.
#degree=2
ggplot(Auto,aes(x=weight, y=mpg)) +
geom_point(alpha=0.55, color="black") +
stat_smooth(method = "lm",
formula = y~poly(x,2,raw=T),
lty = 1, col = "blue",se = F)+ ggtitle("Regresi Polinomial Ordo 2")+
theme_bw()
Piecewise Constant
#Tangga#
set.seed(32331)
cross_val <- vfold_cv(Auto,v=10)
breaks <- 2:13
best_tangga <- map_dfr(breaks, function(i){
metric_tangga <- map_dfr(cross_val$splits,
function(x){
training <- Auto[x$in_id,]
training$weight <- cut(training$weight,i)
mod <- lm(mpg ~ weight,
data=training)
labs_x <- levels(mod$model[,2])
mod$model
labs_x
labs_x_breaks <- cbind(lower = as.numeric( sub("\\((.+),.*", "\\1", labs_x) ),
upper = as.numeric( sub("[^,]*,([^]]*)\\]", "\\1", labs_x) ))
labs_x_breaks
testing <- Auto[-x$in_id,]
weight_new <- cut(testing$weight,c(labs_x_breaks[1,1],labs_x_breaks[,2]))
pred <- predict(mod,
newdata=list(weight=weight_new))
truth <- testing$mpg
data_eval <- na.omit(data.frame(truth,pred))
rmse <- mlr3measures::rmse(truth = data_eval$truth,
response = data_eval$pred
)
mae <- mlr3measures::mae(truth = data_eval$truth,
response = data_eval$pred
)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
metric_tangga
# menghitung rata-rata untuk 10 folds
mean_metric_tangga <- colMeans(metric_tangga)
mean_metric_tangga
})
best_tangga <- cbind(breaks=breaks,best_tangga)
# menampilkan hasil all breaks
best_tangga
## breaks rmse mae
## 1 2 5.567536 4.349647
## 2 3 4.839053 3.637560
## 3 4 4.713463 3.636524
## 4 5 4.338959 3.216653
## 5 6 4.211518 3.124298
## 6 7 4.336175 3.192620
## 7 8 4.406843 3.270115
## 8 9 4.306461 3.168009
## 9 10 4.279202 3.155884
## 10 11 4.308636 3.188655
## 11 12 4.168457 3.061116
## 12 13 4.206414 3.145069
#berdasarkan rmse
best_tangga %>% slice_min(rmse)
## breaks rmse mae
## 1 12 4.168457 3.061116
#berdasarkan mae
best_tangga %>% slice_min(mae)
## breaks rmse mae
## 1 12 4.168457 3.061116
Secara empiris, jika dilihat dari rmse dan mae terlihat bahwa regresi fungsi tangga dengan 11 knots memberikan nilai rmse dan mae terkecil.
#breaks 12
ggplot(Auto,aes(x=weight, y=mpg)) +
geom_point(alpha=0.55, color="black") +
stat_smooth(method = "lm",
formula = y~cut(x,12),
lty = 1, col = "blue",se = F)+ ggtitle("Fungsi Tangga dengan 11 knots") +
theme_bw()
Natural Cubic Splines
set.seed(12332233)
cross_val <- vfold_cv(Auto,v=10)
df <- 1:16
best_spline <- map_dfr(df, function(i){
metric_spline <- map_dfr(cross_val$splits,
function(x){
mod <- lm(mpg ~ ns(weight,df=i),
data=Auto[x$in_id,])
pred <- predict(mod,
newdata=Auto[-x$in_id,])
truth <- Auto[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth,
response = pred
)
mae <- mlr3measures::mae(truth = truth,
response = pred
)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
metric_spline
# menghitung rata-rata untuk 10 folds
mean_metric_spline <- colMeans(metric_spline)
mean_metric_spline
}
)
best_spline <- cbind(df=df,best_spline)
# menampilkan hasil all breaks
best_spline
## df rmse mae
## 1 1 4.307677 3.291863
## 2 2 4.137597 3.075156
## 3 3 4.143246 3.076418
## 4 4 4.156762 3.083247
## 5 5 4.167362 3.099487
## 6 6 4.144889 3.093469
## 7 7 4.136167 3.068934
## 8 8 4.147935 3.089412
## 9 9 4.150859 3.078581
## 10 10 4.152071 3.082931
## 11 11 4.148494 3.082637
## 12 12 4.152773 3.092686
## 13 13 4.183303 3.110217
## 14 14 4.165294 3.100449
## 15 15 4.194780 3.119925
## 16 16 4.190865 3.121880
#berdasarkan rmse
best_spline %>% slice_min(rmse)
## df rmse mae
## 1 7 4.136167 3.068934
#berdasarkan mae
best_spline %>% slice_min(mae)
## df rmse mae
## 1 7 4.136167 3.068934
Secara empiris, jika dilihat dari rmse dan mae terlihat bahwa regresi natural cubic splines dengan 6 knots memberikan nilai rmse dan mae terkecil.
#df=7
ggplot(Auto,aes(x=weight, y=mpg)) +
geom_point(alpha=0.55, color="black")+ ggtitle("Natural Cubic Splines dengan 6 knots") +
stat_smooth(method = "lm",
formula = y~ns(x, df=7),
lty = 1,se=F)+
theme_bw()
Pada scatter plot regresi natural cubic splines dengan 6 knots, ujung-ujung regresinya liar.
poly2<-ggplot(Auto,aes(x=weight, y=mpg)) +
geom_point(alpha=0.55, color="black") +
stat_smooth(method = "lm",
formula = y~poly(x,2,raw=T),
lty = 1, col = "blue",se = F)+ ggtitle("Regresi Polinomial Ordo 2")+
theme_bw()
pc11<-ggplot(Auto,aes(x=weight, y=mpg)) +
geom_point(alpha=0.55, color="black") +
stat_smooth(method = "lm",
formula = y~cut(x,12),
lty = 1, col = "blue",se = F)+ ggtitle("Fungsi Tangga dengan 11 knots") +
theme_bw()
nc6<-ggplot(Auto,aes(x=weight, y=mpg)) +
geom_point(alpha=0.55, color="black")+ ggtitle("Natural Cubic Splines dengan 6 knots") +
stat_smooth(method = "lm",
formula = y~ns(x, df=7),
lty = 1,se=F)+
theme_bw()
plot_grid(poly2,pc11,nc6, labels=c("poly2","pc11","nc6"),label_size = 6)
Jika dilihat dari rmse, polinomial ordo 2 memiliki nilai terkecil. Sedangkan berdasarkan mae, fungsi tangga dengan 11 knots memiliki nilai terkecil.
Polynomial
set.seed(333)
cross_val <- vfold_cv(Auto,v=10)
degree <- 1:10
best_poly <- map_dfr(degree,function(i){
metric_poly <- map_dfr(cross_val$splits,
function(x){
mod <- lm(mpg ~ poly(horsepower,i,raw = T),
data=Auto[x$in_id,])
pred <- predict(mod,
newdata=Auto[-x$in_id,])
truth <- Auto[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth,
response = pred
)
mae <- mlr3measures::mae(truth = truth,
response = pred
)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
metric_poly
# menghitung rata-rata untuk 10 folds
mean_metric_poly <- colMeans(metric_poly)
mean_metric_poly
})
best_poly<- cbind(degree=degree,best_poly)
best_poly
## degree rmse mae
## 1 1 4.888687 3.848017
## 2 2 4.325017 3.269488
## 3 3 4.323753 3.263298
## 4 4 4.330573 3.278704
## 5 5 4.292058 3.246026
## 6 6 4.300889 3.258456
## 7 7 4.293910 3.237582
## 8 8 4.318637 3.256122
## 9 9 4.330224 3.252814
## 10 10 4.390391 3.287209
#berdasarkan rmse
best_poly %>% slice_min(rmse)
## degree rmse mae
## 1 5 4.292058 3.246026
#berdasarkan mae
best_poly %>% slice_min(mae)
## degree rmse mae
## 1 7 4.29391 3.237582
Secara empiris, jika dilihat dari rmse terlihat bahwa polinomial ordo 5 memberikan nilai rmse terkecil. Sedangkan jika dilihat berdasarkan mae, polinomial ordo 7 memberikan nilai mae terkecil.
#degree=5
ggplot(Auto,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="black") +
stat_smooth(method = "lm",
formula = y~poly(x,5,raw=T),
lty = 1, col = "blue",se = F)+ ggtitle("Regresi Polinomial Ordo 5")+
theme_bw()
#degree 7
ggplot(Auto,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="black") +
stat_smooth(method = "lm",
formula = y~poly(x,7,raw=T),
lty = 1, col = "blue",se = F)+ ggtitle("Regresi Polinomial Ordo 7")+
theme_bw()
Piecewise Constant
set.seed(13323)
cross_val <- vfold_cv(Auto,v=10)
breaks <- 2:15
best_tangga <- map_dfr(breaks, function(i){
metric_tangga <- map_dfr(cross_val$splits,
function(x){
training <- Auto[x$in_id,]
training$horsepower <- cut(training$horsepower,i)
mod <- lm(mpg ~ horsepower,
data=training)
labs_x <- levels(mod$model[,2])
mod$model
labs_x
labs_x_breaks <- cbind(lower = as.numeric( sub("\\((.+),.*", "\\1", labs_x) ),
upper = as.numeric( sub("[^,]*,([^]]*)\\]", "\\1", labs_x) ))
labs_x_breaks
testing <- Auto[-x$in_id,]
horsepower_new <- cut(testing$horsepower,c(labs_x_breaks[1,1],labs_x_breaks[,2]))
pred <- predict(mod,
newdata=list(horsepower=horsepower_new))
truth <- testing$mpg
data_eval <- na.omit(data.frame(truth,pred))
rmse <- mlr3measures::rmse(truth = data_eval$truth,
response = data_eval$pred
)
mae <- mlr3measures::mae(truth = data_eval$truth,
response = data_eval$pred
)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
metric_tangga
# menghitung rata-rata untuk 10 folds
mean_metric_tangga <- colMeans(metric_tangga)
mean_metric_tangga
})
best_tangga <- cbind(breaks=breaks,best_tangga)
# menampilkan hasil all breaks
best_tangga
## breaks rmse mae
## 1 2 6.137728 4.811516
## 2 3 5.773404 4.540866
## 3 4 4.993003 3.811826
## 4 5 4.688686 3.590960
## 5 6 4.609176 3.541391
## 6 7 4.563583 3.388485
## 7 8 4.496025 3.454513
## 8 9 4.560964 3.493910
## 9 10 4.530724 3.437346
## 10 11 4.519584 3.366985
## 11 12 4.435169 3.329911
## 12 13 4.513995 3.379557
## 13 14 4.433294 3.274231
## 14 15 4.337768 3.294444
#berdasarkan rmse
best_tangga %>% slice_min(rmse)
## breaks rmse mae
## 1 15 4.337768 3.294444
#berdasarkan mae
best_tangga %>% slice_min(mae)
## breaks rmse mae
## 1 14 4.433294 3.274231
Secara empiris, jika dilihat dari rmse terlihat bahwa regresi fungsi tangga dengan 14 knots memberikan nilai rmse terkecil. Sedangkan jika dilihat berdasarkan mae, regresi fungsi tangga dengan 14 knots memberikan nilai mae terkecil.
#breaks 14
ggplot(Auto,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="black") +
stat_smooth(method = "lm",
formula = y~cut(x,14),
lty = 1, col = "blue",se = F)+ ggtitle("Fungsi Tangga dengan 13 knots") +
theme_bw()
#breaks 15
ggplot(Auto,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="black") +
stat_smooth(method = "lm",
formula = y~cut(x,15),
lty = 1, col = "blue",se = F)+ ggtitle("Fungsi Tangga dengan 14 knots") +
theme_bw()
Natural Cubic Splines
set.seed(3234)
cross_val <- vfold_cv(Auto,v=10)
df <- 1:16
best_spline <- map_dfr(df, function(i){
metric_spline <- map_dfr(cross_val$splits,
function(x){
mod <- lm(mpg ~ ns(horsepower,df=i),
data=Auto[x$in_id,])
pred <- predict(mod,
newdata=Auto[-x$in_id,])
truth <- Auto[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth,
response = pred
)
mae <- mlr3measures::mae(truth = truth,
response = pred
)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
metric_spline
# menghitung rata-rata untuk 10 folds
mean_metric_spline <- colMeans(metric_spline)
mean_metric_spline
}
)
best_spline <- cbind(df=df,best_spline)
# menampilkan hasil all breaks
best_spline
## df rmse mae
## 1 1 4.908532 3.861726
## 2 2 4.335138 3.270106
## 3 3 4.370296 3.280222
## 4 4 4.356550 3.272160
## 5 5 4.338281 3.265823
## 6 6 4.325218 3.252178
## 7 7 4.314007 3.234587
## 8 8 4.338375 3.263733
## 9 9 4.326461 3.233684
## 10 10 4.318222 3.218378
## 11 11 4.336301 3.245917
## 12 12 4.317929 3.193365
## 13 13 4.336178 3.233167
## 14 14 4.311247 3.225107
## 15 15 4.330979 3.260707
## 16 16 4.341010 3.269776
#berdasarkan rmse
best_spline %>% slice_min(rmse)
## df rmse mae
## 1 14 4.311247 3.225107
#berdasarkan mae
best_spline %>% slice_min(mae)
## df rmse mae
## 1 12 4.317929 3.193365
Secara empiris, jika dilihat dari rmse terlihat bahwa regresi natural cubic splines dengan 13 knots memberikan nilai rmse terkecil. Sedangkan jika dilihat berdasarkan mae, regresi natural cubic splines dengan 11 knots memberikan nilai mae terkecil.
#df=12
ggplot(Auto,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="black")+ ggtitle("Natural Cubic Splines dengan 11 knots") +
stat_smooth(method = "lm",
formula = y~ns(x, df=12),
lty = 1,se=F)+
theme_bw()
#df=14
ggplot(Auto,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="black")+ ggtitle("Natural Cubic Splines dengan 13 knots") +
stat_smooth(method = "lm",
formula = y~ns(x, df=14),
lty = 1,se=F)+
theme_bw()
poly5<-ggplot(Auto,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="black") +
stat_smooth(method = "lm",
formula = y~poly(x,5,raw=T),
lty = 1, col = "blue",se = F)+ ggtitle("Regresi Polinomial Ordo 5")+
theme_bw()
poly7<-ggplot(Auto,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="black") +
stat_smooth(method = "lm",
formula = y~poly(x,7,raw=T),
lty = 1, col = "blue",se = F)+ ggtitle("Regresi Polinomial Ordo 7")+
theme_bw()
pc13<-ggplot(Auto,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="black") +
stat_smooth(method = "lm",
formula = y~cut(x,14),
lty = 1, col = "blue",se = F)+ ggtitle("Fungsi Tangga dengan 13 knots") +
theme_bw()
pc14<-ggplot(Auto,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="black") +
stat_smooth(method = "lm",
formula = y~cut(x,15),
lty = 1, col = "blue",se = F)+ ggtitle("Fungsi Tangga dengan 14 knots") +
theme_bw()
nc11<-ggplot(Auto,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="black")+ ggtitle("Natural Cubic Splines dengan 11 knots") +
stat_smooth(method = "lm",
formula = y~ns(x, df=12),
lty = 1,se=F)+
theme_bw()
nc13<-ggplot(Auto,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="black")+ ggtitle("Natural Cubic Splines dengan 13 knots") +
stat_smooth(method = "lm",
formula = y~ns(x, df=14),
lty = 1,se=F)+
theme_bw()
plot_grid(poly5,poly7, pc13,pc14,nc11,nc13,labels=c("poly5","poly7", "pc13","pc14","nc11","nc13"), label_size = 6)
Jika dilihat dari nilai rmse, polinomial ordo 5 memiliki nilai terkecil. Sedangkan berdasarkan nilai mae, natural cubic dengan 11 knots memiliki nilai terkecil.