1. Soal
Data yang digunakan diperoleh dari package ISLR. Gunakan model-model non-linear pada data Auto dengan respon mpg dan pilih tiga kolom untuk dijadikan peubah prediktor.
Apakah ada bukti untuk hubungan non-linear terhadap peubah-peubah yang dipilih? Buatlah visualisasi untuk mendukung jawaban Anda!
Tentukan model non-linear terbaik untuk masing-masing pasangan peubah (secara visual dan/atau secara empiris)
Library
library(tidyverse)## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.3 v dplyr 1.0.7
## v tidyr 1.1.3 v stringr 1.4.0
## v readr 2.0.1 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(splines)
library(rsample)
library(ISLR)
library(cowplot)
library(caret) # cross-validation## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
library(gridExtra) # combining graphs##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
library(ggplot2)
library(dplyr)
library(purrr)
library(dplyr)
library(knitr)
library(DT)
library(kableExtra)##
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
##
## group_rows
library(gam) # generalized additive models## Loading required package: foreach
##
## Attaching package: 'foreach'
## The following objects are masked from 'package:purrr':
##
## accumulate, when
## Loaded gam 1.20
library(mlr3)2. Dataset
Data yang digunakan adalah Auto yang disediakan oleh package ISLR yang terdiri dari 392 baris (observasi) dan 9 kolom (variabel). Pada tugas ini, hanya menggunakan 4 variabel, yaitu :
1. mpg = Miles per gallon
2. horsepower = Engine horsepower
3. weight = Vehicle weight (lbs.)
4. displacement = Engine displacement (cu. inches)
attach(Auto)## The following object is masked from package:ggplot2:
##
## mpg
Auto = Auto %>% select(mpg, horsepower, weight, displacement)
head(Auto)## mpg horsepower weight displacement
## 1 18 130 3504 307
## 2 15 165 3693 350
## 3 18 150 3436 318
## 4 16 150 3433 304
## 5 17 140 3449 302
## 6 15 198 4341 429
3. mpg VS horsepower
Visualisasi Data
plot(Auto$horsepower, Auto$mpg,pch=19, col="#006400",
xlab="Horse Power", ylab="mile per gallon")
Dari visualisasi data, dapat disimpulkan bahwa semakin besar kekuatan mesin (horse power) dari suatu kendaraan, maka semakin pendek jarak yang ditempuh per galon bahan bakarnya. Jika diamati lebih detail dari visualisasi datanya, pola data ini cenderung tidak linier, ada beberapa pendekatan yang bisa digunakan. Pada kasus seperti ini dapat digunakan pendekatan Regresi Polinomial, Piecewise COnstant, dan Natural Cubic Splines.
A. Regresi Polinomial
Akan dilakukan cross validation untuk memilih derajat polinomial agar menghasilkan model mpg vs horsepower yang optimal.
set.seed(1234)
polynom <- function(dataset){
best = c()
degree = 1:4
for (i in degree) {
set.seed(1234)
cross_val <- vfold_cv(dataset,v=5,strata = "mpg")
metric_poly <- map_dfr(cross_val$splits,
function(x){
mod <- lm(mpg ~ poly(horsepower,i,raw = T),
data=dataset[x$in_id,])
pred <- predict(mod,
newdata=dataset[-x$in_id,])
truth <- dataset[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth,response = pred)
mae <- mlr3measures::mae(truth = truth,response = pred)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
RL <- colMeans(metric_poly) #menghitung rmse dan mae rata-rata untuk 5 folds
best = c(best,RL)
}
Hasil = matrix(best, byrow = T, ncol = 2)
colnames(Hasil) <- c("rmse","mae")
Hasil_Akhir = as.data.frame(cbind(degree,Hasil))
return(Hasil_Akhir)
}
Polynom <- polynom(Auto)
Polynom## degree rmse mae
## 1 1 4.907582 3.843762
## 2 2 4.358548 3.261911
## 3 3 4.356984 3.259236
## 4 4 4.357800 3.271093
Secara empiris regresi polinomial dengan derajat 3 memiliki rmse dan mae terkecil, sehingga akan dipilih regresi polinomial dengan derajat 3
polynom1 <- ggplot(Auto,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="darkgreen") +
stat_smooth(method = "lm",
formula = y~poly(x,1,raw=T),
lty = 1, col = "red",se = F)+
theme_bw()
polynom2 <- ggplot(Auto,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="darkgreen") +
stat_smooth(method = "lm",
formula = y~poly(x,2,raw=T),
lty = 1, col = "red",se = F)+
theme_bw()
polynom3 <- ggplot(Auto,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="darkgreen") +
stat_smooth(method = "lm",
formula = y~poly(x,3,raw=T),
lty = 1, col = "red",se = F)+
theme_bw()
polynom4 <- ggplot(Auto,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="darkgreen") +
stat_smooth(method = "lm",
formula = y~poly(x,4,raw=T),
lty = 1, col = "red",se = F)+
theme_bw()
plot_grid(polynom1, polynom2, polynom3, polynom4, labels=c("poly1","poly2", "poly3", "poly4"), label_size = 6)B. Fungsi Tangga (Piecewise Constant)
set.seed(1234)
tangga <- function(dataset){
set.seed(123)
cross_val <- vfold_cv(dataset,v=5,strata = "mpg")
breaks <- 3:15
best_tangga <- map_dfr(breaks, function(i){
metric_tangga <- map_dfr(cross_val$splits,
function(x){
training <- dataset[x$in_id,]
training$horsepower <- cut(training$horsepower,i)
mod <- lm(mpg ~ horsepower,
data=training)
labs_x <- levels(mod$model[,2])
labs_x_breaks <- cbind(lower = as.numeric( sub("\\((.+),.*", "\\1", labs_x) ),
upper = as.numeric( sub("[^,]*,([^]]*)\\]", "\\1", labs_x) ))
testing <- dataset[-x$in_id,]
horsepower_new <- cut(testing$horsepower,c(labs_x_breaks[1,1],labs_x_breaks[,2]))
pred <- predict(mod,
newdata=list(horsepower=horsepower_new))
truth <- testing$mpg
data_eval <- na.omit(data.frame(truth,pred))
rmse <- mlr3measures::rmse(truth = data_eval$truth,
response = data_eval$pred
)
mae <- mlr3measures::mae(truth = data_eval$truth,
response = data_eval$pred
)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
colMeans(metric_tangga) # menghitung rata-rata untuk 5 folds
}
)
best_tangga <- cbind(breaks=breaks,best_tangga) # menampilkan hasil all breaks
basedonrmse <- as.data.frame(best_tangga %>% slice_min(rmse))
basedonmae <- as.data.frame(best_tangga %>% slice_min(mae))
base <- rbind(basedonrmse,basedonmae)
return(list(Hasil=best_tangga, Hasil_baik=base))
}
tangga_pc = tangga(Auto)
tangga_pc## $Hasil
## breaks rmse mae
## 1 3 5.788482 4.522081
## 2 4 5.026946 3.778564
## 3 5 4.737723 3.588164
## 4 6 4.728788 3.573371
## 5 7 4.590146 3.411292
## 6 8 4.492694 3.419407
## 7 9 4.582439 3.494722
## 8 10 4.609092 3.476580
## 9 11 4.539173 3.396151
## 10 12 4.552770 3.372860
## 11 13 4.579014 3.436639
## 12 14 4.470310 3.305084
## 13 15 4.337090 3.321418
##
## $Hasil_baik
## breaks rmse mae
## 1 15 4.33709 3.321418
## 2 14 4.47031 3.305084
Secara empiris berdasarkan rmse banyaknya interval yang terbaik adalah 15 sedangkan berdasarkan mae banyaknya interval terbaik adalah 14.
tangga14 <- ggplot(Auto,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="darkgreen") +
stat_smooth(method = "lm",
formula = y~cut(x,14),
lty = 1, col = "red",se = F)+
theme_bw()
tangga15 <- ggplot(Auto,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="darkgreen") +
stat_smooth(method = "lm",
formula = y~cut(x,15),
lty = 1, col = "red",se = F)+
theme_bw()
plot_grid(tangga14, tangga15, labels = c( "tangga14", "tangga15"), label_size = 10)
Jika dilihat berdasarkan visual, interval 14 memiliki kurva fungsi tangga yang semakin mulus dan membentuk pola yang mengikuti data tetapi terdapat bentuk-bentuk yang menjadi tidak teratur. Jadi, pada metode fungsi tangga atau piecewise constant ini dipilih interval 15.
C. Natural Cubic Spline
set.seed(1234)
cross.val <- vfold_cv(Auto,v=5,strata = "mpg")
df <- 2:12
best.spline3 <- map_dfr(df, function(i){
metric.spline3 <- map_dfr(cross.val$splits,
function(x){
mod <- lm(mpg ~ ns(horsepower,df=i),data=Auto[x$in_id,])
pred <- predict(mod,newdata=Auto[-x$in_id,])
truth <- Auto[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth,response = pred)
mae <- mlr3measures::mae(truth = truth,response = pred)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
metric.spline3
mean.metric.spline3 <- colMeans(metric.spline3) #rata-rata untuk 5 folds
mean.metric.spline3
}
)
best.spline3 <- cbind(df=df,best.spline3)
basis<-data.frame(rbind(best.spline3 %>% slice_min(rmse), #berdasarkan rmse
best.spline3 %>% slice_min(mae))) #berdasarkan mae
best.spline3## df rmse mae
## 1 2 4.339045 3.248440
## 2 3 4.344116 3.253468
## 3 4 4.317916 3.235385
## 4 5 4.302687 3.220522
## 5 6 4.296303 3.211792
## 6 7 4.272604 3.195794
## 7 8 4.291248 3.219978
## 8 9 4.289420 3.211435
## 9 10 4.248032 3.169629
## 10 11 4.342002 3.236346
## 11 12 4.321401 3.212768
basis## df rmse mae
## 1 10 4.248032 3.169629
## 2 10 4.248032 3.169629
Banyaknya fungsi basis terpilih ada 10.
Menentukan Knot Berdasarkan Banyaknya Fungsi Basis Terpilih
df_10<-attr(ns(Auto$horsepower, df=10),"knots")
df_10## 10% 20% 30% 40% 50% 60% 70% 80% 90%
## 67.0 72.0 80.0 88.0 93.5 100.0 110.0 140.0 157.7
Sehingga dapat disimpulkan, bahwa Secara empiris berdasarkan rmse dan mae, banyaknya knot terbaik adalah 9 knot, yaitu knot 67, 72, 80, 88, 93.5, 100, 110, 140, dan 157.7.
nc10 <- ggplot(Auto, aes(x = horsepower, y = mpg)) +
geom_point(alpha = 0.55, color="darkgreen") +
stat_smooth(method = "lm", formula = y~ns(x, knots = c(67, 72, 80, 88, 93.5, 100, 110, 140, 157.7)), col = "red", se = F) +
theme_bw()
nc10D. Perbandingan Model
Secara Empiris
pol <- as.matrix(Polynom[3,])
tang <- as.matrix(tangga_pc$Hasil_baik[1,])
natur <- as.matrix(basis[1,])
perbandingan.model <- rbind(pol,tang,natur)
rownames(perbandingan.model) <- c("Polinomial Derajat 3","Piecewise Constant 15","NCS 9 Knot(df=10)")
as.data.frame(perbandingan.model[,-1])## rmse mae
## Polinomial Derajat 3 4.356984 3.259236
## Piecewise Constant 15 4.337090 3.321418
## NCS 9 Knot(df=10) 4.248032 3.169629
Berdasarkan ketiga Model diatas, Model Natural Cubic Spline memiliki nilai rsme dan mae terkecil. Artinya Model Natural Cubic Spline dengan df = 10 adalah Model terbaik untuk data Auto dengan peubah mpg vs horsepower.
plot_grid(polynom2, tangga15, nc10, labels = c("poly3", "tangga15", "nc10"), label_size = 10)4. mpg VS weight
Visualisasi Data
plot(Auto$weight, Auto$mpg,pch=19, col="#FF0000",
xlab="weight", ylab="mile per gallon")
Dari visualisasi data, dapat disimpulkan bahwa semakin berat(weight) dari suatu kendaraan, maka semakin pendek jarak yang ditempuh per galon bahan bakarnya. Jika diamati lebih detail dari visualisasi datanya, pola data ini cenderung tidak linier, ada beberapa pendekatan yang bisa digunakan. Pada kasus seperti ini dapat digunakan pendekatan Regresi Polinomial, Piecewise COnstant, dan Natural Cubic Splines.
A. Regresi Polinomial
Akan dilakukan cross validation untuk memilih derajat polinomial agar menghasilkan model mpg vs weight yang optimal.
set.seed(1234)
polynom_w <- function(dataset){
best = c()
degree = 1:4
for (i in degree) {
set.seed(123)
cross_val <- vfold_cv(dataset,v=5,strata = "mpg")
metric_poly <- map_dfr(cross_val$splits,
function(x){
mod <- lm(mpg ~ poly(weight,i,raw = T),
data=dataset[x$in_id,])
pred <- predict(mod,
newdata=dataset[-x$in_id,])
truth <- dataset[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth,response = pred)
mae <- mlr3measures::mae(truth = truth,response = pred)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
RL <- colMeans(metric_poly) #menghitung rmse dan mae rata-rata untuk 5 folds
best = c(best,RL)
}
Hasil = matrix(best, byrow = T, ncol = 2)
colnames(Hasil) <- c("rmse","mae")
Hasil_Akhir = as.data.frame(cbind(degree,Hasil))
return(Hasil_Akhir)
}
Polynom <- polynom_w(Auto)
Polynom## degree rmse mae
## 1 1 4.310464 3.280982
## 2 2 4.150345 3.054677
## 3 3 4.161762 3.060332
## 4 4 4.172705 3.065323
Secara empiris regresi polinomial dengan derajat 2 memiliki rmse dan mae terkecil, sehingga akan dipilih regresi polinomial dengan derajat 2
polynom_w1 <- ggplot(Auto,aes(x=weight, y=mpg)) +
geom_point(alpha=0.55, color="red") +
stat_smooth(method = "lm",
formula = y~poly(x,1,raw=T),
lty = 1, col = "black",se = F)+
theme_bw()
polynom_w2 <- ggplot(Auto,aes(x=weight, y=mpg)) +
geom_point(alpha=0.55, color="red") +
stat_smooth(method = "lm",
formula = y~poly(x,2,raw=T),
lty = 1, col = "black",se = F)+
theme_bw()
polynom_w3 <- ggplot(Auto,aes(x=weight, y=mpg)) +
geom_point(alpha=0.55, color="red") +
stat_smooth(method = "lm",
formula = y~poly(x,3,raw=T),
lty = 1, col = "black",se = F)+
theme_bw()
polynom_w4 <- ggplot(Auto,aes(x=weight, y=mpg)) +
geom_point(alpha=0.55, color="red") +
stat_smooth(method = "lm",
formula = y~poly(x,4,raw=T),
lty = 1, col = "black",se = F)+
theme_bw()
plot_grid(polynom_w1, polynom_w2, polynom_w3, polynom_w4, labels=c("poly1","poly2", "poly3", "poly4"), label_size = 6)B. Fungsi Tangga (Piecewise Constant)
set.seed(1234)
tangga_w <- function(dataset){
set.seed(1234)
cross_val <- vfold_cv(dataset,v=5,strata = "mpg")
breaks <- 3:15
best_tangga <- map_dfr(breaks, function(i){
metric_tangga <- map_dfr(cross_val$splits,
function(x){
training <- dataset[x$in_id,]
training$weight <- cut(training$weight,i)
mod <- lm(mpg ~ weight,
data=training)
labs_x <- levels(mod$model[,2])
labs_x_breaks <- cbind(lower = as.numeric( sub("\\((.+),.*", "\\1", labs_x) ),
upper = as.numeric( sub("[^,]*,([^]]*)\\]", "\\1", labs_x) ))
testing <- dataset[-x$in_id,]
weight_new <- cut(testing$weight,c(labs_x_breaks[1,1],labs_x_breaks[,2]))
pred <- predict(mod,
newdata=list(weight=weight_new))
truth <- testing$mpg
data_eval <- na.omit(data.frame(truth,pred))
rmse <- mlr3measures::rmse(truth = data_eval$truth,
response = data_eval$pred
)
mae <- mlr3measures::mae(truth = data_eval$truth,
response = data_eval$pred
)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
colMeans(metric_tangga) # menghitung rata-rata untuk 5 folds
}
)
best_tangga <- cbind(breaks=breaks,best_tangga) # menampilkan hasil all breaks
basedonrmse <- as.data.frame(best_tangga %>% slice_min(rmse))
basedonmae <- as.data.frame(best_tangga %>% slice_min(mae))
base <- rbind(basedonrmse,basedonmae)
return(list(Hasil=best_tangga, Hasil_baik=base))
}
tangga_pc = tangga_w(Auto)
tangga_pc## $Hasil
## breaks rmse mae
## 1 3 4.867134 3.646155
## 2 4 4.718623 3.634154
## 3 5 4.351185 3.234412
## 4 6 4.239158 3.135169
## 5 7 4.288633 3.183777
## 6 8 4.413814 3.262903
## 7 9 4.349397 3.186285
## 8 10 4.270401 3.152336
## 9 11 4.290949 3.213980
## 10 12 4.225722 3.115210
## 11 13 4.214988 3.145547
## 12 14 4.275595 3.148278
## 13 15 4.321057 3.172404
##
## $Hasil_baik
## breaks rmse mae
## 1 13 4.214988 3.145547
## 2 12 4.225722 3.115210
Secara empiris berdasarkan rmse banyaknya interval yang terbaik adalah 13 sedangkan berdasarkan mae banyaknya interval terbaik adalah 12.
tangga_w12 <- ggplot(Auto,aes(x=weight, y=mpg)) +
geom_point(alpha=0.55, color="red") +
stat_smooth(method = "lm",
formula = y~cut(x,12),
lty = 1, col = "black",se = F)+
theme_bw()
tangga_w13 <- ggplot(Auto,aes(x=weight, y=mpg)) +
geom_point(alpha=0.55, color="red") +
stat_smooth(method = "lm",
formula = y~cut(x,13),
lty = 1, col = "black",se = F)+
theme_bw()
plot_grid(tangga_w12, tangga_w13, labels = c( "tangga12", "tangga13"), label_size = 10)
Jika dilihat berdasarkan visual gambar terlihat bahwa untuk interval 13, kurva dari fungsi tangga ini akan semakin mulus (anak tangganya kecil-kecil karena anak tangganya banyak) tidak seperti visual gambar untuk interval 12. Jadi, pada metode piecewise constant ini dipilih interval 13.
C. Natural Cubic Spline
set.seed(1234)
cross.val <- vfold_cv(Auto,v=5,strata = "mpg")
df <- 2:12
best.spline3 <- map_dfr(df, function(i){
metric.spline3 <- map_dfr(cross.val$splits,
function(x){
mod <- lm(mpg ~ ns(weight,df=i),data=Auto[x$in_id,])
pred <- predict(mod,newdata=Auto[-x$in_id,])
truth <- Auto[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth,response = pred)
mae <- mlr3measures::mae(truth = truth,response = pred)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
metric.spline3
mean.metric.spline3 <- colMeans(metric.spline3) #rata-rata untuk 5 folds
mean.metric.spline3
}
)
best.spline3 <- cbind(df=df,best.spline3)
basis_w<-data.frame(rbind(best.spline3 %>% slice_min(rmse), #berdasarkan rmse
best.spline3 %>% slice_min(mae))) #berdasarkan mae
best.spline3## df rmse mae
## 1 2 4.153898 3.074347
## 2 3 4.175257 3.080426
## 3 4 4.185575 3.085008
## 4 5 4.178767 3.087498
## 5 6 4.162512 3.093777
## 6 7 4.147974 3.060741
## 7 8 4.161714 3.077825
## 8 9 4.181676 3.076304
## 9 10 4.178939 3.080789
## 10 11 4.163568 3.074024
## 11 12 4.178507 3.091503
basis_w## df rmse mae
## 1 7 4.147974 3.060741
## 2 7 4.147974 3.060741
Banyaknya fungsi basis terpilih ada 7.
Menentukan Knot Berdasarkan Banyaknya Fungsi Basis Terpilih
df_7<-attr(ns(Auto$weight, df=7),"knots")
df_7## 14.28571% 28.57143% 42.85714% 57.14286% 71.42857% 85.71429%
## 2074.857 2285.429 2635.000 2986.571 3446.143 4096.286
Sehingga dapat disimpulkan, bahwa Secara empiris berdasarkan rmse dan mae, banyaknya knot terbaik adalah 6 knot, yaitu knot 2074.857, 2285.429, 2635, 2986.571, 3446.143, dan 4096.286.
nc7 <- ggplot(Auto, aes(x = weight, y = mpg)) +
geom_point(alpha = 0.55, color="red") +
stat_smooth(method = "lm", formula = y~ns(x, knots = c(2074.857, 2285.429, 2635, 2986.571, 3446.143, 4096.286)), col = "black", se = F) +
theme_bw()
nc7D. Perbandingan Model
Secara Empiris
pol_w <- as.matrix(Polynom[2,])
tang_w <- as.matrix(tangga_pc$Hasil_baik[1,])
natur_w <- as.matrix(basis_w[1,])
perbandingan.model <- rbind(pol_w,tang_w,natur_w)
rownames(perbandingan.model) <- c("Polinomial Derajat 2","Piecewise Constant 13","NCS 6 Knot(df=7)")
as.data.frame(perbandingan.model[,-1])## rmse mae
## Polinomial Derajat 2 4.150345 3.054677
## Piecewise Constant 13 4.214988 3.145547
## NCS 6 Knot(df=7) 4.147974 3.060741
Secara empiris jika melihat nilai rsme dan mae, Model Natural Cubic Spline memiliki nilai rsme terkecil, Polinomial Derajat 2 memiliki nilai mae terkecil. Jika kita memilih Model terbaik berdasarkan nilai rsme terkecil maka Model Natural Cubic Spline adalah Model terbaik untuk data Auto untuk peubah mpg vs weight.
plot_grid(polynom_w2, tangga_w13, nc7, labels = c("poly2", "tangga13", "nc7"), label_size = 10) # 5. mpg VS displacement
Visualisasi Data
plot(Auto$displacement, Auto$mpg,pch=19, col="#0000FF",
xlab="displacement", ylab="mile per gallon")
Dari visualisasi data, dapat disimpulkan bahwa semakin jauh jarak tempuh(displacement) dari suatu kendaraan, maka semakin pendek jarak yang ditempuh per galon bahan bakarnya. Jika diamati lebih detail dari visualisasi datanya, pola data ini cenderung tidak linier, ada beberapa pendekatan yang bisa digunakan. Pada kasus seperti ini dapat digunakan pendekatan Regresi Polinomial, Piecewise COnstant, dan Natural Cubic Splines.
A. Regresi Polinomial
Akan dilakukan cross validation untuk memilih derajat polinomial agar menghasilkan model mpg vs displacement yang optimal.
set.seed(1234)
polynom_d <- function(dataset){
best = c()
degree = 1:4
for (i in degree) {
set.seed(123)
cross_val <- vfold_cv(dataset,v=5,strata = "mpg")
metric_poly <- map_dfr(cross_val$splits,
function(x){
mod <- lm(mpg ~ poly(displacement,i,raw = T),
data=dataset[x$in_id,])
pred <- predict(mod,
newdata=dataset[-x$in_id,])
truth <- dataset[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth,response = pred)
mae <- mlr3measures::mae(truth = truth,response = pred)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
RL <- colMeans(metric_poly) #menghitung rmse dan mae rata-rata untuk 5 folds
best = c(best,RL)
}
Hasil = matrix(best, byrow = T, ncol = 2)
colnames(Hasil) <- c("rmse","mae")
Hasil_Akhir = as.data.frame(cbind(degree,Hasil))
return(Hasil_Akhir)
}
Polynom <- polynom_d(Auto)
Polynom## degree rmse mae
## 1 1 4.583160 3.516231
## 2 2 4.304907 3.159841
## 3 3 4.312431 3.170241
## 4 4 4.319214 3.164415
Secara empiris regresi polinomial dengan derajat 2 memiliki rmse dan mae terkecil, sehingga akan dipilih regresi polinomial dengan derajat 2
polynom_d1 <- ggplot(Auto,aes(x=displacement, y=mpg)) +
geom_point(alpha=0.55, color="blue") +
stat_smooth(method = "lm",
formula = y~poly(x,1,raw=T),
lty = 1, col = "black",se = F)+
theme_bw()
polynom_d2 <- ggplot(Auto,aes(x=displacement, y=mpg)) +
geom_point(alpha=0.55, color="blue") +
stat_smooth(method = "lm",
formula = y~poly(x,2,raw=T),
lty = 1, col = "black",se = F)+
theme_bw()
polynom_d3 <- ggplot(Auto,aes(x=displacement, y=mpg)) +
geom_point(alpha=0.55, color="blue") +
stat_smooth(method = "lm",
formula = y~poly(x,3,raw=T),
lty = 1, col = "black",se = F)+
theme_bw()
polynom_d4 <- ggplot(Auto,aes(x=displacement, y=mpg)) +
geom_point(alpha=0.55, color="blue") +
stat_smooth(method = "lm",
formula = y~poly(x,4,raw=T),
lty = 1, col = "black",se = F)+
theme_bw()
plot_grid(polynom_d1, polynom_d2, polynom_d3, polynom_d4, labels=c("poly1","poly2", "poly3", "poly4"), label_size = 6)B. Fungsi Tangga (Piecewise Constant)
set.seed(1234)
tangga_d <- function(dataset){
set.seed(1234)
cross_val <- vfold_cv(dataset,v=5,strata = "mpg")
breaks <- 3:12
best_tangga <- map_dfr(breaks, function(i){
metric_tangga <- map_dfr(cross_val$splits,
function(x){
training <- dataset[x$in_id,]
training$displacement <- cut(training$displacement,i)
mod <- lm(mpg ~ displacement,
data=training)
labs_x <- levels(mod$model[,2])
labs_x_breaks <- cbind(lower = as.numeric( sub("\\((.+),.*", "\\1", labs_x) ),
upper = as.numeric( sub("[^,]*,([^]]*)\\]", "\\1", labs_x) ))
testing <- dataset[-x$in_id,]
displacement_new <- cut(testing$displacement,c(labs_x_breaks[1,1],labs_x_breaks[,2]))
pred <- predict(mod,
newdata=list(displacement=displacement_new))
truth <- testing$mpg
data_eval <- na.omit(data.frame(truth,pred))
rmse <- mlr3measures::rmse(truth = data_eval$truth,
response = data_eval$pred
)
mae <- mlr3measures::mae(truth = data_eval$truth,
response = data_eval$pred
)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
colMeans(metric_tangga) # menghitung rata-rata untuk 5 folds
}
)
best_tangga <- cbind(breaks=breaks,best_tangga) # menampilkan hasil all breaks
basedonrmse <- as.data.frame(best_tangga %>% slice_min(rmse))
basedonmae <- as.data.frame(best_tangga %>% slice_min(mae))
base <- rbind(basedonrmse,basedonmae)
return(list(Hasil=best_tangga, Hasil_baik=base))
}
tangga_dpc = tangga_d(Auto)
tangga_dpc## $Hasil
## breaks rmse mae
## 1 3 4.938258 3.673577
## 2 4 4.852811 3.628166
## 3 5 4.702897 3.502741
## 4 6 4.646896 3.431179
## 5 7 4.596168 3.375100
## 6 8 4.413880 3.247992
## 7 9 4.245699 3.095430
## 8 10 4.251116 3.090994
## 9 11 4.402738 3.249221
## 10 12 4.415702 3.259953
##
## $Hasil_baik
## breaks rmse mae
## 1 9 4.245699 3.095430
## 2 10 4.251116 3.090994
Secara empiris berdasarkan rmse banyaknya interval yang terbaik adalah 9 sedangkan berdasarkan mae banyaknya interval terbaik adalah 10.
tangga_d9 <- ggplot(Auto,aes(x=displacement, y=mpg)) +
geom_point(alpha=0.55, color="blue") +
stat_smooth(method = "lm",
formula = y~cut(x,9),
lty = 1, col = "black",se = F)+
theme_bw()
tangga_d10 <- ggplot(Auto,aes(x=displacement, y=mpg)) +
geom_point(alpha=0.55, color="blue") +
stat_smooth(method = "lm",
formula = y~cut(x,10),
lty = 1, col = "black",se = F)+
theme_bw()
plot_grid(tangga_d9, tangga_d10, labels = c( "tangga9", "tangga10"), label_size = 10)
Jika dilihat berdasarkan visual, interval 10 memiliki kurva fungsi tangga yang semakin mulus dan membentuk pola yang mengikuti data tetapi terdapat bentuk-bentuk yang menjadi tidak teratur. Jadi, pada metode fungsi tangga atau piecewise constant ini dipilih interval 9.
C. Natural Cubic Spline
set.seed(1234)
cross.val <- vfold_cv(Auto,v=5,strata = "mpg")
df <- 2:4
best.spline3 <- map_dfr(df, function(i){
metric.spline3 <- map_dfr(cross.val$splits,
function(x){
mod <- lm(mpg ~ ns(displacement,df=i),data=Auto[x$in_id,])
pred <- predict(mod,newdata=Auto[-x$in_id,])
truth <- Auto[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth,response = pred)
mae <- mlr3measures::mae(truth = truth,response = pred)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
metric.spline3
mean.metric.spline3 <- colMeans(metric.spline3) #rata-rata untuk 5 folds
mean.metric.spline3
}
)
best.spline3 <- cbind(df=df,best.spline3)
basis_d<-data.frame(rbind(best.spline3 %>% slice_min(rmse), #berdasarkan rmse
best.spline3 %>% slice_min(mae))) #berdasarkan mae
best.spline3## df rmse mae
## 1 2 4.348942 3.167333
## 2 3 4.358707 3.176956
## 3 4 4.370711 3.188265
basis_d## df rmse mae
## 1 2 4.348942 3.167333
## 2 2 4.348942 3.167333
Banyaknya fungsi basis terpilih ada 2.
Menentukan Knot Berdasarkan Banyaknya Fungsi Basis Terpilih
df_2<-attr(ns(Auto$displacement, df=2),"knots")
df_2## 50%
## 151
Sehingga dapat disimpulkan, bahwa Secara empiris berdasarkan rmse dan mae, banyaknya knot terbaik adalah 1 knot, yaitu knot 151.
nc2 <- ggplot(Auto, aes(x = displacement, y = mpg)) +
geom_point(alpha = 0.55, color="blue") +
stat_smooth(method = "lm", formula = y~ns(x, knots = c(151)), col = "black", se = F) +
theme_bw()
nc2D. Perbandingan Model
Secara Empiris
pol_d <- as.matrix(Polynom[2,])
tang_d <- as.matrix(tangga_dpc$Hasil_baik[1,])
natur_d <- as.matrix(basis_d[1,])
perbandingan.model <- rbind(pol_d,tang_d,natur_d)
rownames(perbandingan.model) <- c("Polinomial Derajat 2","Piecewise Constant 9","NCS 2 Knot(df=1)")
as.data.frame(perbandingan.model[,-1])## rmse mae
## Polinomial Derajat 2 4.304907 3.159841
## Piecewise Constant 9 4.245699 3.095430
## NCS 2 Knot(df=1) 4.348942 3.167333
Berdasarkan ketiga Model diatas, Model piecewise Constant 9 memiliki nilai rsme dan mae terkecil. Artinya Model piecewise Constant 9 adalah Model terbaik untuk data Auto dengan peubah mpg vs displacement.
plot_grid(polynom_d2, tangga_d9, nc2, labels = c("poly2", "tangga9", "nc2"), label_size = 10)6. Perbandingan Semua Model Terbaik dari Setiap Pasangan Pembanding
Secara Empiris
perbandingan.model <- rbind(natur,natur_w,tang_d)
rownames(perbandingan.model) <- c("horsepower(NCS 9 Knot(df=10))","weight(NCS 6 Knot(df=7))","displacement(Model piecewise Constant 9)")
as.data.frame(perbandingan.model[,-1])## rmse mae
## horsepower(NCS 9 Knot(df=10)) 4.248032 3.169629
## weight(NCS 6 Knot(df=7)) 4.147974 3.060741
## displacement(Model piecewise Constant 9) 4.245699 3.095430
Bedasarkan perbandingan model dengan melihat rsme, Model Natural Cubic Spline adalah model terbaik untuk pada data Auto dengan peubah mpg vs horsepower dan mpg vs weight, sedangkan Model Model piecewise Constant adalah model terbaik untuk pada data Auto dengan peubah mpg vs displacement.
Perbandingan visual data dengan masing-masing model terbaik
nc10 <- ggplot(Auto, aes(x = horsepower, y = mpg)) +
geom_point(alpha = 0.55, color="darkgreen") +
stat_smooth(method = "lm", formula = y~ns(x, knots = c(67, 72, 80, 88, 93.5, 100, 110, 140, 157.7)), col = "red", se = F) +
theme_bw()+
labs(title = "NCS 9 Knots (df=10)",
subtitle = "Auto Data Set - mpg vs horsepower")
nc7 <- ggplot(Auto, aes(x = weight, y = mpg)) +
geom_point(alpha = 0.55, color="red") +
stat_smooth(method = "lm", formula = y~ns(x, knots = c(2074.857, 2285.429, 2635, 2986.571, 3446.143, 4096.286)), col = "black", se = F) +
theme_bw()+
labs(title = "NCS 6 Knots (df=7)",
subtitle = "Auto Data Set - mpg vs weight")
tangga_d9 <- ggplot(Auto,aes(x=displacement, y=mpg)) +
geom_point(alpha=0.55, color="blue") +
stat_smooth(method = "lm",
formula = y~cut(x,9),
lty = 1, col = "black",se = F)+
theme_bw()+
labs(title = "Piecewise Constant 9",
subtitle = "Auto Data Set - mpg vs diplacement")
plot_grid(nc10, nc7, tangga_d9)