Data yang digunakan pada tugas ini adalah data Auto yang
terdapat pada package ISLR dengan peubah respon
mpg dan tiga peubah prediktor, yaitu
displacement, horsepower dan
weight. Keterangan:
* mpg : miles per gallon
* displacement : Engine displacement (cu. inches)
* horsepower : Engine horsepower
* weight : Vehicle weight (lbs.)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.1 ✔ purrr 1.0.1
## ✔ tibble 3.1.8 ✔ dplyr 1.1.0
## ✔ tidyr 1.3.0 ✔ stringr 1.5.0
## ✔ readr 2.1.4 ✔ forcats 1.0.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(splines)
library(rsample)
library(ISLR)
library(grid)
attach(Auto)
## The following object is masked from package:ggplot2:
##
## mpg
Berikut tampilan data yang akan digunakan.
head(Auto)
## mpg cylinders displacement horsepower weight acceleration year origin
## 1 18 8 307 130 3504 12.0 70 1
## 2 15 8 350 165 3693 11.5 70 1
## 3 18 8 318 150 3436 11.0 70 1
## 4 16 8 304 150 3433 12.0 70 1
## 5 17 8 302 140 3449 10.5 70 1
## 6 15 8 429 198 4341 10.0 70 1
## name
## 1 chevrolet chevelle malibu
## 2 buick skylark 320
## 3 plymouth satellite
## 4 amc rebel sst
## 5 ford torino
## 6 ford galaxie 500
Masing-masing variabel prediktor tersebut memiliki hubungan
non-linier terhadap mpg sebagai variabel respon. Pernyataan
tersebut didukung dari hasil visualisasi antara masing-masing variabel
prediktor dengan mpg berikut.
plot(displacement, mpg, pch=19, col="coral",
xlab="Displacement", ylab="mile per gallon",main="Dispalcement vs MPG")
plot(horsepower, mpg, pch=19, col="blue",
xlab="Horse Power", ylab="mile per gallon",main="Horse Power vs MPG")
plot(weight, mpg, pch=19, col="pink",
xlab="Weight", ylab="mile per gallon",main="Weight vs MPG")
Dari ketiga plot di atas, terlihat bahwa terdapat hubungan non-linier
antara mpg dengan masing-masing variabel prediktor sehingga
analisis akan dilanjutkan dengan melakukan pemodelan regresi non-linier
untuk masing-masing prediktor.
Adapun model-model yang akan digunakan antara lain:
1. Regresi Polinomial
2. Regresi Fungsi Tangga
3. Regresi Spline
Pemilihan model terbaik pada pemodelan akan dilakukan baik secara visual maupun empiris. Secara visual, akan dilihat berdasarkan plot data aktual dan garis regresi yang dihasilkan dari pemodelan. Sedangkan secara empiris, akan dilakukan cross-validation dengan k = 10 dan kemudian akan dilihat berdasarkan nilai RMSE dan MAE terkecil.
mod_polinomial2.dis = lm(mpg ~ poly(displacement,2,raw = T),data=Auto)
mod_polinomial2.hp = lm(mpg ~ poly(horsepower,2,raw = T),data=Auto)
mod_polinomial2.weight = lm(mpg ~ poly(weight,2,raw = T),data=Auto)
#summary(mod_polinomial2.dis)
#summary(mod_polinomial2.hp)
#summary(mod_polinomial2.weight)
ggplot(Auto,aes(x=displacement, y=mpg)) +
geom_point(alpha=0.55, color="black") +
stat_smooth(method = "lm",
formula = y~poly(x,2,raw=T),
lty = 1, col = "red",se = F)+ggtitle("Poly 2 mpg~displacement")+
theme_bw()
ggplot(Auto,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="black") +
stat_smooth(method = "lm",
formula = y~poly(x,2,raw=T),
lty = 1, col = "blue",se = F)+ggtitle("Poly 2 mpg~horsepower")+
theme_bw()
ggplot(Auto,aes(x=weight, y=mpg)) +
geom_point(alpha=0.55, color="black") +
stat_smooth(method = "lm",
formula = y~poly(x,2,raw=T),
lty = 1, col = "green",se = F)+ggtitle("Poly 2 mpg~weight")+
theme_bw()
mod_polinomial3.dis = lm(mpg ~ poly(displacement,3,raw = T),data=Auto)
mod_polinomial3.hp = lm(mpg ~ poly(horsepower,3,raw = T),data=Auto)
mod_polinomial3.weight = lm(mpg ~ poly(weight,3,raw = T),data=Auto)
#summary(mod_polinomial3.dis)
#summary(mod_polinomial3.hp)
#summary(mod_polinomial3.weight)
ggplot(Auto,aes(x=displacement, y=mpg)) +
geom_point(alpha=0.55, color="black") +
stat_smooth(method = "lm",
formula = y~poly(x,3,raw=T),
lty = 1, col = "red",se = F)+ggtitle("Poly 3 mpg~displacement")+
theme_bw()
ggplot(Auto,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="black") +
stat_smooth(method = "lm",
formula = y~poly(x,3,raw=T),
lty = 1, col = "blue",se = F)+ggtitle("Poly 3 mpg~horsepower")+
theme_bw()
ggplot(Auto,aes(x=weight, y=mpg)) +
geom_point(alpha=0.55, color="black") +
stat_smooth(method = "lm",
formula = y~poly(x,3,raw=T),
lty = 1, col = "green",se = F)+ggtitle("Poly 3 mpg~weight")+
theme_bw()
set.seed(123)
cross_val <- vfold_cv(Auto,v=10,strata = "mpg")
metric_poly2 <- map_dfr(cross_val$splits,
function(x){
mod.dis <- lm(mpg ~ poly(displacement,2,raw = T),data=Auto[x$in_id,])
mod.hp <- lm(mpg ~ poly(horsepower,2,raw = T),data=Auto[x$in_id,])
mod.weight <- lm(mpg ~ poly(weight,2,raw = T),data=Auto[x$in_id,])
pred.dis <- predict(mod.dis,newdata=Auto[-x$in_id,])
pred.hp <- predict(mod.hp,newdata=Auto[-x$in_id,])
pred.weight <- predict(mod.weight,newdata=Auto[-x$in_id,])
truth <- Auto[-x$in_id,]$mpg
rmse.dis <- mlr3measures::rmse(truth = truth, response = pred.dis)
rmse.hp <- mlr3measures::rmse(truth = truth, response = pred.hp)
rmse.weight <- mlr3measures::rmse(truth = truth, response = pred.weight)
mae.dis <- mlr3measures::mae(truth = truth, response = pred.dis)
mae.hp <- mlr3measures::mae(truth = truth, response = pred.hp)
mae.weight <- mlr3measures::mae(truth = truth, response = pred.weight)
metric <- c(rmse.dis,mae.dis,rmse.hp,mae.hp,rmse.weight,mae.weight)
names(metric) <- c("rmse dis2","mae dis2","rmse hp2","mae hp2","rmse weight2", "mae weight2")
return(metric)
}
)
data.frame(metric_poly2)
## rmse.dis2 mae.dis2 rmse.hp2 mae.hp2 rmse.weight2 mae.weight2
## 1 3.821170 2.768978 4.058485 3.028373 3.557567 2.700732
## 2 5.868405 3.951945 4.620720 3.593403 4.832198 3.537171
## 3 3.805425 3.088912 4.283869 3.220835 3.921937 3.052908
## 4 3.699022 2.839425 4.186409 3.129596 3.772209 2.583726
## 5 4.013289 3.105959 4.971414 3.937423 4.043579 3.192477
## 6 4.315145 2.998973 3.528283 2.513300 4.529304 3.132098
## 7 5.274271 3.975061 4.923325 3.662118 4.667154 3.656144
## 8 4.704788 3.580378 4.065941 2.957232 4.626366 3.361480
## 9 4.519795 3.159066 3.787004 3.073819 4.021003 2.855509
## 10 2.810326 2.127999 5.085835 3.548505 3.528454 2.534795
set.seed(123)
cross_val <- vfold_cv(Auto,v=10,strata = "mpg")
metric_poly3 <- map_dfr(cross_val$splits,
function(x){
mod.dis <- lm(mpg ~ poly(displacement,3,raw = T),data=Auto[x$in_id,])
mod.hp <- lm(mpg ~ poly(horsepower,3,raw = T),data=Auto[x$in_id,])
mod.weight <- lm(mpg ~ poly(weight,3,raw = T),data=Auto[x$in_id,])
pred.dis <- predict(mod.dis,newdata=Auto[-x$in_id,])
pred.hp <- predict(mod.hp,newdata=Auto[-x$in_id,])
pred.weight <- predict(mod.weight,newdata=Auto[-x$in_id,])
truth <- Auto[-x$in_id,]$mpg
rmse.dis <- mlr3measures::rmse(truth = truth, response = pred.dis)
rmse.hp <- mlr3measures::rmse(truth = truth, response = pred.hp)
rmse.weight <- mlr3measures::rmse(truth = truth, response = pred.weight)
mae.dis <- mlr3measures::mae(truth = truth, response = pred.dis)
mae.hp <- mlr3measures::mae(truth = truth, response = pred.hp)
mae.weight <- mlr3measures::mae(truth = truth, response = pred.weight)
metric <- c(rmse.dis,mae.dis,rmse.hp,mae.hp,rmse.weight,mae.weight)
names(metric) <- c("rmse dis3","mae dis3","rmse hp3","mae hp3","rmse weight3", "mae weight3")
return(metric)
}
)
data.frame(metric_poly3)
## rmse.dis3 mae.dis3 rmse.hp3 mae.hp3 rmse.weight3 mae.weight3
## 1 3.800751 2.738098 4.048212 2.979671 3.568933 2.707171
## 2 5.853798 3.964494 4.648284 3.658177 4.832378 3.536677
## 3 3.802139 3.082411 4.299771 3.235221 3.921907 3.053014
## 4 3.670864 2.794655 4.209665 3.126702 3.772497 2.584045
## 5 4.066468 3.189759 5.006186 3.967390 4.044497 3.189447
## 6 4.287484 2.998818 3.504959 2.499097 4.529853 3.130777
## 7 5.274636 3.975702 4.912824 3.624020 4.698155 3.693690
## 8 4.703387 3.618820 4.074157 2.971644 4.660445 3.385904
## 9 4.521460 3.148937 3.773608 3.064925 4.024692 2.859357
## 10 2.863662 2.144184 5.076637 3.542601 3.531467 2.540120
Mencari nilai rata-rata RMSE dan MAE dari setiap setiap peubah
# menghitung rata-rata untuk 10 folds
mean_metric_poly2 <- colMeans(metric_poly2)
mean_metric_poly3 <- colMeans(metric_poly3)
mean_metric_poly2
## rmse dis2 mae dis2 rmse hp2 mae hp2 rmse weight2 mae weight2
## 4.283164 3.159670 4.351129 3.266460 4.149977 3.060704
mean_metric_poly3
## rmse dis3 mae dis3 rmse hp3 mae hp3 rmse weight3 mae weight3
## 4.284465 3.165588 4.355430 3.266945 4.158482 3.068020
Secara visual, akan dilakukan pemodelan untuk fungsi tangga dengan 5 titik knot.
mod_tangga.dis = lm(mpg ~ cut(displacement,5),data=Auto)
mod_tangga.hp = lm(mpg ~ cut(horsepower,5),data=Auto)
mod_tangga.weight = lm(mpg ~ cut(weight,5),data=Auto)
#summary(mod_tangga.dis)
#summary(mod_tangga.hp)
#summary(mod_tangga.weight)
ggplot(Auto,aes(x=displacement, y=mpg)) +
geom_point(alpha=0.55, color="black") +
stat_smooth(method = "lm",
formula = y~cut(x,5),
lty = 1, col = "red",se = F)+ggtitle("Cut mpg~displacement")+
theme_bw()
ggplot(Auto,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="black") +
stat_smooth(method = "lm",
formula = y~cut(x,5),
lty = 1, col = "blue",se = F)+ggtitle("Cut mpg~horsepower")+
theme_bw()
ggplot(Auto,aes(x=weight, y=mpg)) +
geom_point(alpha=0.55, color="black") +
stat_smooth(method = "lm",
formula = y~cut(x,5),
lty = 1, col = "green",se = F)+ggtitle("Cut mpg~weight")+
theme_bw()
Secara empiris, akan dilakukan pemodelan fungsi tangga untuk banyak knot mulai dari 3 hingga 10. Kemudian akan dihitung nilai RMSE dan MAE dari setiap model dan akan dipilih satu model fungsi tangga terbaik yang nantinya akan dibandingkan dengan model lainnya.
set.seed(123)
cross_val <- vfold_cv(Auto,v=10,strata = "mpg")
breaks <- 3:10
best_tangga.dis <- map_dfr(breaks, function(i){
metric_tangga.dis <- map_dfr(cross_val$splits,
function(x){
training <- Auto[x$in_id,]
training$displacement <- cut(training$displacement,i)
mod <- lm(mpg ~ displacement,data=training)
labs_x <- levels(mod$model[,2])
labs_x_breaks <- cbind(lower = as.numeric( sub("\\((.+),.*", "\\1", labs_x) ),
upper = as.numeric( sub("[^,]*,([^]]*)\\]", "\\1", labs_x) ))
testing <- Auto[-x$in_id,]
displacement_new <- cut(testing$displacement,c(labs_x_breaks[1,1],labs_x_breaks[,2]))
pred <- predict(mod,newdata=list(displacement=displacement_new))
truth <- testing$mpg
data_eval <- na.omit(data.frame(truth,pred))
rmse <- mlr3measures::rmse(truth = data_eval$truth,response = data_eval$pred)
mae <- mlr3measures::mae(truth = data_eval$truth,response = data_eval$pred)
metric <- c(rmse,mae)
names(metric) <- c("rmse displacement","mae displacement")
return(metric)
}
)
metric_tangga.dis
# menghitung rata-rata untuk 10 folds
mean_metric_tangga.dis <- colMeans(metric_tangga.dis)
mean_metric_tangga.dis
})
set.seed(123)
cross_val <- vfold_cv(Auto,v=10,strata = "mpg")
breaks <- 3:10
best_tangga.hp <- map_dfr(breaks, function(i){
metric_tangga.hp <- map_dfr(cross_val$splits,
function(x){
training <- Auto[x$in_id,]
training$horsepower <- cut(training$horsepower,i)
mod <- lm(mpg ~ horsepower,data=training)
labs_x <- levels(mod$model[,2])
labs_x_breaks <- cbind(lower = as.numeric( sub("\\((.+),.*", "\\1", labs_x) ),
upper = as.numeric( sub("[^,]*,([^]]*)\\]", "\\1", labs_x) ))
testing <- Auto[-x$in_id,]
horsepower_new <- cut(testing$horsepower,c(labs_x_breaks[1,1],labs_x_breaks[,2]))
pred <- predict(mod,newdata=list(horsepower=horsepower_new))
truth <- testing$mpg
data_eval <- na.omit(data.frame(truth,pred))
rmse <- mlr3measures::rmse(truth = data_eval$truth,response = data_eval$pred)
mae <- mlr3measures::mae(truth = data_eval$truth,response = data_eval$pred)
metric <- c(rmse,mae)
names(metric) <- c("rmse horsepower","mae horsepower")
return(metric)
}
)
metric_tangga.hp
# menghitung rata-rata untuk 10 folds
mean_metric_tangga.hp <- colMeans(metric_tangga.hp)
mean_metric_tangga.hp
})
set.seed(123)
cross_val <- vfold_cv(Auto,v=10,strata = "mpg")
breaks <- 3:10
best_tangga.weight <- map_dfr(breaks, function(i){
metric_tangga.weight <- map_dfr(cross_val$splits,
function(x){
training <- Auto[x$in_id,]
training$weight <- cut(training$weight,i)
mod <- lm(mpg ~ weight,data=training)
labs_x <- levels(mod$model[,2])
labs_x_breaks <- cbind(lower = as.numeric( sub("\\((.+),.*", "\\1", labs_x) ),
upper = as.numeric( sub("[^,]*,([^]]*)\\]", "\\1", labs_x) ))
testing <- Auto[-x$in_id,]
weight_new <- cut(testing$weight,c(labs_x_breaks[1,1],labs_x_breaks[,2]))
pred <- predict(mod,newdata=list(weight=weight_new))
truth <- testing$mpg
data_eval <- na.omit(data.frame(truth,pred))
rmse <- mlr3measures::rmse(truth = data_eval$truth,response = data_eval$pred)
mae <- mlr3measures::mae(truth = data_eval$truth,response = data_eval$pred)
metric <- c(rmse,mae)
names(metric) <- c("rmse weight","mae weight")
return(metric)
}
)
metric_tangga.weight
# menghitung rata-rata untuk 10 folds
mean_metric_tangga.weight <- colMeans(metric_tangga.weight)
mean_metric_tangga.weight
})
Menampilkan RMSE dan MAE pada break 3-10 pada setiap peubah
best_tangga <- cbind(breaks=breaks,best_tangga.dis, best_tangga.hp, best_tangga.weight)
# menampilkan hasil all breaks
best_tangga
## breaks rmse displacement mae displacement rmse horsepower mae horsepower
## 1 3 4.865081 3.676866 5.775792 4.521829
## 2 4 4.813915 3.619919 4.985270 3.774549
## 3 5 4.661119 3.496662 4.712623 3.571614
## 4 6 4.561127 3.389387 4.644383 3.548375
## 5 7 4.552850 3.372940 4.554116 3.379980
## 6 8 4.363903 3.230513 4.437597 3.405433
## 7 9 4.218350 3.109206 4.549668 3.471999
## 8 10 4.194571 3.097485 4.589172 3.455531
## rmse weight mae weight
## 1 4.868847 3.657505
## 2 4.706593 3.633584
## 3 4.389515 3.221556
## 4 4.201264 3.112221
## 5 4.279395 3.191726
## 6 4.370964 3.249765
## 7 4.341743 3.182396
## 8 4.329114 3.176390
Berdasarkan nilai RMSE dan MAE di atas, diperoleh model fungsi terbaik pada peubah displacement dengan banyaknya titik knot = 10, peubah horsepower pada knot = 8, dan peubah weight pada knot = 6. Berikut tampilan model tersebut secara visual.
#berdasarkan rmse
#best_tangga %>% slice_min(`rmse displacement`)
#berdasarkan mae
#best_tangga %>% slice_min(`mae displacement`)
ggplot(Auto,aes(x=displacement, y=mpg)) +
geom_point(alpha=0.55, color="black") +
stat_smooth(method = "lm",
formula = y~cut(x,10),
lty = 1, col = "red",se = F)+ggtitle("mpg~displacement knots=10")+
theme_bw()
ggplot(Auto,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="black") +
stat_smooth(method = "lm",
formula = y~cut(x,8),
lty = 1, col = "blue",se = F)+ggtitle("mpg~horsepower knots=8")+
theme_bw()
ggplot(Auto,aes(x=weight, y=mpg)) +
geom_point(alpha=0.55, color="black") +
stat_smooth(method = "lm",
formula = y~cut(x,6),
lty = 1, col = "green",se = F)+ggtitle("mpg~weight knots=6")+
theme_bw()
Visualisasi pada model b-spline akan dilakukan untuk df=6 (k=3) dengan letak titik knot yang ditentukan oleh komputer. (Berdasarkan nilai persentil)
mod_spline3.dis = lm(mpg ~ bs(displacement, df=6),data=Auto)
mod_spline3.hp = lm(mpg ~ bs(horsepower, df=6),data=Auto)
mod_spline3.weight = lm(mpg ~ bs(weight, df=6),data=Auto)
#summary(mod_spline3.dis)
#summary(mod_spline3.hp)
#summary(mod_spline3.weight)
ggplot(Auto,aes(x=displacement, y=mpg)) +
geom_point(alpha=0.55, color="black") +
stat_smooth(method = "lm",
formula = y~bs(x, df=6),
lty = 1,se = F)+ggtitle("bspline mpg~displacement")
ggplot(Auto,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="black") +
stat_smooth(method = "lm",
formula = y~bs(x, df=6),
lty = 1,se = F)+ggtitle("bspline mpg~horsepower")
ggplot(Auto,aes(x=weight, y=mpg)) +
geom_point(alpha=0.55, color="black") +
stat_smooth(method = "lm",
formula = y~bs(x, df=6),
lty = 1,se = F)+ggtitle("bspline mpg~weight")
Visualisasi pada model natural spline akan dilakukan untuk df=6 (k=3) dengan letak titik knot yang ditentukan oleh komputer. (Berdasarkan nilai persentil)
mod_spline3ns.dis = lm(mpg ~ ns(displacement, df=6),data=Auto)
mod_spline3ns.hp = lm(mpg ~ ns(horsepower, df=6),data=Auto)
mod_spline3ns.weight = lm(mpg ~ ns(weight, df=6),data=Auto)
#summary(mod_spline3ns.dis)
#summary(mod_spline3ns.hp)
#summary(mod_spline3ns.weight)
ggplot(Auto,aes(x=displacement, y=mpg)) +
geom_point(alpha=0.55, color="black") +
stat_smooth(method = "lm",
formula = y~ns(x, df=6),
lty = 1,se = F)+ggtitle("nspline mpg~displacement")
ggplot(Auto,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="black") +
stat_smooth(method = "lm",
formula = y~ns(x, df=6),
lty = 1,se = F)+ggtitle("nspline mpg~horsepower")
ggplot(Auto,aes(x=weight, y=mpg)) +
geom_point(alpha=0.55, color="black") +
stat_smooth(method = "lm",
formula = y~ns(x, df=6),
lty = 1,se = F)+ggtitle("nspline mpg~weight")
Secara empiris, akan dilakukan pemodelan b-spline untuk banyak knot mulai dari 3 hingga 9 (df bernilai 6 hingga 12). Kemudian akan dihitung nilai RMSE dan MAE dari setiap model dan akan dipilih satu model b-spline terbaik yang nantinya akan dibandingkan dengan model lainnya.
set.seed(123)
cross_val <- vfold_cv(Auto,v=10,strata = "mpg")
df <- 6:12
best_spline3.dis <- map_dfr(df, function(i){
metric_spline3.dis <- map_dfr(cross_val$splits,
function(x){
mod <- lm(mpg ~ bs(displacement,df=i),data=Auto[x$in_id,])
pred <- predict(mod,newdata=Auto[-x$in_id,])
truth <- Auto[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth,response = pred)
mae <- mlr3measures::mae(truth = truth,response = pred)
metric <- c(rmse,mae)
names(metric) <- c("rmse displacement","mae displacement")
return(metric)
}
)
metric_spline3.dis
# menghitung rata-rata untuk 10 folds
mean_metric_spline3.dis <- colMeans(metric_spline3.dis)
mean_metric_spline3.dis
}
)
## Warning in bs(displacement, degree = 3L, knots = c(`25%` = 104.75, `50%` =
## 148.5, : some 'x' values beyond boundary knots may cause ill-conditioned bases
## Warning in bs(displacement, degree = 3L, knots = c(`20%` = 98, `40%` = 121.4, :
## some 'x' values beyond boundary knots may cause ill-conditioned bases
## Warning in bs(displacement, degree = 3L, knots = c(`16.66667%` = 97,
## `33.33333%` = 119, : some 'x' values beyond boundary knots may cause
## ill-conditioned bases
## Warning in bs(displacement, degree = 3L, knots = c(`14.28571%` = 97,
## `28.57143%` = 108.571428571429, : some 'x' values beyond boundary knots may
## cause ill-conditioned bases
## Warning in bs(displacement, degree = 3L, knots = c(`12.5%` = 91, `25%` =
## 104.75, : some 'x' values beyond boundary knots may cause ill-conditioned bases
## Warning in bs(displacement, degree = 3L, knots = c(`11.11111%` = 91,
## `22.22222%` = 98, : some 'x' values beyond boundary knots may cause
## ill-conditioned bases
## Warning in bs(displacement, degree = 3L, knots = c(`10%` = 90, `20%` = 98, :
## some 'x' values beyond boundary knots may cause ill-conditioned bases
set.seed(123)
cross_val <- vfold_cv(Auto,v=10,strata = "mpg")
df <- 6:12
best_spline3.hp <- map_dfr(df, function(i){
metric_spline3.hp <- map_dfr(cross_val$splits,
function(x){
mod <- lm(mpg ~ bs(horsepower,df=i),data=Auto[x$in_id,])
pred <- predict(mod,newdata=Auto[-x$in_id,])
truth <- Auto[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth,response = pred)
mae <- mlr3measures::mae(truth = truth,response = pred)
metric <- c(rmse,mae)
names(metric) <- c("rmse horsepower","mae horsepower")
return(metric)
}
)
metric_spline3.hp
# menghitung rata-rata untuk 10 folds
mean_metric_spline3.hp <- colMeans(metric_spline3.hp)
mean_metric_spline3.hp
}
)
## Warning in bs(horsepower, degree = 3L, knots = c(`25%` = 75, `50%` = 92.5, :
## some 'x' values beyond boundary knots may cause ill-conditioned bases
## Warning in bs(horsepower, degree = 3L, knots = c(`20%` = 72, `40%` = 88, : some
## 'x' values beyond boundary knots may cause ill-conditioned bases
## Warning in bs(horsepower, degree = 3L, knots = c(`16.66667%` = 70, `33.33333%`
## = 84, : some 'x' values beyond boundary knots may cause ill-conditioned bases
## Warning in bs(horsepower, degree = 3L, knots = c(`14.28571%` = 68, `28.57143%`
## = 78.8571428571428, : some 'x' values beyond boundary knots may cause
## ill-conditioned bases
## Warning in bs(horsepower, degree = 3L, knots = c(`12.5%` = 67, `25%` = 75, :
## some 'x' values beyond boundary knots may cause ill-conditioned bases
## Warning in bs(horsepower, degree = 3L, knots = c(`11.11111%` = 67, `22.22222%`
## = 75, : some 'x' values beyond boundary knots may cause ill-conditioned bases
## Warning in bs(horsepower, degree = 3L, knots = c(`10%` = 66.3, `20%` = 72, :
## some 'x' values beyond boundary knots may cause ill-conditioned bases
set.seed(123)
cross_val <- vfold_cv(Auto,v=10,strata = "mpg")
df <- 6:12
best_spline3.weight <- map_dfr(df, function(i){
metric_spline3.weight <- map_dfr(cross_val$splits,
function(x){
mod <- lm(mpg ~ bs(weight,df=i),data=Auto[x$in_id,])
pred <- predict(mod,newdata=Auto[-x$in_id,])
truth <- Auto[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth,response = pred)
mae <- mlr3measures::mae(truth = truth,response = pred)
metric <- c(rmse,mae)
names(metric) <- c("rmse weight","mae weight")
return(metric)
}
)
metric_spline3.weight
# menghitung rata-rata untuk 10 folds
mean_metric_spline3.weight <- colMeans(metric_spline3.weight)
mean_metric_spline3.weight
}
)
## Warning in bs(weight, degree = 3L, knots = c(`25%` = 2227.5, `50%` = 2792.5, :
## some 'x' values beyond boundary knots may cause ill-conditioned bases
## Warning in bs(weight, degree = 3L, knots = c(`25%` = 2234, `50%` = 2835, : some
## 'x' values beyond boundary knots may cause ill-conditioned bases
## Warning in bs(weight, degree = 3L, knots = c(`20%` = 2155, `40%` = 2577.8, :
## some 'x' values beyond boundary knots may cause ill-conditioned bases
## Warning in bs(weight, degree = 3L, knots = c(`20%` = 2166.8, `40%` = 2604, :
## some 'x' values beyond boundary knots may cause ill-conditioned bases
## Warning in bs(weight, degree = 3L, knots = c(`16.66667%` = 2125, `33.33333%` =
## 2391, : some 'x' values beyond boundary knots may cause ill-conditioned bases
## Warning in bs(weight, degree = 3L, knots = c(`16.66667%` = 2130, `33.33333%` =
## 2406, : some 'x' values beyond boundary knots may cause ill-conditioned bases
## Warning in bs(weight, degree = 3L, knots = c(`14.28571%` = 2076.42857142857, :
## some 'x' values beyond boundary knots may cause ill-conditioned bases
## Warning in bs(weight, degree = 3L, knots = c(`14.28571%` = 2102.28571428571, :
## some 'x' values beyond boundary knots may cause ill-conditioned bases
## Warning in bs(weight, degree = 3L, knots = c(`12.5%` = 2049.375, `25%` =
## 2227.5, : some 'x' values beyond boundary knots may cause ill-conditioned bases
## Warning in bs(weight, degree = 3L, knots = c(`12.5%` = 2051, `25%` = 2234, :
## some 'x' values beyond boundary knots may cause ill-conditioned bases
## Warning in bs(weight, degree = 3L, knots = c(`11.11111%` = 2020, `22.22222%` =
## 2190, : some 'x' values beyond boundary knots may cause ill-conditioned bases
## Warning in bs(weight, degree = 3L, knots = c(`11.11111%` = 2021.66666666667, :
## some 'x' values beyond boundary knots may cause ill-conditioned bases
## Warning in bs(weight, degree = 3L, knots = c(`10%` = 1995.5, `20%` = 2155, :
## some 'x' values beyond boundary knots may cause ill-conditioned bases
## Warning in bs(weight, degree = 3L, knots = c(`10%` = 1996, `20%` = 2166.8, :
## some 'x' values beyond boundary knots may cause ill-conditioned bases
Menampilkan RMSE dan MAE dengan DF 6-12 pada setiap peubah
best_spline3 <- cbind(df=df,best_spline3.dis,best_spline3.hp,best_spline3.weight)
# menampilkan hasil all breaks
best_spline3
## df rmse displacement mae displacement rmse horsepower mae horsepower
## 1 6 4.128716 3.114608 4.300410 3.237509
## 2 7 4.145760 3.130172 4.319374 3.249466
## 3 8 4.143006 3.125376 4.319212 3.241480
## 4 9 4.149539 3.126854 4.319173 3.241117
## 5 10 4.156805 3.130215 4.333922 3.254318
## 6 11 4.180934 3.140893 4.323218 3.229614
## 7 12 4.163395 3.133797 4.300275 3.205095
## rmse weight mae weight
## 1 4.169320 3.097698
## 2 4.182779 3.098524
## 3 4.182940 3.120736
## 4 4.140723 3.065880
## 5 4.156925 3.096623
## 6 4.168753 3.095133
## 7 4.172423 3.093106
Dari output di atas, diperoleh model terbaik b-spline pada peubah penjelas displacement dengan df = 6, peubah horsepower dengan df = 12, dan peubah weight dengan df = 9.
Secara empiris, akan dilakukan pemodelan natural spline untuk banyak knot mulai dari 3 hingga 9 (df bernilai 6 hingga 12). Kemudian akan dihitung nilai RMSE dan MAE dari setiap model dan akan dipilih satu model natural spline terbaik yang nantinya akan dibandingkan dengan model lainnya.
set.seed(123)
cross_val <- vfold_cv(Auto,v=10,strata = "mpg")
df <- 6:12
best_nspline3.dis <- map_dfr(df, function(i){
metric_nspline3.dis <- map_dfr(cross_val$splits,
function(x){
mod <- lm(mpg ~ ns(displacement,df=i),data=Auto[x$in_id,])
pred <- predict(mod,newdata=Auto[-x$in_id,])
truth <- Auto[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth,response = pred)
mae <- mlr3measures::mae(truth = truth,response = pred)
metric <- c(rmse,mae)
names(metric) <- c("rmse displacement","mae displacement")
return(metric)
}
)
metric_nspline3.dis
# menghitung rata-rata untuk 10 folds
mean_metric_nspline3.dis <- colMeans(metric_nspline3.dis)
mean_metric_nspline3.dis
}
)
set.seed(123)
cross_val <- vfold_cv(Auto,v=10,strata = "mpg")
df <- 6:12
best_nspline3.hp <- map_dfr(df, function(i){
metric_nspline3.hp <- map_dfr(cross_val$splits,
function(x){
mod <- lm(mpg ~ ns(horsepower,df=i),data=Auto[x$in_id,])
pred <- predict(mod,newdata=Auto[-x$in_id,])
truth <- Auto[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth,response = pred)
mae <- mlr3measures::mae(truth = truth,response = pred)
metric <- c(rmse,mae)
names(metric) <- c("rmse horsepower","mae horsepower")
return(metric)
}
)
metric_nspline3.hp
# menghitung rata-rata untuk 10 folds
mean_metric_nspline3.hp <- colMeans(metric_nspline3.hp)
mean_metric_nspline3.hp
}
)
set.seed(123)
cross_val <- vfold_cv(Auto,v=10,strata = "mpg")
df <- 6:12
best_nspline3.weight <- map_dfr(df, function(i){
metric_nspline3.weight <- map_dfr(cross_val$splits,
function(x){
mod <- lm(mpg ~ ns(weight,df=i),data=Auto[x$in_id,])
pred <- predict(mod,newdata=Auto[-x$in_id,])
truth <- Auto[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth,response = pred)
mae <- mlr3measures::mae(truth = truth,response = pred)
metric <- c(rmse,mae)
names(metric) <- c("rmse weight","mae weight")
return(metric)
}
)
metric_nspline3.weight
# menghitung rata-rata untuk 10 folds
mean_metric_nspline3.weight <- colMeans(metric_nspline3.weight)
mean_metric_nspline3.weight
}
)
Menampilkan nilai RMSE dan MAE dengan df 6-12 pada setiap peubah
best_nspline3 <- cbind(df=df,best_nspline3.dis,best_nspline3.hp,best_nspline3.weight)
# menampilkan hasil all breaks
best_nspline3
## df rmse displacement mae displacement rmse horsepower mae horsepower
## 1 6 4.197436 3.127948 4.310011 3.237990
## 2 7 4.184108 3.123310 4.294671 3.229174
## 3 8 4.165765 3.134441 4.293369 3.232221
## 4 9 4.163914 3.133642 4.281877 3.211860
## 5 10 4.133528 3.116680 4.260316 3.192010
## 6 11 4.114617 3.082797 4.288667 3.220958
## 7 12 4.074090 3.040305 4.285063 3.192363
## rmse weight mae weight
## 1 4.171277 3.102353
## 2 4.141103 3.061136
## 3 4.170552 3.098226
## 4 4.175108 3.091378
## 5 4.168457 3.089614
## 6 4.162104 3.087247
## 7 4.157470 3.086991
Dari output di atas, diperoleh model terbaik natural spline pada peubah penjelas displacement dengan df = 12, peubah horsepower df = 10, dan peubah weight def = 7.
Setelah dilakukan pemodelan untuk berbagai model non-linier, maka selanjutnya akan dipilih satu model terbaik secara empiris, yaitu berdasarkan nilai RMSE dan MAE terkecil.
Dataframe model peubah displacement
nilai_metric.dis <- rbind(mean_metric_poly2[1],
mean_metric_poly3[1],
best_tangga %>% select(2:3) %>% slice_min(`rmse displacement`),
best_spline3 %>% select(2:3) %>% slice_min(`rmse displacement`),
best_nspline3 %>% select(2:3) %>% slice_min(`rmse displacement`)
)
nama_model <- c("Poly2","Poly3","Tangga(breaks=10)","b-spline(df=6)",
"natural spline(df=12)")
ringkasan.dis <- data.frame(nama_model,nilai_metric.dis)
ringkasan.dis
## nama_model rmse.displacement mae.displacement
## 1 Poly2 4.283164 4.283164
## 2 Poly3 4.284465 4.284465
## 3 Tangga(breaks=10) 4.194571 3.097485
## 4 b-spline(df=6) 4.128716 3.114608
## 5 natural spline(df=12) 4.074090 3.040305
Dataframe model peubah horsepower
nilai_metric.hp <- rbind(mean_metric_poly2[3],
mean_metric_poly3[3],
best_tangga %>% select(4:5) %>% slice_min(`rmse horsepower`),
best_spline3 %>% select(4:5) %>% slice_min(`rmse horsepower`),
best_nspline3 %>% select(4:5) %>% slice_min(`rmse horsepower`)
)
nama_model <- c("Poly2","Poly3","Tangga(breaks=8)","b-spline(df=12)",
"natural spline(df=10)")
ringkasan.hp <- data.frame(nama_model,nilai_metric.hp)
ringkasan.hp
## nama_model rmse.horsepower mae.horsepower
## 1 Poly2 4.351129 4.351129
## 2 Poly3 4.355430 4.355430
## 3 Tangga(breaks=8) 4.437597 3.405433
## 4 b-spline(df=12) 4.300275 3.205095
## 5 natural spline(df=10) 4.260316 3.192010
Dataframe model peubah weight
nilai_metric.weight <- rbind(mean_metric_poly2[5],
mean_metric_poly3[5],
best_tangga %>% select(6:7) %>% slice_min(`rmse weight`),
best_spline3 %>% select(6:7) %>% slice_min(`rmse weight`),
best_nspline3 %>% select(6:7) %>% slice_min(`rmse weight`)
)
nama_model <- c("Poly2","Poly3","Tangga(breaks=6)","b-spline(df=9)",
"natural spline(df=7)")
ringkasan.hp.weight <- data.frame(nama_model,nilai_metric.weight)
ringkasan.hp.weight
## nama_model rmse.weight mae.weight
## 1 Poly2 4.149977 4.149977
## 2 Poly3 4.158482 4.158482
## 3 Tangga(breaks=6) 4.201264 3.112221
## 4 b-spline(df=9) 4.140723 3.065880
## 5 natural spline(df=7) 4.141103 3.061136
Dari tabel ringkasan di atas, diperoleh model terbaik untuk
memodelkan hubungan antara peubah respon mpg dan peubah
prediktor displacement adalah menggunakan natural
spline dengan df = 12, hubungan antara peubah respon
mpg dan peubah prediktor horsepower adalah
menggunakan natural spline dengan df = 10, dan hubungan
antara peubah respon mpg dan peubah prediktor
weight adalah menggunakan natural spline dengan df
= 7. Berikut plot antara data aktual serta garis
regresinya.
ggplot(Auto,aes(x=displacement, y=mpg)) +
geom_point(alpha=0.55, color="black")+
stat_smooth(method = "lm",
formula = y~ns(x, df=12),
lty = 1,se=F) + ggtitle("Best model mpg~displacement with natural cubic spline df = 12")
ggplot(Auto,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="black")+
stat_smooth(method = "lm",
formula = y~ns(x, df=10),
lty = 1,se=F) + ggtitle("Best model mpg~horsepower with natural cubic spline df = 10")
ggplot(Auto,aes(x=weight, y=mpg)) +
geom_point(alpha=0.55, color="black")+
stat_smooth(method = "lm",
formula = y~ns(x, df=7),
lty = 1,se=F) + ggtitle("Best model mpg~weight with natural cubic spline df = 7")