library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.0 v dplyr 1.0.7
## v tidyr 1.1.3 v stringr 1.4.0
## v readr 2.0.1 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
library(kableExtra)
##
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
##
## group_rows
library(gam)
## Loading required package: splines
## Loading required package: foreach
##
## Attaching package: 'foreach'
## The following objects are masked from 'package:purrr':
##
## accumulate, when
## Loaded gam 1.20
library(splines)
library(rsample)
library(ISLR)
library(cowplot)
library(ggplot2)
library(dplyr)
library(purrr)
library(knitr)
library(DT)
data yang digunakan pada project ini adalah data Auto dari package ISLR yang terdiri dari 392 observasi dan 9 variabel. variabel respon yang akan digunakan adalah variabel mpg dan variabel prediktor yang akan digunakan adalah housepower,
view(Auto)
Auto = Auto %>% select(mpg, horsepower, displacement, weight)
datatable(Auto)
Apakah ada bukti untuk hubungan non-linier untuk peubah-peubah yang anda pilih? Buat visualisasi untuk mendukung jawaban anda
#Visualisasi MPG-Horsepower
plot(Auto$horsepower, Auto$mpg,pch=19, col="#0000ff",
xlab="Horse Power", ylab="mile per gallon")
Ini merupakan scatter plot dari 392 kendaraan dimana dari masing-masing kendaraan diukur dua variabel, yaitu horse power pada sumbu X dan berapa jauh kendaraan tersebut berjalan untuk setiap galon bahan bakarnya, mile per gallon pada sumbu y. Secara umum,kendaraan dengan horse power yang besar membutuhkan bahan bakar yang banyak yang dicirikan jarak tempuh yang semakin pendek ketika konsumsi bahan bakarnya satu galon. Yang paling kanan, kekuatan mesinnya besar, konsumsi bahan bakar besar karena jarak tempuhnya kecil. Jika kita amati lebih detail dari scatter plot nya, pola data ini cenderung tidak linier, ada beberapa pendekatan yang bisa digunakan.
#Visualisasi MPG-Displacement
plot(Auto$displacement, Auto$mpg,pch=19, col="#0000ff",
xlab="displacement", ylab="mile per gallon")
Ini merupakan scatter plot dari 392 kendaraan dimana dari masing-masing kendaraan diukur dua variabel, yaitu displacement pada sumbu X dan berapa jauh kendaraan tersebut berjalan untuk setiap galon bahan bakarnya, mile per gallon pada sumbu y. Secara umum,kendaraan dengan displacement yang besar membutuhkan bahan bakar yang banyak yang dicirikan jarak tempuh yang semakin pendek ketika konsumsi bahan bakarnya satu galon. Yang paling kanan, displacementnya besar, konsumsi bahan bakar besar karena jarak tempuhnya kecil. Jika kita amati lebih detail dari scatter plot nya, pola data ini cenderung tidak linier, ada beberapa pendekatan yang bisa digunakan.
#Visualisasi MPG-Weight
plot(Auto$weight, Auto$mpg,pch=19, col="#0000ff",
xlab="weight", ylab="mile per gallon")
Ini merupakan scatter plot dari 392 kendaraan dimana dari masing-masing kendaraan diukur dua variabel, yaitu weight pada sumbu X dan berapa jauh kendaraan tersebut berjalan untuk setiap galon bahan bakarnya, mile per gallon pada sumbu y. Secara umum,kendaraan dengan weight yang besar membutuhkan bahan bakar yang banyak yang dicirikan jarak tempuh yang semakin pendek ketika konsumsi bahan bakarnya satu galon. Yang paling kanan, kekuatan mesinnya besar, konsumsi bahan bakar besar karena jarak tempuhnya kecil. Jika kita amati lebih detail dari scatter plot nya, pola data ini cenderung tidak linier, ada beberapa pendekatan yang bisa digunakan.
tentukan model non-linear terbaik untuk masing pasangan peubah (secara visual dan/atau secara empiris)
Regresi Polinomial Cross validation untuk menghasilkan pemodelan mpg vs horsepower optimal dengan regresi polinomial.
set.seed(123)
cv.poly1 <- function(dataset){
set.seed(123)
cross_val <- vfold_cv(dataset,v=5,strata = "mpg")
metric_poly1 <- map_dfr(cross_val$splits,
function(x){
mod <- lm(mpg ~ poly(horsepower,1,raw = T),
data=dataset[x$in_id,])
pred <- predict(mod,
newdata=dataset[-x$in_id,])
truth <- dataset[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth,response = pred)
mae <- mlr3measures::mae(truth = truth,response = pred)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
RegLin<- colMeans(metric_poly1) #menghitung rmse dan mae rata-rata untuk 5 folds
return(RegLin)
}
cv.poly2 <- function(dataset){
set.seed(123)
cross_val <- vfold_cv(dataset,v=5,strata = "mpg")
metric_poly2 <- map_dfr(cross_val$splits,
function(x){
mod <- lm(mpg ~ poly(horsepower,2,raw = T),
data=dataset[x$in_id,])
pred <- predict(mod,
newdata=dataset[-x$in_id,])
truth <- dataset[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth,response = pred)
mae <- mlr3measures::mae(truth = truth,response = pred)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
poly2<- colMeans(metric_poly2) #menghitung rmse dan mae rata-rata untuk 10 folds
return(poly2)
}
cv.poly3 <- function(dataset){
set.seed(123)
cross_val <- vfold_cv(dataset,v=5,strata = "mpg")
metric_poly3 <- map_dfr(cross_val$splits,
function(x){
mod <- lm(mpg ~ poly(horsepower,3,raw = T),
data=dataset[x$in_id,])
pred <- predict(mod,
newdata=dataset[-x$in_id,])
truth <- dataset[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth,response = pred)
mae <- mlr3measures::mae(truth = truth,response = pred)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
poly3<- colMeans(metric_poly3) #menghitung rmse dan mae rata-rata untuk 5 folds
return(poly3)
}
cv.poly4 <- function(dataset){
set.seed(123)
cross_val <- vfold_cv(dataset,v=5,strata = "mpg")
metric_poly4 <- map_dfr(cross_val$splits,
function(x){
mod <- lm(mpg ~ poly(horsepower,4,raw = T),
data=dataset[x$in_id,])
pred <- predict(mod,
newdata=dataset[-x$in_id,])
truth <- dataset[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth,response = pred)
mae <- mlr3measures::mae(truth = truth,response = pred)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
poly4<- colMeans(metric_poly4) #menghitung rmse dan mae rata-rata untuk 5 folds
return(poly4)
}
cvpoly <- rbind (cv.poly1(Auto),cv.poly2(Auto),cv.poly3(Auto),cv.poly4(Auto))
cvpoly
## rmse mae
## [1,] 4.898592 3.835773
## [2,] 4.370631 3.284748
## [3,] 4.372623 3.280150
## [4,] 4.373708 3.288706
Secara empiris jika dilihat dari rmse dan mae terlihat bahwa polinomial pangkat 2 memberikan nilai rmse dan mae yang terkecil sehingga dipilih polinomial pangkat 2.
poly1 <- ggplot(Auto,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="black") +
stat_smooth(method = "lm",
formula = y~poly(x,1,raw=T),
lty = 1, col = "blue",se = F)+
theme_bw()
poly2 <- ggplot(Auto,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="black") +
stat_smooth(method = "lm",
formula = y~poly(x,2,raw=T),
lty = 1, col = "blue",se = F)+
theme_bw()
poly3 <- ggplot(Auto,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="black") +
stat_smooth(method = "lm",
formula = y~poly(x,3,raw=T),
lty = 1, col = "blue",se = F)+
theme_bw()
poly4 <- ggplot(Auto,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="black") +
stat_smooth(method = "lm",
formula = y~poly(x,4,raw=T),
lty = 1, col = "blue",se = F)+
theme_bw()
plot_grid(poly1, poly2, poly3, poly4, labels=c("RegLin","poly2", "poly3", "poly4"), label_size = 6)
set.seed(123)
cv.pc <- function(dataset){
set.seed(123)
cross_val <- vfold_cv(dataset,v=5,strata = "mpg")
breaks <- 3:12
best_tangga <- map_dfr(breaks, function(i){
metric_tangga <- map_dfr(cross_val$splits,
function(x){
training <- dataset[x$in_id,]
training$horsepower <- cut(training$horsepower,i)
mod <- lm(mpg ~ horsepower,
data=training)
labs_x <- levels(mod$model[,2])
labs_x_breaks <- cbind(lower = as.numeric( sub("\\((.+),.*", "\\1", labs_x) ),
upper = as.numeric( sub("[^,]*,([^]]*)\\]", "\\1", labs_x) ))
testing <- dataset[-x$in_id,]
horsepower_new <- cut(testing$horsepower,c(labs_x_breaks[1,1],labs_x_breaks[,2]))
pred <- predict(mod,
newdata=list(horsepower=horsepower_new))
truth <- testing$mpg
data_eval <- na.omit(data.frame(truth,pred))
rmse <- mlr3measures::rmse(truth = data_eval$truth,
response = data_eval$pred
)
mae <- mlr3measures::mae(truth = data_eval$truth,
response = data_eval$pred
)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
colMeans(metric_tangga) # menghitung rata-rata untuk 5 folds
}
)
best_tangga <- cbind(breaks=breaks,best_tangga) # menampilkan hasil all breaks
basedonrmse <- as.data.frame(best_tangga %>% slice_min(rmse))
basedonmae <- as.data.frame(best_tangga %>% slice_min(mae))
return(rbind(basedonrmse,basedonmae))
}
cv.pc(Auto)
## breaks rmse mae
## 1 8 4.492694 3.419407
## 2 12 4.552770 3.372860
Secara empiris berdasarkan rmse banyaknya interval yang terbaik adalah 8 sedangkan berdasarkan mae banyaknya interval terbaik adalah 12.
pc8 <- ggplot(Auto,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="black") +
stat_smooth(method = "lm",
formula = y~cut(x,8),
lty = 1, col = "blue",se = F)+
theme_bw()
pc12 <- ggplot(Auto,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="black") +
stat_smooth(method = "lm",
formula = y~cut(x,12),
lty = 1, col = "blue",se = F)+
theme_bw()
plot_grid(pc8, pc12, labels = c( "pc8", "pc12"), label_size = 10)
Sebelumnya secara empiris berdasarkan rmse banyaknya interval yang terbaik adalah 8 sedangkan berdasarkan mae banyaknya interval terbaik adalah 12. Jika dilihat berdasarkan visual gambar terlihat bahwa untuk interval 12, kurva dari fungsi tangga ini akan semakin mulus (anak tangganya kecil-kecil karena anak tangganya banyak) dan membentuk pola yang mengikuti data tetapi ada bentuk2 yang menjadi tidak teratur. Jadi, pada metode piecewise constant ini dipilih interval 8.
set.seed(123)
cross.val <- vfold_cv(Auto,v=5,strata = "mpg")
df <- 1:12
best.spline3 <- map_dfr(df, function(i){
metric.spline3 <- map_dfr(cross.val$splits,
function(x){
mod <- lm(mpg ~ ns(horsepower,df=i),data=Auto[x$in_id,])
pred <- predict(mod,newdata=Auto[-x$in_id,])
truth <- Auto[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth,response = pred)
mae <- mlr3measures::mae(truth = truth,response = pred)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
metric.spline3
mean.metric.spline3 <- colMeans(metric.spline3) #rata-rata untuk 5 folds
mean.metric.spline3
}
)
best.spline3 <- cbind(df=df,best.spline3)
basis<-data.frame(rbind(best.spline3 %>% slice_min(rmse), #berdasarkan rmse
best.spline3 %>% slice_min(mae))) #berdasarkan mae
basis
## df rmse mae
## 1 10 4.291769 3.23118
## 2 10 4.291769 3.23118
df_10<-attr(ns(Auto$horsepower, df=10),"knots")
df_10
## 10% 20% 30% 40% 50% 60% 70% 80% 90%
## 67.0 72.0 80.0 88.0 93.5 100.0 110.0 140.0 157.7
set.seed(123)
cross.val <- vfold_cv(Auto,v=5,strata = "mpg")
metric.spline3.ns4 <- map_dfr(cross.val$splits,
function(x){
mod <- lm(mpg ~ ns(horsepower, knots = c(80, 120, 160, 200)),
data=Auto[x$in_id,])
pred <- predict(mod,
newdata=Auto[-x$in_id,])
truth <- Auto[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth,
response = pred
)
mae <- mlr3measures::mae(truth = truth,
response = pred
)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
mean.metric.spline3.ns4 <- colMeans(metric.spline3.ns4)# menghitung rata-rata untuk 5 folds
metric.spline3.ns9 <- map_dfr(cross.val$splits,
function(x){
mod <- lm(mpg ~ ns(horsepower, knots = c(67,72,80,88,93.5,100,110,140,157.7)),
data=Auto[x$in_id,])
pred <- predict(mod,
newdata=Auto[-x$in_id,])
truth <- Auto[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth,
response = pred
)
mae <- mlr3measures::mae(truth = truth,
response = pred
)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
mean.metric.spline3.ns9 <- colMeans(metric.spline3.ns9)# menghitung rata-rata untuk 5 folds
nilai.cv.ns <- rbind("NCS4" = mean.metric.spline3.ns4,
"NCS9" = mean.metric.spline3.ns9)
nama.model.ncs <- c("4 - NCS Knots 80, 120, 160, 200",
"9 - NCS (df=10)")
model.ncs<-data.frame(Model = nama.model.ncs, nilai.cv.ns)
model.ncs
## Model rmse mae
## NCS4 4 - NCS Knots 80, 120, 160, 200 4.381416 3.296385
## NCS9 9 - NCS (df=10) 4.293696 3.233161
Secara empiris berdasarkan rmse dan mae, banyaknya knot terbaik adalah 9 knot, yaitu knot 67,72,80,88,93.5,100,110,140,157.7.
nc4 <- ggplot(Auto, aes(x = horsepower, y = mpg)) +
geom_point(alpha = 0.55, color="black") +
stat_smooth(method = "lm", formula = y~ns(x, knots = c(80, 120, 160, 200)), col = "blue", se = F) +
theme_bw()
nc9 <- ggplot(Auto, aes(x = horsepower, y = mpg)) +
geom_point(alpha = 0.55, color="black") +
stat_smooth(method = "lm", formula = y~ns(x, knots = c(67,72,80,88,93.5,100,110,140,157.7)), col = "blue", se = F)+
theme_bw()
plot_grid(nc4, nc9, labels = c("nc4", "nc9"), label_size = 10)
Secara empiris berdasarkan rmse dan mae, grafik yang memuat banyaknya 9 knot lebih dapat menggambarkan data dibanding dengan 4 knot. Oleh karena itu dipilih 9 knot.
perbandingan.model <- rbind(cv.poly2(Auto), cv.pc(Auto)[1,-1], model.ncs[2,-1])
perbandingan.model$metode <- c("Polinomial Ordo 2","Piecewise Constant 8","NCS 9 Knot")
perbandingan.model
## rmse mae metode
## 1 4.370631 3.284748 Polinomial Ordo 2
## 2 4.492694 3.419407 Piecewise Constant 8
## NCS9 4.293696 3.233161 NCS 9 Knot
Berarti, model terbaik adalah NCS dengan 9 knot.
poly2 <- ggplot(Auto,aes(x=horsepower, y=mpg), add=TRUE) +
geom_point(alpha=0.55, color="black") +
stat_smooth(method = "lm",
formula = y~poly(x,2,raw=T),
lty = 1, col = "blue",se = F)+
theme_bw()
pc8 <- ggplot(Auto,aes(x=horsepower, y=mpg), add=TRUE) +
geom_point(alpha=0.55, color="black") +
stat_smooth(method = "lm",
formula = y~cut(x,8),
lty = 1, col = "blue",se = F)+
theme_bw()
nc9 <- ggplot(Auto, aes(x = horsepower, y = mpg), add=TRUE) +
geom_point(alpha = 0.55, color="black") +
stat_smooth(method = "lm", formula = y~ns(x, knots = c(67,72,80,88,93.5,100,110,140,157.7)), col = "blue", se = F)+
theme_bw()
plot_grid(poly2, pc8, nc9, labels = c("poly2","pc8", "nc9"), label_size = 6)
Cross validation untuk menghasilkan pemodelan mpg vs displacement optimal dengan regresi polinomial.
set.seed(123)
cv.poly5 <- function(dataset){
set.seed(123)
cross_val <- vfold_cv(dataset,v=5,strata = "mpg")
metric_poly1 <- map_dfr(cross_val$splits,
function(x){
mod <- lm(mpg ~ poly(displacement,1,raw = T),
data=dataset[x$in_id,])
pred <- predict(mod,
newdata=dataset[-x$in_id,])
truth <- dataset[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth,response = pred)
mae <- mlr3measures::mae(truth = truth,response = pred)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
RegLin<- colMeans(metric_poly1) #menghitung rmse dan mae rata-rata untuk 5 folds
return(RegLin)
}
cv.poly6 <- function(dataset){
set.seed(123)
cross_val <- vfold_cv(dataset,v=5,strata = "mpg")
metric_poly2 <- map_dfr(cross_val$splits,
function(x){
mod <- lm(mpg ~ poly(displacement,2,raw = T),
data=dataset[x$in_id,])
pred <- predict(mod,
newdata=dataset[-x$in_id,])
truth <- dataset[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth,response = pred)
mae <- mlr3measures::mae(truth = truth,response = pred)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
poly2<- colMeans(metric_poly2) #menghitung rmse dan mae rata-rata untuk 10 folds
return(poly2)
}
cv.poly7 <- function(dataset){
set.seed(123)
cross_val <- vfold_cv(dataset,v=5,strata = "mpg")
metric_poly3 <- map_dfr(cross_val$splits,
function(x){
mod <- lm(mpg ~ poly(displacement,3,raw = T),
data=dataset[x$in_id,])
pred <- predict(mod,
newdata=dataset[-x$in_id,])
truth <- dataset[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth,response = pred)
mae <- mlr3measures::mae(truth = truth,response = pred)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
poly3<- colMeans(metric_poly3) #menghitung rmse dan mae rata-rata untuk 5 folds
return(poly3)
}
cv.poly8 <- function(dataset){
set.seed(123)
cross_val <- vfold_cv(dataset,v=5,strata = "mpg")
metric_poly4 <- map_dfr(cross_val$splits,
function(x){
mod <- lm(mpg ~ poly(displacement,4,raw = T),
data=dataset[x$in_id,])
pred <- predict(mod,
newdata=dataset[-x$in_id,])
truth <- dataset[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth,response = pred)
mae <- mlr3measures::mae(truth = truth,response = pred)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
poly4<- colMeans(metric_poly4) #menghitung rmse dan mae rata-rata untuk 5 folds
return(poly4)
}
cvpoly <- rbind (cv.poly5(Auto),cv.poly6(Auto),cv.poly7(Auto),cv.poly8(Auto))
cvpoly
## rmse mae
## [1,] 4.583160 3.516231
## [2,] 4.304907 3.159841
## [3,] 4.312431 3.170241
## [4,] 4.319214 3.164415
Secara empiris jika dilihat dari rmse dan mae terlihat bahwa polinomial pangkat 2 memberikan nilai rmse dan mae yang terkecil sehingga dipilih polinomial pangkat 2.
poly1 <- ggplot(Auto,aes(x=displacement, y=mpg)) +
geom_point(alpha=0.55, color="black") +
stat_smooth(method = "lm",
formula = y~poly(x,1,raw=T),
lty = 1, col = "blue",se = F)+
theme_bw()
poly2 <- ggplot(Auto,aes(x=displacement, y=mpg)) +
geom_point(alpha=0.55, color="black") +
stat_smooth(method = "lm",
formula = y~poly(x,2,raw=T),
lty = 1, col = "blue",se = F)+
theme_bw()
poly3 <- ggplot(Auto,aes(x=displacement, y=mpg)) +
geom_point(alpha=0.55, color="black") +
stat_smooth(method = "lm",
formula = y~poly(x,3,raw=T),
lty = 1, col = "blue",se = F)+
theme_bw()
poly4 <- ggplot(Auto,aes(x=displacement, y=mpg)) +
geom_point(alpha=0.55, color="black") +
stat_smooth(method = "lm",
formula = y~poly(x,4,raw=T),
lty = 1, col = "blue",se = F)+
theme_bw()
plot_grid(poly1, poly2, poly3, poly4, labels=c("RegLin","poly2", "poly3", "poly4"), label_size = 6)
set.seed(123)
cross.val <- vfold_cv(Auto,v=5,strata = "mpg")
df <- 1:12
best.spline3 <- map_dfr(df, function(i){
metric.spline3 <- map_dfr(cross.val$splits,
function(x){
mod <- lm(mpg ~ ns(displacement,df=i),data=Auto[x$in_id,])
pred <- predict(mod,newdata=Auto[-x$in_id,])
truth <- Auto[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth,response = pred)
mae <- mlr3measures::mae(truth = truth,response = pred)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
metric.spline3
mean.metric.spline3 <- colMeans(metric.spline3) #rata-rata untuk 5 folds
mean.metric.spline3
}
)
best.spline3 <- cbind(df=df,best.spline3)
basis<-data.frame(rbind(best.spline3 %>% slice_min(rmse), #berdasarkan rmse
best.spline3 %>% slice_min(mae))) #berdasarkan mae
basis
## df rmse mae
## 1 12 4.096341 3.044499
## 2 12 4.096341 3.044499
df_12<-attr(ns(Auto$displacement, df=10),"knots")
df_12
## 10% 20% 30% 40% 50% 60% 70% 80% 90%
## 90 98 112 122 151 225 250 305 350
set.seed(123)
cross.val <- vfold_cv(Auto,v=5,strata = "mpg")
metric.spline3.ns4 <- map_dfr(cross.val$splits,
function(x){
mod <- lm(mpg ~ ns(displacement, knots = c(80, 120, 160, 200)),
data=Auto[x$in_id,])
pred <- predict(mod,
newdata=Auto[-x$in_id,])
truth <- Auto[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth,
response = pred
)
mae <- mlr3measures::mae(truth = truth,
response = pred
)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
mean.metric.spline3.ns4 <- colMeans(metric.spline3.ns4)# menghitung rata-rata untuk 5 folds
metric.spline3.ns9 <- map_dfr(cross.val$splits,
function(x){
mod <- lm(mpg ~ ns(displacement, knots = c(90,98,112,122,151,225,250,305,350)),
data=Auto[x$in_id,])
pred <- predict(mod,
newdata=Auto[-x$in_id,])
truth <- Auto[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth,
response = pred
)
mae <- mlr3measures::mae(truth = truth,
response = pred
)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
mean.metric.spline3.ns9 <- colMeans(metric.spline3.ns9)# menghitung rata-rata untuk 5 folds
nilai.cv.ns <- rbind("NCS4" = mean.metric.spline3.ns4,
"NCS9" = mean.metric.spline3.ns9)
nama.model.ncs <- c("4 - NCS Knots 80, 120, 160, 200",
"9 - NCS (df=10)")
model.ncs<-data.frame(Model = nama.model.ncs, nilai.cv.ns)
model.ncs
## Model rmse mae
## NCS4 4 - NCS Knots 80, 120, 160, 200 4.200113 3.122310
## NCS9 9 - NCS (df=10) 4.188001 3.143539
Secara empiris berdasarkan rmse banyaknya knot terbaik adalah 9 knot dan mae banyaknya knot terbaik adalah 4 knot.
nc4 <- ggplot(Auto, aes(x = displacement, y = mpg)) +
geom_point(alpha = 0.55, color="black") +
stat_smooth(method = "lm", formula = y~ns(x, knots = c(80, 120, 160, 200)), col = "blue", se = F) +
theme_bw()
nc9 <- ggplot(Auto, aes(x = displacement, y = mpg)) +
geom_point(alpha = 0.55, color="black") +
stat_smooth(method = "lm", formula = y~ns(x, knots = c(67,72,80,88,93.5,100,110,140,157.7)), col = "blue", se = F)+
theme_bw()
plot_grid(nc4, nc9, labels = c("nc4", "nc9"), label_size = 10)
## Warning in predict.lm(model, newdata = new_data_frame(list(x = xseq)), se.fit =
## se, : prediction from a rank-deficient fit may be misleading
Secara empiris berdasarkan rmse dan mae, grafik yang memuat banyaknya 4 knot lebih dapat menggambarkan data dibanding dengan 9 knot. Oleh karena itu dipilih 4 knot.
set.seed(123)
cv.pc2 <- function(dataset){
set.seed(123)
cross_val <- vfold_cv(dataset,v=5,strata = "mpg")
breaks <- 2:6
best_tangga <- map_dfr(breaks, function(i){
metric_tangga <- map_dfr(cross_val$splits,
function(x){
training <- dataset[x$in_id,]
training$displacement <- cut(training$displacement,i)
mod <- lm(mpg ~ displacement,
data=training)
labs_x <- levels(mod$model[,2])
labs_x_breaks <- cbind(lower = as.numeric( sub("\\((.+),.*", "\\1", labs_x) ),
upper = as.numeric( sub("[^,]*,([^]]*)\\]", "\\1", labs_x) ))
testing <- dataset[-x$in_id,]
displacement_new <- cut(testing$displacement,c(labs_x_breaks[1,1],labs_x_breaks[,2]))
pred <- predict(mod,
newdata=list(displacement=displacement_new))
truth <- testing$mpg
data_eval <- na.omit(data.frame(truth,pred))
rmse <- mlr3measures::rmse(truth = data_eval$truth,
response = data_eval$pred
)
mae <- mlr3measures::mae(truth = data_eval$truth,
response = data_eval$pred
)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
colMeans(metric_tangga) # menghitung rata-rata untuk 5 folds
}
)
best_tangga <- cbind(breaks=breaks,best_tangga) # menampilkan hasil all breaks
basedonrmse <- as.data.frame(best_tangga %>% slice_min(rmse))
basedonmae <- as.data.frame(best_tangga %>% slice_min(mae))
return(rbind(basedonrmse,basedonmae))
}
cv.pc2(Auto)
## breaks rmse mae
## 1 6 4.603742 3.40405
## 2 6 4.603742 3.40405
Secara empiris berdasarkan rmse dan mae banyaknya interval terbaik adalah 6.
pc6 <- ggplot(Auto,aes(x=displacement, y=mpg)) +
geom_point(alpha=0.55, color="black") +
stat_smooth(method = "lm",
formula = y~cut(x,6),
lty = 1, col = "blue",se = F)+
theme_bw()
pc6
perbandingan.model <- rbind(cv.poly6(Auto), cv.pc2(Auto)[1,-1], model.ncs[1,-1])
perbandingan.model$metode <- c("Polinomial Ordo 2","Piecewise Constant 6","NCS 4 Knot")
perbandingan.model
## rmse mae metode
## 1 4.304907 3.159841 Polinomial Ordo 2
## 2 4.603742 3.404050 Piecewise Constant 6
## NCS4 4.200113 3.122310 NCS 4 Knot
Berarti, model terbaik adalah NCS dengan 4 knot.
poly2 <- ggplot(Auto,aes(x=displacement, y=mpg), add=TRUE) +
geom_point(alpha=0.55, color="black") +
stat_smooth(method = "lm",
formula = y~poly(x,2,raw=T),
lty = 1, col = "blue",se = F)+
theme_bw()
pc6 <- ggplot(Auto,aes(x=displacement, y=mpg)) +
geom_point(alpha=0.55, color="black") +
stat_smooth(method = "lm",
formula = y~cut(x,6),
lty = 1, col = "blue",se = F)+
theme_bw()
nc4 <- ggplot(Auto, aes(x = displacement, y = mpg)) +
geom_point(alpha = 0.55, color="black") +
stat_smooth(method = "lm", formula = y~ns(x, knots = c(80, 120, 160, 200)), col = "blue", se = F) +
theme_bw()
plot_grid(poly2, pc6, nc4, labels = c("poly2","pc6", "nc4"), label_size = 6)
yang dibingungkan, pada gambar justru lebih baik resresi polinomial, yang menjadi bahan untuk dipelajari lebih lanjut
Cross validation untuk menghasilkan pemodelan mpg vs weight optimal dengan regresi polinomial.
set.seed(123)
cv.poly1 <- function(dataset){
set.seed(123)
cross_val <- vfold_cv(dataset,v=5,strata = "mpg")
metric_poly1 <- map_dfr(cross_val$splits,
function(x){
mod <- lm(mpg ~ poly(weight,1,raw = T),
data=dataset[x$in_id,])
pred <- predict(mod,
newdata=dataset[-x$in_id,])
truth <- dataset[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth,response = pred)
mae <- mlr3measures::mae(truth = truth,response = pred)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
RegLin<- colMeans(metric_poly1) #menghitung rmse dan mae rata-rata untuk 5 folds
return(RegLin)
}
cv.poly2 <- function(dataset){
set.seed(123)
cross_val <- vfold_cv(dataset,v=5,strata = "mpg")
metric_poly2 <- map_dfr(cross_val$splits,
function(x){
mod <- lm(mpg ~ poly(weight,2,raw = T),
data=dataset[x$in_id,])
pred <- predict(mod,
newdata=dataset[-x$in_id,])
truth <- dataset[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth,response = pred)
mae <- mlr3measures::mae(truth = truth,response = pred)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
poly2<- colMeans(metric_poly2) #menghitung rmse dan mae rata-rata untuk 10 folds
return(poly2)
}
cv.poly3 <- function(dataset){
set.seed(123)
cross_val <- vfold_cv(dataset,v=5,strata = "mpg")
metric_poly3 <- map_dfr(cross_val$splits,
function(x){
mod <- lm(mpg ~ poly(weight,3,raw = T),
data=dataset[x$in_id,])
pred <- predict(mod,
newdata=dataset[-x$in_id,])
truth <- dataset[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth,response = pred)
mae <- mlr3measures::mae(truth = truth,response = pred)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
poly3<- colMeans(metric_poly3) #menghitung rmse dan mae rata-rata untuk 5 folds
return(poly3)
}
cv.poly4 <- function(dataset){
set.seed(123)
cross_val <- vfold_cv(dataset,v=5,strata = "mpg")
metric_poly4 <- map_dfr(cross_val$splits,
function(x){
mod <- lm(mpg ~ poly(weight,4,raw = T),
data=dataset[x$in_id,])
pred <- predict(mod,
newdata=dataset[-x$in_id,])
truth <- dataset[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth,response = pred)
mae <- mlr3measures::mae(truth = truth,response = pred)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
poly4<- colMeans(metric_poly4) #menghitung rmse dan mae rata-rata untuk 5 folds
return(poly4)
}
cvpoly <- rbind (cv.poly1(Auto),cv.poly2(Auto),cv.poly3(Auto),cv.poly4(Auto))
cvpoly
## rmse mae
## [1,] 4.310464 3.280982
## [2,] 4.150345 3.054677
## [3,] 4.161762 3.060332
## [4,] 4.172705 3.065323
Secara empiris jika dilihat dari rmse dan mae terlihat bahwa polinomial pangkat 2 memberikan nilai rmse dan mae yang terkecil sehingga dipilih polinomial pangkat 2.
poly1 <- ggplot(Auto,aes(x=weight, y=mpg)) +
geom_point(alpha=0.55, color="black") +
stat_smooth(method = "lm",
formula = y~poly(x,1,raw=T),
lty = 1, col = "blue",se = F)+
theme_bw()
poly2 <- ggplot(Auto,aes(x=weight, y=mpg)) +
geom_point(alpha=0.55, color="black") +
stat_smooth(method = "lm",
formula = y~poly(x,2,raw=T),
lty = 1, col = "blue",se = F)+
theme_bw()
poly3 <- ggplot(Auto,aes(x=weight, y=mpg)) +
geom_point(alpha=0.55, color="black") +
stat_smooth(method = "lm",
formula = y~poly(x,3,raw=T),
lty = 1, col = "blue",se = F)+
theme_bw()
poly4 <- ggplot(Auto,aes(x=weight, y=mpg)) +
geom_point(alpha=0.55, color="black") +
stat_smooth(method = "lm",
formula = y~poly(x,4,raw=T),
lty = 1, col = "blue",se = F)+
theme_bw()
plot_grid(poly1, poly2, poly3, poly4, labels=c("RegLin","poly2", "poly3", "poly4"), label_size = 6)
set.seed(123)
cv.pc <- function(dataset){
set.seed(123)
cross_val <- vfold_cv(dataset,v=5,strata = "mpg")
breaks <- 3:12
best_tangga <- map_dfr(breaks, function(i){
metric_tangga <- map_dfr(cross_val$splits,
function(x){
training <- dataset[x$in_id,]
training$weight <- cut(training$weight,i)
mod <- lm(mpg ~ weight,
data=training)
labs_x <- levels(mod$model[,2])
labs_x_breaks <- cbind(lower = as.numeric( sub("\\((.+),.*", "\\1", labs_x) ),
upper = as.numeric( sub("[^,]*,([^]]*)\\]", "\\1", labs_x) ))
testing <- dataset[-x$in_id,]
weight_new <- cut(testing$weight,c(labs_x_breaks[1,1],labs_x_breaks[,2]))
pred <- predict(mod,
newdata=list(weight=weight_new))
truth <- testing$mpg
data_eval <- na.omit(data.frame(truth,pred))
rmse <- mlr3measures::rmse(truth = data_eval$truth,
response = data_eval$pred
)
mae <- mlr3measures::mae(truth = data_eval$truth,
response = data_eval$pred
)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
colMeans(metric_tangga) # menghitung rata-rata untuk 5 folds
}
)
best_tangga <- cbind(breaks=breaks,best_tangga) # menampilkan hasil all breaks
basedonrmse <- as.data.frame(best_tangga %>% slice_min(rmse))
basedonmae <- as.data.frame(best_tangga %>% slice_min(mae))
return(rbind(basedonrmse,basedonmae))
}
cv.pc(Auto)
## breaks rmse mae
## 1 6 4.248970 3.132424
## 2 12 4.255478 3.116471
Secara empiris berdasarkan rmse banyaknya interval yang terbaik adalah 6 sedangkan berdasarkan mae banyaknya interval terbaik adalah 12.
pc6 <- ggplot(Auto,aes(x=weight, y=mpg)) +
geom_point(alpha=0.55, color="black") +
stat_smooth(method = "lm",
formula = y~cut(x,6),
lty = 1, col = "blue",se = F)+
theme_bw()
pc12 <- ggplot(Auto,aes(x=weight, y=mpg)) +
geom_point(alpha=0.55, color="black") +
stat_smooth(method = "lm",
formula = y~cut(x,12),
lty = 1, col = "blue",se = F)+
theme_bw()
plot_grid(pc6, pc12, labels = c( "pc6", "pc12"), label_size = 10)
Sebelumnya secara empiris berdasarkan rmse banyaknya interval yang terbaik adalah 8 sedangkan berdasarkan mae banyaknya interval terbaik adalah 12. Jika dilihat berdasarkan visual gambar terlihat bahwa untuk interval 12, kurva dari fungsi tangga ini akan semakin mulus (anak tangganya kecil-kecil karena anak tangganya banyak) dan membentuk pola yang mengikuti data tetapi ada bentuk2 yang menjadi tidak teratur. Jadi, pada metode piecewise constant ini dipilih interval 6.
set.seed(123)
cross.val <- vfold_cv(Auto,v=5,strata = "mpg")
df <- 1:12
best.spline3 <- map_dfr(df, function(i){
metric.spline3 <- map_dfr(cross.val$splits,
function(x){
mod <- lm(mpg ~ ns(weight,df=i),data=Auto[x$in_id,])
pred <- predict(mod,newdata=Auto[-x$in_id,])
truth <- Auto[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth,response = pred)
mae <- mlr3measures::mae(truth = truth,response = pred)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
metric.spline3
mean.metric.spline3 <- colMeans(metric.spline3) #rata-rata untuk 5 folds
mean.metric.spline3
}
)
best.spline3 <- cbind(df=df,best.spline3)
basis<-data.frame(rbind(best.spline3 %>% slice_min(rmse), #berdasarkan rmse
best.spline3 %>% slice_min(mae))) #berdasarkan mae
basis
## df rmse mae
## 1 2 4.149920 3.054651
## 2 7 4.158297 3.048696
df_2<-attr(ns(Auto$weight, df=2),"knots")
df_2
## 50%
## 2803.5
df_7<-attr(ns(Auto$weight, df=7),"knots")
df_7
## 14.28571% 28.57143% 42.85714% 57.14286% 71.42857% 85.71429%
## 2074.857 2285.429 2635.000 2986.571 3446.143 4096.286
set.seed(123)
cross.val <- vfold_cv(Auto,v=5,strata = "mpg")
metric.spline3.ns2 <- map_dfr(cross.val$splits,
function(x){
mod <- lm(mpg ~ ns(weight, knots = c(2803.5
)),
data=Auto[x$in_id,])
pred <- predict(mod,
newdata=Auto[-x$in_id,])
truth <- Auto[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth,
response = pred
)
mae <- mlr3measures::mae(truth = truth,
response = pred
)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
mean.metric.spline3.ns2 <- colMeans(metric.spline3.ns2)# menghitung rata-rata untuk 5 folds
metric.spline3.ns7 <- map_dfr(cross.val$splits,
function(x){
mod <- lm(mpg ~ ns(weight, knots = c(2074.857,2285.429,2635.000,2986.571,3446.143,4096.286)),
data=Auto[x$in_id,])
pred <- predict(mod,
newdata=Auto[-x$in_id,])
truth <- Auto[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth,
response = pred
)
mae <- mlr3measures::mae(truth = truth,
response = pred
)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
mean.metric.spline3.ns7 <- colMeans(metric.spline3.ns7)# menghitung rata-rata untuk 5 folds
nilai.cv.ns <- rbind("NCS2" = mean.metric.spline3.ns2,
"NCS7" = mean.metric.spline3.ns7)
nama.model.ncs <- c("2 - NCS ",
"7 - NCS ")
model.ncs<-data.frame(Model = nama.model.ncs, nilai.cv.ns)
model.ncs
## Model rmse mae
## NCS2 2 - NCS 4.150152 3.054797
## NCS7 7 - NCS 4.166053 3.056082
Secara empiris berdasarkan rmse dan mae, banyaknya knot terbaik adalah 2 knot.
nc2 <- ggplot(Auto, aes(x = weight, y = mpg)) +
geom_point(alpha = 0.55, color="black") +
stat_smooth(method = "lm", formula = y~ns(x, knots = c(2803.5)), col = "blue", se = F) +
theme_bw()
nc7 <- ggplot(Auto, aes(x = weight, y = mpg)) +
geom_point(alpha = 0.55, color="black") +
stat_smooth(method = "lm", formula = y~ns(x, knots = c(2074.857,2285.429,2635.000,2986.571,3446.143,4096.286)), col = "blue", se = F)+
theme_bw()
plot_grid(nc2, nc7, labels = c("nc2", "nc7"), label_size = 10)
Secara empiris berdasarkan rmse dan mae, grafik yang memuat banyaknya 2 knot lebih dapat menggambarkan data dibanding dengan 7 knot. Oleh karena itu dipilih 2 knot.
perbandingan.model <- rbind(cv.poly2(Auto), cv.pc(Auto)[1,-1], model.ncs[1,-1])
perbandingan.model$metode <- c("Polinomial Ordo 2","Piecewise Constant 8","NCS 9 Knot")
perbandingan.model
## rmse mae metode
## 1 4.150345 3.054677 Polinomial Ordo 2
## 2 4.248970 3.132424 Piecewise Constant 8
## NCS2 4.150152 3.054797 NCS 9 Knot
Berarti, model terbaik adalah NCS dengan 2 knot.
poly2 <- ggplot(Auto,aes(x=weight, y=mpg), add=TRUE) +
geom_point(alpha=0.55, color="black") +
stat_smooth(method = "lm",
formula = y~poly(x,2,raw=T),
lty = 1, col = "blue",se = F)+
theme_bw()
pc6 <- ggplot(Auto,aes(x=weight, y=mpg)) +
geom_point(alpha=0.55, color="black") +
stat_smooth(method = "lm",
formula = y~cut(x,6),
lty = 1, col = "blue",se = F)+
theme_bw()
nc2 <- ggplot(Auto, aes(x = weight, y = mpg)) +
geom_point(alpha = 0.55, color="black") +
stat_smooth(method = "lm", formula = y~ns(x, knots = c(2803.5)), col = "blue", se = F) +
theme_bw()
plot_grid(poly2, pc6, nc2, labels = c("poly2","pc6", "nc2"), label_size = 6)
setelah menentukan model non-linear untuk masing pasangan peubah, didapatkan bahwa pada pasangan MPG-Horsepower model yang terbaik adalah NCS dengan 9 knot, kemudian pada pasangan MPG-displacement model yang terbaik adalah NCS dengan 4 knot, dan pada pasangan MPG-weight model yang terbaik adalah NCS dengan 2 knot.