Moving Beyond Linearity
Soal
Lakukan cross validation untuk menghasilkan pemodelan mpg vs horsepower optimal menggunakan 3 metode berikut, kemudian bandingkan hasilnya:
• Reg polinomial
• Piecewise constant
• Natural cubil splines
Lakukan hal di atas untuk masing-masing sub-set data berdasarkan asal negara produsennya (Amerika, Eropa, Jepang).
Plot model terbaik dalam satu frame.
Berikan ulasan terhadap hasil Anda.
Library
library(tidyverse)
library(splines)
library(rsample)
library(ISLR)
library(cowplot)
library(caret) # cross-validation
library(gridExtra) # combining graphs
library(ggplot2)
library(dplyr)
library(purrr)
library(dplyr)
library(knitr)
library(DT)
library(kableExtra)
library(gam) # generalized additive modelsEksplorasi Data
Data Dunia
Data yang digunakan adalah Auto dari package ISLR yang terdiri dari 392 observasi dan 9 variabel. Namun, variabel yang digunakan kali ini hanya 3, yaitu:
mpg : miles per gallon
horsepower: Engine horsepower
origin: Origin of car (1. American, 2. European, 3. Japanese)
Auto = Auto %>% select(mpg, horsepower, origin)
datatable(Auto)Visualisasi
plot(Auto$horsepower, Auto$mpg,pch=19, col="#0000ff",
xlab="Horse Power", ylab="mile per gallon") Ini merupakan scatter plot dari 392 kendaraan dimana dari masing-masing kendaraan diukur dua peubah/variabel, yaitu kekuatan mesin (horse power pada sumbu X) dan berapa jauh kendaraan tersebut berjalan untuk setiap galon bahan bakarnya (mile per gallon pada sumbu y).
Secara umum,kendaraan dengan horse power yang besar membutuhkan bahan bakar yang banyak yang dicirikan jarak tempuh yang semakin pendek ketika konsumsi bahan bakarnya satu galon. Yang paling kanan, kekuatan mesinnya besar, konsumsi bahan bakar besar karena jarak tempuhnya kecil.
Jika kita amati lebih detail dari scatter plot nya, pola data ini cenderung tidak linier, ada beberapa pendekatan yang bisa digunakan.
Dalam kasus kali ini akan digunakan pendekatan regresi polinomial, Piecewise constant, dan Natural cubic splines.
Regresi Data Dunia
Regresi Polinomial
cross validation untuk menghasilkan pemodelan mpg vs horsepower optimal dengan regresi polinomial.
set.seed(123)
cv.poly1 <- function(dataset){
set.seed(123)
cross_val <- vfold_cv(dataset,v=5,strata = "mpg")
metric_poly1 <- map_dfr(cross_val$splits,
function(x){
mod <- lm(mpg ~ poly(horsepower,1,raw = T),
data=dataset[x$in_id,])
pred <- predict(mod,
newdata=dataset[-x$in_id,])
truth <- dataset[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth,response = pred)
mae <- mlr3measures::mae(truth = truth,response = pred)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
RegLin<- colMeans(metric_poly1) #menghitung rmse dan mae rata-rata untuk 5 folds
return(RegLin)
}
cv.poly2 <- function(dataset){
set.seed(123)
cross_val <- vfold_cv(dataset,v=5,strata = "mpg")
metric_poly2 <- map_dfr(cross_val$splits,
function(x){
mod <- lm(mpg ~ poly(horsepower,2,raw = T),
data=dataset[x$in_id,])
pred <- predict(mod,
newdata=dataset[-x$in_id,])
truth <- dataset[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth,response = pred)
mae <- mlr3measures::mae(truth = truth,response = pred)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
poly2<- colMeans(metric_poly2) #menghitung rmse dan mae rata-rata untuk 10 folds
return(poly2)
}
cv.poly3 <- function(dataset){
set.seed(123)
cross_val <- vfold_cv(dataset,v=5,strata = "mpg")
metric_poly3 <- map_dfr(cross_val$splits,
function(x){
mod <- lm(mpg ~ poly(horsepower,3,raw = T),
data=dataset[x$in_id,])
pred <- predict(mod,
newdata=dataset[-x$in_id,])
truth <- dataset[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth,response = pred)
mae <- mlr3measures::mae(truth = truth,response = pred)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
poly3<- colMeans(metric_poly3) #menghitung rmse dan mae rata-rata untuk 5 folds
return(poly3)
}
cv.poly4 <- function(dataset){
set.seed(123)
cross_val <- vfold_cv(dataset,v=5,strata = "mpg")
metric_poly4 <- map_dfr(cross_val$splits,
function(x){
mod <- lm(mpg ~ poly(horsepower,4,raw = T),
data=dataset[x$in_id,])
pred <- predict(mod,
newdata=dataset[-x$in_id,])
truth <- dataset[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth,response = pred)
mae <- mlr3measures::mae(truth = truth,response = pred)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
poly4<- colMeans(metric_poly4) #menghitung rmse dan mae rata-rata untuk 5 folds
return(poly4)
}cvpoly <- rbind (cv.poly1(Auto),cv.poly2(Auto),cv.poly3(Auto),cv.poly4(Auto))
cvpoly## rmse mae
## [1,] 4.898592 3.835773
## [2,] 4.370631 3.284748
## [3,] 4.372623 3.280150
## [4,] 4.373708 3.288706
Secara empiris jika dilihat dari rmse dan mae terlihat bahwa polinomial pangkat 2 memberikan nilai rmse dan mae yang terkecil sehingga dipilih polinomial pangkat 2.
poly1 <- ggplot(Auto,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="black") +
stat_smooth(method = "lm",
formula = y~poly(x,1,raw=T),
lty = 1, col = "blue",se = F)+
theme_bw()
poly2 <- ggplot(Auto,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="black") +
stat_smooth(method = "lm",
formula = y~poly(x,2,raw=T),
lty = 1, col = "blue",se = F)+
theme_bw()
poly3 <- ggplot(Auto,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="black") +
stat_smooth(method = "lm",
formula = y~poly(x,3,raw=T),
lty = 1, col = "blue",se = F)+
theme_bw()
poly4 <- ggplot(Auto,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="black") +
stat_smooth(method = "lm",
formula = y~poly(x,4,raw=T),
lty = 1, col = "blue",se = F)+
theme_bw()
plot_grid(poly1, poly2, poly3, poly4, labels=c("RegLin","poly2", "poly3", "poly4"), label_size = 6)Fungsi Tangga (Piecewise Constant)
Secara Empiris
set.seed(123)
cv.pc <- function(dataset){
set.seed(123)
cross_val <- vfold_cv(dataset,v=5,strata = "mpg")
breaks <- 3:12
best_tangga <- map_dfr(breaks, function(i){
metric_tangga <- map_dfr(cross_val$splits,
function(x){
training <- dataset[x$in_id,]
training$horsepower <- cut(training$horsepower,i)
mod <- lm(mpg ~ horsepower,
data=training)
labs_x <- levels(mod$model[,2])
labs_x_breaks <- cbind(lower = as.numeric( sub("\\((.+),.*", "\\1", labs_x) ),
upper = as.numeric( sub("[^,]*,([^]]*)\\]", "\\1", labs_x) ))
testing <- dataset[-x$in_id,]
horsepower_new <- cut(testing$horsepower,c(labs_x_breaks[1,1],labs_x_breaks[,2]))
pred <- predict(mod,
newdata=list(horsepower=horsepower_new))
truth <- testing$mpg
data_eval <- na.omit(data.frame(truth,pred))
rmse <- mlr3measures::rmse(truth = data_eval$truth,
response = data_eval$pred
)
mae <- mlr3measures::mae(truth = data_eval$truth,
response = data_eval$pred
)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
colMeans(metric_tangga) # menghitung rata-rata untuk 5 folds
}
)
best_tangga <- cbind(breaks=breaks,best_tangga) # menampilkan hasil all breaks
basedonrmse <- as.data.frame(best_tangga %>% slice_min(rmse))
basedonmae <- as.data.frame(best_tangga %>% slice_min(mae))
return(rbind(basedonrmse,basedonmae))
}cv.pc(Auto)## breaks rmse mae
## 1 8 4.492694 3.419407
## 2 12 4.552770 3.372860
Secara empiris berdasarkan rmse banyaknya interval yang terbaik adalah 8 sedangkan berdasarkan mae banyaknya interval terbaik adalah 12.
- Secara Visual *
pc8 <- ggplot(Auto,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="black") +
stat_smooth(method = "lm",
formula = y~cut(x,8),
lty = 1, col = "blue",se = F)+
theme_bw()
pc12 <- ggplot(Auto,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="black") +
stat_smooth(method = "lm",
formula = y~cut(x,12),
lty = 1, col = "blue",se = F)+
theme_bw()
plot_grid(pc8, pc12, labels = c( "pc8", "pc12"), label_size = 10) Sebelumnya secara empiris berdasarkan rmse banyaknya interval yang terbaik adalah 8 sedangkan berdasarkan mae banyaknya interval terbaik adalah 12. Jika dilihat berdasarkan visual gambar terlihat bahwa untuk interval 12, kurva dari fungsi tangga ini akan semakin mulus (anak tangganya kecil-kecil karena anak tangganya banyak) dan membentuk pola yang mengikuti data tetapi ada bentuk2 yang menjadi tidak teratur. Jadi, pada metode piecewise constant ini dipilih interval 8.
Natural Cubic Spline
Menentukan Banyaknya Fungsi Basis
set.seed(123)
cross.val <- vfold_cv(Auto,v=5,strata = "mpg")
df <- 1:12
best.spline3 <- map_dfr(df, function(i){
metric.spline3 <- map_dfr(cross.val$splits,
function(x){
mod <- lm(mpg ~ ns(horsepower,df=i),data=Auto[x$in_id,])
pred <- predict(mod,newdata=Auto[-x$in_id,])
truth <- Auto[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth,response = pred)
mae <- mlr3measures::mae(truth = truth,response = pred)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
metric.spline3
mean.metric.spline3 <- colMeans(metric.spline3) #rata-rata untuk 5 folds
mean.metric.spline3
}
)
best.spline3 <- cbind(df=df,best.spline3)
basis<-data.frame(rbind(best.spline3 %>% slice_min(rmse), #berdasarkan rmse
best.spline3 %>% slice_min(mae))) #berdasarkan mae
basis## df rmse mae
## 1 10 4.291769 3.23118
## 2 10 4.291769 3.23118
Banyaknya fungsi basis terpilih ada 10.
Menentukan Knot Berdasarkan Banyaknya Fungsi Basis Terpilih
df_10<-attr(ns(Auto$horsepower, df=10),"knots")
df_10## 10% 20% 30% 40% 50% 60% 70% 80% 90%
## 67.0 72.0 80.0 88.0 93.5 100.0 110.0 140.0 157.7
Cross Validation
set.seed(123)
cross.val <- vfold_cv(Auto,v=5,strata = "mpg")
metric.spline3.ns4 <- map_dfr(cross.val$splits,
function(x){
mod <- lm(mpg ~ ns(horsepower, knots = c(80, 120, 160, 200)),
data=Auto[x$in_id,])
pred <- predict(mod,
newdata=Auto[-x$in_id,])
truth <- Auto[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth,
response = pred
)
mae <- mlr3measures::mae(truth = truth,
response = pred
)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
mean.metric.spline3.ns4 <- colMeans(metric.spline3.ns4)# menghitung rata-rata untuk 5 folds
metric.spline3.ns9 <- map_dfr(cross.val$splits,
function(x){
mod <- lm(mpg ~ ns(horsepower, knots = c(67,72,80,88,93.5,100,110,140,157.7)),
data=Auto[x$in_id,])
pred <- predict(mod,
newdata=Auto[-x$in_id,])
truth <- Auto[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth,
response = pred
)
mae <- mlr3measures::mae(truth = truth,
response = pred
)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
mean.metric.spline3.ns9 <- colMeans(metric.spline3.ns9)# menghitung rata-rata untuk 5 folds
nilai.cv.ns <- rbind("NCS4" = mean.metric.spline3.ns4,
"NCS9" = mean.metric.spline3.ns9)
nama.model.ncs <- c("4 - NCS Knots 80, 120, 160, 200",
"9 - NCS (df=10)")
model.ncs<-data.frame(Model = nama.model.ncs, nilai.cv.ns)
model.ncs## Model rmse mae
## NCS4 4 - NCS Knots 80, 120, 160, 200 4.381416 3.296385
## NCS9 9 - NCS (df=10) 4.293696 3.233161
Secara empiris berdasarkan rmse dan mae, banyaknya knot terbaik adalah 9 knot, yaitu knot 67,72,80,88,93.5,100,110,140,157.7.
Visualisasi Grafik
nc4 <- ggplot(Auto, aes(x = horsepower, y = mpg)) +
geom_point(alpha = 0.55, color="black") +
stat_smooth(method = "lm", formula = y~ns(x, knots = c(80, 120, 160, 200)), col = "blue", se = F) +
theme_bw()
nc9 <- ggplot(Auto, aes(x = horsepower, y = mpg)) +
geom_point(alpha = 0.55, color="black") +
stat_smooth(method = "lm", formula = y~ns(x, knots = c(67,72,80,88,93.5,100,110,140,157.7)), col = "blue", se = F)+
theme_bw()
plot_grid(nc4, nc9, labels = c("nc4", "nc9"), label_size = 10) Secara empiris berdasarkan rmse dan mae, grafik yang memuat banyaknya 9 knot lebih dapat menggambarkan data dibanding dengan 4 knot. Oleh karena itu dipilih 9 knot.
Perbandingan Model
Secara Empiris
perbandingan.model <- rbind(cv.poly2(Auto), cv.pc(Auto)[1,-1], model.ncs[2,-1])
perbandingan.model$metode <- c("Polinomial Ordo 2","Piecewise Constant 8","NCS 9 Knot")
perbandingan.model## rmse mae metode
## 1 4.370631 3.284748 Polinomial Ordo 2
## 2 4.492694 3.419407 Piecewise Constant 8
## NCS9 4.293696 3.233161 NCS 9 Knot
Berarti, model terbaik adalah NCS dengan 9 knot.
poly2 <- ggplot(Auto,aes(x=horsepower, y=mpg), add=TRUE) +
geom_point(alpha=0.55, color="black") +
stat_smooth(method = "lm",
formula = y~poly(x,2,raw=T),
lty = 1, col = "blue",se = F)+
theme_bw()
pc8 <- ggplot(Auto,aes(x=horsepower, y=mpg), add=TRUE) +
geom_point(alpha=0.55, color="black") +
stat_smooth(method = "lm",
formula = y~cut(x,8),
lty = 1, col = "blue",se = F)+
theme_bw()
nc9 <- ggplot(Auto, aes(x = horsepower, y = mpg), add=TRUE) +
geom_point(alpha = 0.55, color="black") +
stat_smooth(method = "lm", formula = y~ns(x, knots = c(67,72,80,88,93.5,100,110,140,157.7)), col = "blue", se = F)+
theme_bw()
plot_grid(poly2, pc8, nc9, labels = c("poly2","pc8", "nc9"), label_size = 6)Regresi Data Amerika
Regresi Polinomial
cross validation untuk menghasilkan pemodelan mpg vs horsepower optimal dengan regresi polinomial.
cvpoly <- rbind (cv.poly1(AutoAmerika),cv.poly2(AutoAmerika),cv.poly3(AutoAmerika),cv.poly4(AutoAmerika))
cvpoly## rmse mae
## [1,] 4.241013 3.326802
## [2,] 3.830277 2.886494
## [3,] 3.839806 2.899271
## [4,] 3.848930 2.896336
Secara empiris jika dilihat dari rmse dan mae terlihat bahwa polinomial pangkat 2 memberikan nilai rmse dan mae yang terkecil sehingga dipilih polinomial pangkat 2.
poly1 <- ggplot(AutoAmerika,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="black") +
stat_smooth(method = "lm",
formula = y~poly(x,1,raw=T),
lty = 1, col = "blue",se = F)+
theme_bw()
poly2 <- ggplot(AutoAmerika,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="black") +
stat_smooth(method = "lm",
formula = y~poly(x,2,raw=T),
lty = 1, col = "blue",se = F)+
theme_bw()
poly3 <- ggplot(AutoAmerika,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="black") +
stat_smooth(method = "lm",
formula = y~poly(x,3,raw=T),
lty = 1, col = "blue",se = F)+
theme_bw()
poly4 <- ggplot(AutoAmerika,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="black") +
stat_smooth(method = "lm",
formula = y~poly(x,4,raw=T),
lty = 1, col = "blue",se = F)+
theme_bw()
plot_grid(poly1, poly2, poly3, poly4, labels=c("RegLin","poly2", "poly3", "poly4"), label_size = 6)Fungsi Tangga (Piecewise Constant)
Secara Empiris
cv.pc(AutoAmerika)## breaks rmse mae
## 1 11 3.723059 2.751815
## 2 11 3.723059 2.751815
Secara empiris berdasarkan rmse dan mae banyaknya interval yang terbaik adalah 11
- Secara Visual *
pc9 <- ggplot(AutoAmerika,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="black") +
stat_smooth(method = "lm",
formula = y~cut(x,9),
lty = 1, col = "blue",se = F)+
theme_bw()
pc11 <- ggplot(AutoAmerika,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="black") +
stat_smooth(method = "lm",
formula = y~cut(x,11),
lty = 1, col = "blue",se = F)+
theme_bw()
plot_grid(pc9, pc11, labels = c( "pc9", "pc11"), label_size = 6) Sebelumnya secara empiris berdasarkan rmse dan mae banyaknya interval yang terbaik adalah 11. Jika dilihat berdasarkan visual gambar terlihat bahwa untuk interval 11, kurva dari fungsi tangga ini akan semakin mulus (anak tangganya kecil-kecil karena anak tangganya banyak). Jadi, pada metode piecewise constant ini dipilih interval 11.
Natural Cubic Spline
Menentukan Banyaknya Fungsi Basis
set.seed(123)
cross.val <- vfold_cv(AutoAmerika,v=5,strata = "mpg")
df <- 1:13
best.spline3 <- map_dfr(df, function(i){
metric.spline3 <- map_dfr(cross.val$splits,
function(x){
mod <- lm(mpg ~ ns(horsepower,df=i),data=AutoAmerika[x$in_id,])
pred <- predict(mod,newdata=AutoAmerika[-x$in_id,])
truth <- AutoAmerika[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth,response = pred)
mae <- mlr3measures::mae(truth = truth,response = pred)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
metric.spline3
mean.metric.spline3 <- colMeans(metric.spline3) #rata-rata untuk 5 folds
mean.metric.spline3
}
)
best.spline3 <- cbind(df=df,best.spline3)
basis<-data.frame(rbind(best.spline3 %>% slice_min(rmse), #berdasarkan rmse
best.spline3 %>% slice_min(mae))) #berdasarkan mae
basis## df rmse mae
## 1 8 3.801633 2.81104
## 2 8 3.801633 2.81104
diperoleh df atau banyaknya fungsi basis terpilih sebesar 8.
Menentukan Knot Berdasarkan Banyaknya Fungsi Basis Terpilih
df_8<-attr(ns(AutoAmerika$horsepower, df=8),"knots")
df_8## 12.5% 25% 37.5% 50% 62.5% 75% 87.5%
## 80 88 95 105 130 150 170
Cross Validation
set.seed(123)
cross.val <- vfold_cv(AutoAmerika,v=5,strata = "mpg")
metric.spline3.ns4 <- map_dfr(cross.val$splits,
function(x){
mod <- lm(mpg ~ ns(horsepower, knots = c(80, 120, 160, 200)),
data=AutoAmerika[x$in_id,])
pred <- predict(mod,
newdata=AutoAmerika[-x$in_id,])
truth <- AutoAmerika[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth,
response = pred
)
mae <- mlr3measures::mae(truth = truth,
response = pred
)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
mean.metric.spline3.ns4 <- colMeans(metric.spline3.ns4)# menghitung rata-rata untuk 5 folds
metric.spline3.ns7 <- map_dfr(cross.val$splits,
function(x){
mod <- lm(mpg ~ ns(horsepower, knots = c(80,88,95,105,130,150,170)),
data=AutoAmerika[x$in_id,])
pred <- predict(mod,
newdata=AutoAmerika[-x$in_id,])
truth <- AutoAmerika[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth,
response = pred
)
mae <- mlr3measures::mae(truth = truth,
response = pred
)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
mean.metric.spline3.ns7 <- colMeans(metric.spline3.ns7)# menghitung rata-rata untuk 5folds
nilai.cv.ns <- rbind("NCS4" = mean.metric.spline3.ns4,
"NCS7" = mean.metric.spline3.ns7)
nama.model.ncs <- c("4 - NCS Knots 80, 120, 160, 200",
"7 - NCS (df=8)")
model.ncs<-data.frame(Model = nama.model.ncs, nilai.cv.ns)
model.ncs## Model rmse mae
## NCS4 4 - NCS Knots 80, 120, 160, 200 3.848149 2.900513
## NCS7 7 - NCS (df=8) 3.804332 2.810918
Secara empiris berdasarkan rmse dan mae, banyaknya knot terbaik adalah 7 knot.
Visualisasi Grafik
nc4 <- ggplot(AutoAmerika, aes(x = horsepower, y = mpg)) +
geom_point(alpha = 0.55, color="black") +
stat_smooth(method = "lm", formula = y~ns(x, knots = c(80, 120, 160, 200)), col = "blue", se = F) +
theme_bw()
nc7 <- ggplot(AutoAmerika, aes(x = horsepower, y = mpg)) +
geom_point(alpha = 0.55, color="black") +
stat_smooth(method = "lm", formula = y~ns(x, knots = c(80,88,95,105,130,150,170)), col = "blue", se = F)+
theme_bw()
plot_grid(nc4, nc7, labels = c("nc4", "nc7"), label_size = 8) Secara empiris berdasarkan rmse dan mae, grafik yang memuat banyaknya 7 knot lebih menggambarkan data dibanding dengan 4 knot. Oleh karena itu dipilih 6 knot.
Perbandingan Model
perbandingan.model <- rbind(cv.poly2(AutoAmerika), cv.pc(AutoAmerika)[1,-1], model.ncs[2,-1])
perbandingan.model$metode <- c("Polinomial Ordo 2","Piecewise Constant 11","NCS 7 Knot")
perbandingan.model## rmse mae metode
## 1 3.830277 2.886494 Polinomial Ordo 2
## 2 3.723059 2.751815 Piecewise Constant 11
## NCS7 3.804332 2.810918 NCS 7 Knot
Berarti, model terbaik untuk data Auto Amerika adalah Piecewise Constant dengan 11 interval.
poly2 <- ggplot(AutoAmerika,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="black") +
stat_smooth(method = "lm",
formula = y~poly(x,2,raw=T),
lty = 1, col = "blue",se = F)+
theme_bw()
pc11 <- ggplot(AutoAmerika,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="black") +
stat_smooth(method = "lm",
formula = y~cut(x,11),
lty = 1, col = "blue",se = F)+
theme_bw()
nc7 <- ggplot(AutoAmerika, aes(x = horsepower, y = mpg)) +
geom_point(alpha = 0.55, color="black") +
stat_smooth(method = "lm", formula = y~ns(x, knots = c(80,88,95,105,130,150,170)), col = "blue", se = F)+
theme_bw()
plot_grid(poly2, pc11, nc7, labels = c("poly2","pc11", "nc7"), label_size = 6)Regresi Data Eropa
Regresi Polinomial
cross validation untuk menghasilkan pemodelan mpg vs horsepower optimal dengan regresi polinomial.
set.seed(123)
cv.poly2 <- function(dataset){
set.seed(123)
cross_val <- vfold_cv(dataset,v=5,strata = "mpg")
metric_poly2 <- map_dfr(cross_val$splits,
function(x){
mod <- lm(mpg ~ poly(horsepower,2,raw = T),
data=dataset[x$in_id,])
pred <- predict(mod,
newdata=dataset[-x$in_id,])
truth <- dataset[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth,response = pred)
mae <- mlr3measures::mae(truth = truth,response = pred)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
poly2<- colMeans(metric_poly2) #menghitung rmse dan mae rata-rata untuk 10 folds
return(poly2)
}
cv.poly3 <- function(dataset){
set.seed(123)
cross_val <- vfold_cv(dataset,v=5,strata = "mpg")
metric_poly3 <- map_dfr(cross_val$splits,
function(x){
mod <- lm(mpg ~ poly(horsepower,3,raw = T),
data=dataset[x$in_id,])
pred <- predict(mod,
newdata=dataset[-x$in_id,])
truth <- dataset[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth,response = pred)
mae <- mlr3measures::mae(truth = truth,response = pred)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
poly3<- colMeans(metric_poly3) #menghitung rmse dan mae rata-rata untuk 5 folds
return(poly3)
}
cv.poly4 <- function(dataset){
set.seed(123)
cross_val <- vfold_cv(dataset,v=5,strata = "mpg")
metric_poly4 <- map_dfr(cross_val$splits,
function(x){
mod <- lm(mpg ~ poly(horsepower,4,raw = T),
data=dataset[x$in_id,])
pred <- predict(mod,
newdata=dataset[-x$in_id,])
truth <- dataset[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth,response = pred)
mae <- mlr3measures::mae(truth = truth,response = pred)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
poly4<- colMeans(metric_poly4) #menghitung rmse dan mae rata-rata untuk 5 folds
return(poly4)
}cvpoly <- rbind (cv.poly2(AutoEropa),cv.poly3(AutoEropa),cv.poly4(AutoEropa))## Warning: The number of observations in each quantile is below the recommended
## threshold of 20. Stratification will be done with 3 breaks instead.
## Warning: The number of observations in each quantile is below the recommended
## threshold of 20. Stratification will be done with 3 breaks instead.
## Warning: The number of observations in each quantile is below the recommended
## threshold of 20. Stratification will be done with 3 breaks instead.
cvpoly## rmse mae
## [1,] 5.024633 3.948212
## [2,] 5.073471 4.008398
## [3,] 5.117442 4.015371
Secara empiris jika dilihat dari rmse dan mae terlihat bahwa polinomial pangkat 2 memberikan nilai rmse dan mae yang terkecil sehingga dipilih polinomial pangkat 2.
poly2 <- ggplot(AutoEropa,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="black") +
stat_smooth(method = "lm",
formula = y~poly(x,2,raw=T),
lty = 1, col = "blue",se = F)+
theme_bw()
poly3 <- ggplot(AutoEropa,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="black") +
stat_smooth(method = "lm",
formula = y~poly(x,3,raw=T),
lty = 1, col = "blue",se = F)+
theme_bw()
poly4 <- ggplot(AutoEropa,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="black") +
stat_smooth(method = "lm",
formula = y~poly(x,4,raw=T),
lty = 1, col = "blue",se = F)+
theme_bw()
plot_grid(poly2, poly3, poly4, labels=c("poly2", "poly3", "poly4"), label_size = 6)Fungsi Tangga (Piecewise Constant)
Secara Empiris
set.seed(123)
cv.pc2 <- function(dataset){
set.seed(123)
cross_val <- vfold_cv(dataset,v=5,strata = "mpg")
breaks <- 2:6
best_tangga <- map_dfr(breaks, function(i){
metric_tangga <- map_dfr(cross_val$splits,
function(x){
training <- dataset[x$in_id,]
training$horsepower <- cut(training$horsepower,i)
mod <- lm(mpg ~ horsepower,
data=training)
labs_x <- levels(mod$model[,2])
labs_x_breaks <- cbind(lower = as.numeric( sub("\\((.+),.*", "\\1", labs_x) ),
upper = as.numeric( sub("[^,]*,([^]]*)\\]", "\\1", labs_x) ))
testing <- dataset[-x$in_id,]
horsepower_new <- cut(testing$horsepower,c(labs_x_breaks[1,1],labs_x_breaks[,2]))
pred <- predict(mod,
newdata=list(horsepower=horsepower_new))
truth <- testing$mpg
data_eval <- na.omit(data.frame(truth,pred))
rmse <- mlr3measures::rmse(truth = data_eval$truth,
response = data_eval$pred
)
mae <- mlr3measures::mae(truth = data_eval$truth,
response = data_eval$pred
)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
colMeans(metric_tangga) # menghitung rata-rata untuk 5 folds
}
)
best_tangga <- cbind(breaks=breaks,best_tangga) # menampilkan hasil all breaks
basedonrmse <- as.data.frame(best_tangga %>% slice_min(rmse))
basedonmae <- as.data.frame(best_tangga %>% slice_min(mae))
return(rbind(basedonrmse,basedonmae))
}cv.pc2(AutoEropa)## Warning: The number of observations in each quantile is below the recommended
## threshold of 20. Stratification will be done with 3 breaks instead.
## breaks rmse mae
## 1 5 5.30521 4.068414
## 2 5 5.30521 4.068414
Secara empiris berdasarkan rmse dan mae banyaknya interval yang terbaik adalah 5
- Secara Visual *
pc5 <- ggplot(AutoEropa,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="black") +
stat_smooth(method = "lm",
formula = y~cut(x,5),
lty = 1, col = "blue",se = F)+
theme_bw()
pc10 <- ggplot(AutoEropa,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="black") +
stat_smooth(method = "lm",
formula = y~cut(x,10),
lty = 1, col = "blue",se = F)+
theme_bw()
plot_grid(pc5, pc10, labels = c( "pc5", "pc10"), label_size = 6) Sebelumnya secara empiris berdasarkan rmse dan mae banyaknya interval yang terbaik adalah 5. Begitu juga secara visual.
Natural Cubic Spline
Menentukan Banyaknya Fungsi Basis
set.seed(123)
cross.val <- vfold_cv(AutoEropa,v=5,strata = "mpg")## Warning: The number of observations in each quantile is below the recommended
## threshold of 20. Stratification will be done with 3 breaks instead.
df <- 2:3
best.spline3 <- map_dfr(df, function(i){
metric.spline3 <- map_dfr(cross.val$splits,
function(x){
mod <- lm(mpg ~ ns(horsepower,df=i),data=AutoEropa[x$in_id,])
pred <- predict(mod,newdata=AutoEropa[-x$in_id,])
truth <- AutoEropa[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth,response = pred)
mae <- mlr3measures::mae(truth = truth,response = pred)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
metric.spline3
mean.metric.spline3 <- colMeans(metric.spline3) #rata-rata untuk 5 folds
mean.metric.spline3
}
)
best.spline3 <- cbind(df=df,best.spline3)
basis<-data.frame(rbind(best.spline3 %>% slice_min(rmse), #berdasarkan rmse
best.spline3 %>% slice_min(mae))) #berdasarkan mae
basis## df rmse mae
## 1 2 5.022237 3.932559
## 2 2 5.022237 3.932559
Menentukan Knot Berdasarkan Banyaknya Fungsi Basis Terpilih
df_2<-attr(ns(AutoEropa$horsepower, df=2),"knots")
df_2## 50%
## 76.5
Cross Validation
set.seed(123)
cross.val <- vfold_cv(AutoEropa,v=5,strata = "mpg")## Warning: The number of observations in each quantile is below the recommended
## threshold of 20. Stratification will be done with 3 breaks instead.
metric.spline3.ns1 <- map_dfr(cross.val$splits,
function(x){
mod <- lm(mpg ~ ns(horsepower, knots = c(76.5)),
data=AutoEropa[x$in_id,])
pred <- predict(mod,
newdata=AutoEropa[-x$in_id,])
truth <- AutoEropa[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth,
response = pred
)
mae <- mlr3measures::mae(truth = truth,
response = pred
)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
mean.metric.spline3.ns1 <- colMeans(metric.spline3.ns1)# menghitung rata-rata untuk 5 folds
metric.spline3.ns2 <- map_dfr(cross.val$splits,
function(x){
mod <- lm(mpg ~ ns(horsepower, knots = c(88, 105)),
data=AutoEropa[x$in_id,])
pred <- predict(mod,
newdata=AutoEropa[-x$in_id,])
truth <- AutoEropa[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth,
response = pred
)
mae <- mlr3measures::mae(truth = truth,
response = pred
)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
mean.metric.spline3.ns2 <- colMeans(metric.spline3.ns2)# menghitung rata-rata untuk 5 folds
nilai.cv.ns <- rbind("NCS1" = mean.metric.spline3.ns1,
"NCS2" = mean.metric.spline3.ns2)
nama.model.ncs <- c("1 - NCS Knots 76.5",
"2 - NCS Knots 80, 150")
model.ncs<-data.frame(Model = nama.model.ncs, nilai.cv.ns)
model.ncs## Model rmse mae
## NCS1 1 - NCS Knots 76.5 5.021712 3.932383
## NCS2 2 - NCS Knots 80, 150 5.055292 3.986639
Secara empiris berdasarkan rmse dan mae, banyaknya knot terbaik adalah 1 knot.
Visualisasi Grafik
nc1 <- ggplot(AutoEropa, aes(x = horsepower, y = mpg)) +
geom_point(alpha = 0.55, color="black") +
stat_smooth(method = "lm", formula = y~ns(x, knots = c(76.5)), col = "blue", se = F) +
theme_bw()
nc2 <- ggplot(AutoEropa, aes(x = horsepower, y = mpg)) +
geom_point(alpha = 0.55, color="black") +
stat_smooth(method = "lm", formula = y~ns(x, knots = c(80,105)), col = "blue", se = F)+
theme_bw()
plot_grid(nc1, nc2, labels = c("nc1", "nc2"), label_size = 8) Secara empiris berdasarkan rmse dan mae, grafik yang memuat banyaknya 1 knot lebih menggambarkan data dibanding dengan 2 knot. Oleh karena itu dipilih 1 knot.
Perbandingan Model
perbandingan.model <- rbind(cv.poly2(AutoEropa), cv.pc2(AutoEropa)[1,-1], model.ncs[1,-1])## Warning: The number of observations in each quantile is below the recommended
## threshold of 20. Stratification will be done with 3 breaks instead.
## Warning: The number of observations in each quantile is below the recommended
## threshold of 20. Stratification will be done with 3 breaks instead.
perbandingan.model$metode <- c("Polinomial Ordo 2","Piecewise Constant 5","NCS 1 Knot 105")
perbandingan.model## rmse mae metode
## 1 5.024633 3.948212 Polinomial Ordo 2
## 2 5.305210 4.068414 Piecewise Constant 5
## NCS1 5.021712 3.932383 NCS 1 Knot 105
Berarti, model terbaik untuk data Auto Eropa adalah NCS 1 Knot 105.
poly2 <- ggplot(AutoEropa,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="black") +
stat_smooth(method = "lm",
formula = y~poly(x,2,raw=T),
lty = 1, col = "blue",se = F)+
theme_bw()
pc5 <- ggplot(AutoEropa,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="black") +
stat_smooth(method = "lm",
formula = y~cut(x,5),
lty = 1, col = "blue",se = F)+
theme_bw()
nc1 <- ggplot(AutoEropa, aes(x = horsepower, y = mpg)) +
geom_point(alpha = 0.55, color="black") +
stat_smooth(method = "lm", formula = y~ns(x, knots = c(76.5)), col = "blue", se = F) +
theme_bw()
plot_grid(poly2, pc5,nc1, labels = c("poly2","pc5","nc1"), label_size = 6)Regresi Data Jepang
Regresi Polinomial
cross validation untuk menghasilkan pemodelan mpg vs horsepower optimal dengan regresi polinomial.
set.seed(123)
cv.poly2 <- function(dataset){
set.seed(123)
cross_val <- vfold_cv(dataset,v=5,strata = "mpg")
metric_poly2 <- map_dfr(cross_val$splits,
function(x){
mod <- lm(mpg ~ poly(horsepower,2,raw = T),
data=dataset[x$in_id,])
pred <- predict(mod,
newdata=dataset[-x$in_id,])
truth <- dataset[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth,response = pred)
mae <- mlr3measures::mae(truth = truth,response = pred)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
poly2<- colMeans(metric_poly2) #menghitung rmse dan mae rata-rata untuk 10 folds
return(poly2)
}
cv.poly3 <- function(dataset){
set.seed(123)
cross_val <- vfold_cv(dataset,v=5,strata = "mpg")
metric_poly3 <- map_dfr(cross_val$splits,
function(x){
mod <- lm(mpg ~ poly(horsepower,3,raw = T),
data=dataset[x$in_id,])
pred <- predict(mod,
newdata=dataset[-x$in_id,])
truth <- dataset[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth,response = pred)
mae <- mlr3measures::mae(truth = truth,response = pred)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
poly3<- colMeans(metric_poly3) #menghitung rmse dan mae rata-rata untuk 5 folds
return(poly3)
}
cv.poly4 <- function(dataset){
set.seed(123)
cross_val <- vfold_cv(dataset,v=5,strata = "mpg")
metric_poly4 <- map_dfr(cross_val$splits,
function(x){
mod <- lm(mpg ~ poly(horsepower,4,raw = T),
data=dataset[x$in_id,])
pred <- predict(mod,
newdata=dataset[-x$in_id,])
truth <- dataset[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth,response = pred)
mae <- mlr3measures::mae(truth = truth,response = pred)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
poly4<- colMeans(metric_poly4) #menghitung rmse dan mae rata-rata untuk 5 folds
return(poly4)
}cvpoly <- rbind (cv.poly2(AutoJepang),cv.poly3(AutoJepang),cv.poly4(AutoJepang))## Warning: The number of observations in each quantile is below the recommended
## threshold of 20. Stratification will be done with 3 breaks instead.
## Warning: The number of observations in each quantile is below the recommended
## threshold of 20. Stratification will be done with 3 breaks instead.
## Warning: The number of observations in each quantile is below the recommended
## threshold of 20. Stratification will be done with 3 breaks instead.
cvpoly## rmse mae
## [1,] 4.669195 3.580362
## [2,] 4.081574 3.074327
## [3,] 4.320975 3.240185
Secara empiris jika dilihat dari rmse dan mae terlihat bahwa polinomial pangkat 3 memberikan nilai rmse dan mae yang terkecil sehingga dipilih polinomial pangkat 3.
poly2 <- ggplot(AutoJepang,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="black") +
stat_smooth(method = "lm",
formula = y~poly(x,2,raw=T),
lty = 1, col = "blue",se = F)+
theme_bw()
poly3 <- ggplot(AutoJepang,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="black") +
stat_smooth(method = "lm",
formula = y~poly(x,3,raw=T),
lty = 1, col = "blue",se = F)+
theme_bw()
poly4 <- ggplot(AutoJepang,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="black") +
stat_smooth(method = "lm",
formula = y~poly(x,4,raw=T),
lty = 1, col = "blue",se = F)+
theme_bw()
plot_grid(poly2, poly3, poly4, labels=c("poly2", "poly3", "poly4"), label_size = 6)Fungsi Tangga (Piecewise Constant)
Secara Empiris
set.seed(123)
cv.pc2 <- function(dataset){
set.seed(123)
cross_val <- vfold_cv(dataset,v=5,strata = "mpg")
breaks <- 2:5
best_tangga <- map_dfr(breaks, function(i){
metric_tangga <- map_dfr(cross_val$splits,
function(x){
training <- dataset[x$in_id,]
training$horsepower <- cut(training$horsepower,i)
mod <- lm(mpg ~ horsepower,
data=training)
labs_x <- levels(mod$model[,2])
labs_x_breaks <- cbind(lower = as.numeric( sub("\\((.+),.*", "\\1", labs_x) ),
upper = as.numeric( sub("[^,]*,([^]]*)\\]", "\\1", labs_x) ))
testing <- dataset[-x$in_id,]
horsepower_new <- cut(testing$horsepower,c(labs_x_breaks[1,1],labs_x_breaks[,2]))
pred <- predict(mod,
newdata=list(horsepower=horsepower_new))
truth <- testing$mpg
data_eval <- na.omit(data.frame(truth,pred))
rmse <- mlr3measures::rmse(truth = data_eval$truth,
response = data_eval$pred
)
mae <- mlr3measures::mae(truth = data_eval$truth,
response = data_eval$pred
)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
colMeans(metric_tangga) # menghitung rata-rata untuk 5 folds
}
)
best_tangga <- cbind(breaks=breaks,best_tangga) # menampilkan hasil all breaks
basedonrmse <- as.data.frame(best_tangga %>% slice_min(rmse))
basedonmae <- as.data.frame(best_tangga %>% slice_min(mae))
return(rbind(basedonrmse,basedonmae))
}cv.pc2(AutoJepang)## Warning: The number of observations in each quantile is below the recommended
## threshold of 20. Stratification will be done with 3 breaks instead.
## breaks rmse mae
## 1 4 4.155876 3.241013
## 2 2 4.398255 3.235329
Secara empiris berdasarkan rmse banyaknya interval yang terbaik adalah 4. Sedangkan berdasarkan mae banyaknya interval yang terbaik adalah 2.
- Secara Visual *
pc2 <- ggplot(AutoJepang,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="black") +
stat_smooth(method = "lm",
formula = y~cut(x,2),
lty = 1, col = "blue",se = F)+
theme_bw()
pc4 <- ggplot(AutoJepang,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="black") +
stat_smooth(method = "lm",
formula = y~cut(x,4),
lty = 1, col = "blue",se = F)+
theme_bw()
plot_grid(pc2, pc4, labels = c( "pc2", "pc4"), label_size = 6) Sebelumnya Secara empiris berdasarkan rmse banyaknya interval yang terbaik adalah 4. Sedangkan berdasarkan mae banyaknya interval yang terbaik adalah 2.Secara visual, interval 4 lebih baik sehingga dipilih interval 4.
Natural Cubic Spline
Menentukan Banyaknya Fungsi Basis
set.seed(123)
cross.val <- vfold_cv(AutoJepang,v=5,strata = "mpg")## Warning: The number of observations in each quantile is below the recommended
## threshold of 20. Stratification will be done with 3 breaks instead.
df <- 2:3
best.spline3 <- map_dfr(df, function(i){
metric.spline3 <- map_dfr(cross.val$splits,
function(x){
mod <- lm(mpg ~ ns(horsepower,df=i),data=AutoJepang[x$in_id,])
pred <- predict(mod,newdata=AutoJepang[-x$in_id,])
truth <- AutoJepang[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth,response = pred)
mae <- mlr3measures::mae(truth = truth,response = pred)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
metric.spline3
mean.metric.spline3 <- colMeans(metric.spline3) #rata-rata untuk 5 folds
mean.metric.spline3
}
)
best.spline3 <- cbind(df=df,best.spline3)
basis<-data.frame(rbind(best.spline3 %>% slice_min(rmse), #berdasarkan rmse
best.spline3 %>% slice_min(mae))) #berdasarkan mae
basis## df rmse mae
## 1 3 4.260727 3.230028
## 2 3 4.260727 3.230028
Menentukan Knot Berdasarkan Banyaknya Fungsi Basis Terpilih
df_3<-attr(ns(AutoJepang$horsepower, df=3),"knots")
df_3## 33.33333% 66.66667%
## 68 90
Cross Validation
set.seed(123)
cross.val <- vfold_cv(AutoJepang,v=5,strata = "mpg")## Warning: The number of observations in each quantile is below the recommended
## threshold of 20. Stratification will be done with 3 breaks instead.
metric.spline3.ns2 <- map_dfr(cross.val$splits,
function(x){
mod <- lm(mpg ~ ns(horsepower, knots = c(68,90)),
data=AutoJepang[x$in_id,])
pred <- predict(mod,
newdata=AutoJepang[-x$in_id,])
truth <- AutoJepang[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth,
response = pred
)
mae <- mlr3measures::mae(truth = truth,
response = pred
)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
mean.metric.spline3.ns2 <- colMeans(metric.spline3.ns2)# menghitung rata-rata untuk 5 folds
metric.spline3.ns3 <- map_dfr(cross.val$splits,
function(x){
mod <- lm(mpg ~ ns(horsepower, knots = c(60, 88, 105)),
data=AutoJepang[x$in_id,])
pred <- predict(mod,
newdata=AutoJepang[-x$in_id,])
truth <- AutoJepang[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth,
response = pred
)
mae <- mlr3measures::mae(truth = truth,
response = pred
)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
mean.metric.spline3.ns3 <- colMeans(metric.spline3.ns3)# menghitung rata-rata untuk 5 folds
nilai.cv.ns <- rbind("NCS2" = mean.metric.spline3.ns2,
"NCS3" = mean.metric.spline3.ns3)
nama.model.ncs <- c("2 - NCS Knots 68,90",
"3 - NCS Knots 60, 88, 105")
model.ncs<-data.frame(Model = nama.model.ncs, nilai.cv.ns)
model.ncs## Model rmse mae
## NCS2 2 - NCS Knots 68,90 4.251486 3.224763
## NCS3 3 - NCS Knots 60, 88, 105 4.275782 3.227630
Secara empiris berdasarkan rmse dan mae, banyaknya knot terbaik adalah 2 knot.
Visualisasi Grafik
nc2 <- ggplot(AutoJepang, aes(x = horsepower, y = mpg)) +
geom_point(alpha = 0.55, color="black") +
stat_smooth(method = "lm", formula = y~ns(x, knots = c(68,90)), col = "blue", se = F) +
theme_bw()
nc3 <- ggplot(AutoJepang, aes(x = horsepower, y = mpg)) +
geom_point(alpha = 0.55, color="black") +
stat_smooth(method = "lm", formula = y~ns(x, knots = c(60, 88, 105)), col = "blue", se = F)+
theme_bw()
plot_grid(nc2, nc3, labels = c("nc2", "nc3"), label_size = 8) Secara empiris berdasarkan rmse dan mae, grafik yang memuat banyaknya 2 knot lebih menggambarkan data dibanding dengan 3 knot. Oleh karena itu dipilih 2 knot.
Perbandingan Model
perbandingan.model <- rbind(cv.poly3(AutoJepang), cv.pc2(AutoJepang)[1,-1], model.ncs[1,-1])## Warning: The number of observations in each quantile is below the recommended
## threshold of 20. Stratification will be done with 3 breaks instead.
## Warning: The number of observations in each quantile is below the recommended
## threshold of 20. Stratification will be done with 3 breaks instead.
perbandingan.model$metode <- c("Polinomial Ordo 3","Piecewise Constant 4","NCS 2 Knot 68,90")
perbandingan.model## rmse mae metode
## 1 4.081574 3.074327 Polinomial Ordo 3
## 2 4.155876 3.241013 Piecewise Constant 4
## NCS2 4.251486 3.224763 NCS 2 Knot 68,90
Berarti, model terbaik untuk data Auto Eropa adalah NCS 2 Knot 68,90.
poly3 <- ggplot(AutoJepang,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="black") +
stat_smooth(method = "lm",
formula = y~poly(x,3,raw=T),
lty = 1, col = "blue",se = F)+
theme_bw()
pc4 <- ggplot(AutoJepang,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="black") +
stat_smooth(method = "lm",
formula = y~cut(x,4),
lty = 1, col = "blue",se = F)+
theme_bw()
nc2 <- ggplot(AutoJepang, aes(x = horsepower, y = mpg)) +
geom_point(alpha = 0.55, color="black") +
stat_smooth(method = "lm", formula = y~ns(x, knots = c(68,90)), col = "blue", se = F) +
theme_bw()
plot_grid(poly3,pc4,nc2, labels=c("poly3","pc4","nc2"), label_size = 6)Visualisasi Model Terbaik
nc9 <- ggplot(Auto, aes(x = horsepower, y = mpg)) +
geom_point(alpha = 0.55, color="black") +
stat_smooth(method = "lm", formula = y~ns(x, knots = c(67,72,80,88,93.5,100,110,140,157.7)), col = "blue", se = F)+
theme_bw()+
labs(title = "NCS 9 Knots (df=10)",
subtitle = "Auto Data Set - World")
pc11 <- ggplot(AutoAmerika,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="black") +
stat_smooth(method = "lm",
formula = y~cut(x,11),
lty = 1, col = "blue",se = F)+
theme_bw()+
labs(title = "Piecewise Constant 11",
subtitle = "Auto Data Set - Amerika")
nc1 <- ggplot(AutoEropa, aes(x = horsepower, y = mpg)) +
geom_point(alpha = 0.55, color="black") +
stat_smooth(method = "lm", formula = y~ns(x, knots = c(76.5)), col = "blue", se = F) +
theme_bw()+
labs(title = "Natural Cubic Splines Knots 76.5 (df=2)",
subtitle = "Auto Data Set - Eropa")
nc2 <- ggplot(AutoJepang, aes(x = horsepower, y = mpg)) +
geom_point(alpha = 0.55, color="black") +
stat_smooth(method = "lm", formula = y~ns(x, knots = c(68,90)), col = "blue", se = F) +
theme_bw()+
labs(title = "Natural Cubic Splines Knots 86 90 (df=3)",
subtitle = "Auto Data Set - Jepang")
plot_grid(nc9, pc11, nc1, nc2)Referensi
Sartono, B.. (November 1, 2020). Moving Beyond Linearity. Retrieved from https://rpubs.com/bagusco/taklinear
Dito, G.A. (April 14, 2021). Regresi Polinomial, Regresi Fungsi Tangga dan Regresi Spline di R. Retrieved from https://gerrydito.github.io/Regresi-Polinomial,-Regresi-Fungsi-Tangga-dan-Regresi-Spline/
P8 STA581_Sains Data, Mahasiswa Pascasarjana Statistika dan Sains Data,IPB University, reniamelia@apps.ipb.ac.id↩︎