REGRESI NON-LINEAR
Package
library(tidyverse)## Warning: package 'tidyverse' was built under R version 4.0.5
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.0 v dplyr 1.0.5
## v tidyr 1.1.3 v stringr 1.4.0
## v readr 1.4.0 v forcats 0.5.1
## Warning: package 'ggplot2' was built under R version 4.0.5
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(splines)
library(rsample)## Warning: package 'rsample' was built under R version 4.0.5
library(ISLR)## Warning: package 'ISLR' was built under R version 4.0.5
library(cowplot)## Warning: package 'cowplot' was built under R version 4.0.5
library(caret) # cross-validation## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
library(gridExtra) # combining graphs## Warning: package 'gridExtra' was built under R version 4.0.5
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
library(ggplot2)
library(dplyr)
library(purrr)
library(dplyr)
library(knitr)## Warning: package 'knitr' was built under R version 4.0.5
library(DT)## Warning: package 'DT' was built under R version 4.0.5
library(kableExtra)## Warning: package 'kableExtra' was built under R version 4.0.5
##
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
##
## group_rows
library(gam) # generalized additive models## Warning: package 'gam' was built under R version 4.0.5
## Loading required package: foreach
## Warning: package 'foreach' was built under R version 4.0.5
##
## Attaching package: 'foreach'
## The following objects are masked from 'package:purrr':
##
## accumulate, when
## Loaded gam 1.20
Eksplorasi Data
Data Dunia
Data yang digunakan adalah Auto dari package ISLR yang terdiri dari 392 observasi dan 9 variabel. Namun, variabel yang digunakan kali ini hanya 3, yaitu:
mpg : miles per gallon
horsepower: Engine horsepower
origin: Origin of car (1. American, 2. European, 3. Japanese)
Auto = Auto %>% select(mpg, horsepower, origin)
datatable(Auto)Visualisasi
plot(Auto$horsepower, Auto$mpg,pch=19, col="#0000ff",
xlab="Horse Power", ylab="mile per gallon")Ini merupakan scatter plot dari 392 kendaraan dimana dari masing-masing kendaraan diukur dua peubah/variabel, yaitu kekuatan mesin (horse power pada sumbu X) dan berapa jauh kendaraan tersebut berjalan untuk setiap galon bahan bakarnya (mile per gallon pada sumbu y).
Secara umum,kendaraan dengan horse power yang besar membutuhkan bahan bakar yang banyak yang dicirikan jarak tempuh yang semakin pendek ketika konsumsi bahan bakarnya satu galon. Yang paling kanan, kekuatan mesinnya besar, konsumsi bahan bakar besar karena jarak tempuhnya kecil.
Jika kita amati lebih detail dari scatter plot nya, pola data ini cenderung tidak linier, ada beberapa pendekatan yang bisa digunakan.
Dalam kasus kali ini akan digunakan pendekatan regresi polinomial, Piecewise constant, dan Natural cubic splines.
Regresi Data Dunia
Regresi Polinomial
cross validation untuk menghasilkan pemodelan mpg vs horsepower optimal dengan regresi polinomial.
set.seed(1501211011)
cv.poly1 <- function(dataset){
set.seed(1501211011)
cross_val <- vfold_cv(dataset,v=5,strata = "mpg")
metric_poly1 <- map_dfr(cross_val$splits,
function(x){
mod <- lm(mpg ~ poly(horsepower,1,raw = T),
data=dataset[x$in_id,])
pred <- predict(mod,
newdata=dataset[-x$in_id,])
truth <- dataset[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth,response = pred)
mae <- mlr3measures::mae(truth = truth,response = pred)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
RegLin<- colMeans(metric_poly1) #menghitung rmse dan mae rata-rata untuk 5 folds
return(RegLin)
}
cv.poly2 <- function(dataset){
set.seed(1501211011)
cross_val <- vfold_cv(dataset,v=5,strata = "mpg")
metric_poly2 <- map_dfr(cross_val$splits,
function(x){
mod <- lm(mpg ~ poly(horsepower,2,raw = T),
data=dataset[x$in_id,])
pred <- predict(mod,
newdata=dataset[-x$in_id,])
truth <- dataset[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth,response = pred)
mae <- mlr3measures::mae(truth = truth,response = pred)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
poly2<- colMeans(metric_poly2) #menghitung rmse dan mae rata-rata untuk 10 folds
return(poly2)
}
cv.poly3 <- function(dataset){
set.seed(1501211011)
cross_val <- vfold_cv(dataset,v=5,strata = "mpg")
metric_poly3 <- map_dfr(cross_val$splits,
function(x){
mod <- lm(mpg ~ poly(horsepower,3,raw = T),
data=dataset[x$in_id,])
pred <- predict(mod,
newdata=dataset[-x$in_id,])
truth <- dataset[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth,response = pred)
mae <- mlr3measures::mae(truth = truth,response = pred)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
poly3<- colMeans(metric_poly3) #menghitung rmse dan mae rata-rata untuk 5 folds
return(poly3)
}
cv.poly4 <- function(dataset){
set.seed(1501211011)
cross_val <- vfold_cv(dataset,v=5,strata = "mpg")
metric_poly4 <- map_dfr(cross_val$splits,
function(x){
mod <- lm(mpg ~ poly(horsepower,4,raw = T),
data=dataset[x$in_id,])
pred <- predict(mod,
newdata=dataset[-x$in_id,])
truth <- dataset[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth,response = pred)
mae <- mlr3measures::mae(truth = truth,response = pred)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
poly4<- colMeans(metric_poly4) #menghitung rmse dan mae rata-rata untuk 5 folds
return(poly4)
}
cvpoly <- rbind (cv.poly1(Auto),cv.poly2(Auto),cv.poly3(Auto),cv.poly4(Auto))
cvpoly## rmse mae
## [1,] 4.895036 3.841085
## [2,] 4.363221 3.267947
## [3,] 4.402158 3.286869
## [4,] 4.428254 3.310634
Secara empiris jika dilihat dari rmse terlihat bahwa polinomial pangkat 2 memberikan nilai rmse yang terkecil sehingga dipilih polinomial pangkat 2.
poly1 <- ggplot(Auto,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="black") +
stat_smooth(method = "lm",
formula = y~poly(x,1,raw=T),
lty = 1, col = "blue",se = F)+
theme_bw()
poly2 <- ggplot(Auto,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="black") +
stat_smooth(method = "lm",
formula = y~poly(x,2,raw=T),
lty = 1, col = "blue",se = F)+
theme_bw()
poly3 <- ggplot(Auto,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="black") +
stat_smooth(method = "lm",
formula = y~poly(x,3,raw=T),
lty = 1, col = "blue",se = F)+
theme_bw()
poly4 <- ggplot(Auto,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="black") +
stat_smooth(method = "lm",
formula = y~poly(x,4,raw=T),
lty = 1, col = "blue",se = F)+
theme_bw()
plot_grid(poly1, poly2, poly3, poly4, labels=c("RegLin","poly2", "poly3", "poly4"), label_size = 6)Fungsi Tangga (Piecewise Constant)
Secara Empiris
set.seed(123)
cv.pc <- function(dataset){
set.seed(123)
cross_val <- vfold_cv(dataset,v=5,strata = "mpg")
breaks <- 3:12
best_tangga <- map_dfr(breaks, function(i){
metric_tangga <- map_dfr(cross_val$splits,
function(x){
training <- dataset[x$in_id,]
training$horsepower <- cut(training$horsepower,i)
mod <- lm(mpg ~ horsepower,
data=training)
labs_x <- levels(mod$model[,2])
labs_x_breaks <- cbind(lower = as.numeric( sub("\\((.+),.*", "\\1", labs_x) ),
upper = as.numeric( sub("[^,]*,([^]]*)\\]", "\\1", labs_x) ))
testing <- dataset[-x$in_id,]
horsepower_new <- cut(testing$horsepower,c(labs_x_breaks[1,1],labs_x_breaks[,2]))
pred <- predict(mod,
newdata=list(horsepower=horsepower_new))
truth <- testing$mpg
data_eval <- na.omit(data.frame(truth,pred))
rmse <- mlr3measures::rmse(truth = data_eval$truth,
response = data_eval$pred
)
mae <- mlr3measures::mae(truth = data_eval$truth,
response = data_eval$pred
)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
colMeans(metric_tangga) # menghitung rata-rata untuk 5 folds
}
)
best_tangga <- cbind(breaks=breaks,best_tangga) # menampilkan hasil all breaks
basedonrmse <- as.data.frame(best_tangga %>% slice_min(rmse))
basedonmae <- as.data.frame(best_tangga %>% slice_min(mae))
return(rbind(basedonrmse,basedonmae))
}
cv.pc(Auto)Secara empiris berdasarkan rmse banyaknya interval yang terbaik adalah 8 sedangkan berdasarkan mae banyaknya interval terbaik adalah 12.
Secara Visual
pc8 <- ggplot(Auto,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="black") +
stat_smooth(method = "lm",
formula = y~cut(x,8),
lty = 1, col = "blue",se = F)+
theme_bw()
pc12 <- ggplot(Auto,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="black") +
stat_smooth(method = "lm",
formula = y~cut(x,12),
lty = 1, col = "blue",se = F)+
theme_bw()
plot_grid(pc8, pc12, labels = c( "pc8", "pc12"), label_size = 10)Sebelumnya secara empiris berdasarkan rmse banyaknya interval yang terbaik adalah 8 sedangkan berdasarkan mae banyaknya interval terbaik adalah 12. Jika dilihat berdasarkan visual gambar terlihat bahwa untuk interval 12, kurva dari fungsi tangga ini akan semakin mulus (anak tangganya kecil-kecil karena anak tangganya banyak) dan membentuk pola yang mengikuti data tetapi ada bentuk2 yang menjadi tidak teratur. Jadi, pada metode piecewise constant ini dipilih interval 8.
Natural Cubic Spline
Menentukan Banyaknya Fungsi Basis
set.seed(123)
cross.val <- vfold_cv(Auto,v=5,strata = "mpg")
df <- 1:12
best.spline3 <- map_dfr(df, function(i){
metric.spline3 <- map_dfr(cross.val$splits,
function(x){
mod <- lm(mpg ~ ns(horsepower,df=i),data=Auto[x$in_id,])
pred <- predict(mod,newdata=Auto[-x$in_id,])
truth <- Auto[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth,response = pred)
mae <- mlr3measures::mae(truth = truth,response = pred)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
metric.spline3
mean.metric.spline3 <- colMeans(metric.spline3) #rata-rata untuk 5 folds
mean.metric.spline3
}
)
best.spline3 <- cbind(df=df,best.spline3)
basis<-data.frame(rbind(best.spline3 %>% slice_min(rmse), #berdasarkan rmse
best.spline3 %>% slice_min(mae))) #berdasarkan mae
basisBanyaknya fungsi basis terpilih ada 10.
Menentukan Knot Berdasarkan Banyaknya Fungsi Basis Terpilih
df_10<-attr(ns(Auto$horsepower, df=10),"knots")
df_10## 10% 20% 30% 40% 50% 60% 70% 80% 90%
## 67.0 72.0 80.0 88.0 93.5 100.0 110.0 140.0 157.7
Cross Validation
set.seed(123)
cross.val <- vfold_cv(Auto,v=5,strata = "mpg")
metric.spline3.ns4 <- map_dfr(cross.val$splits,
function(x){
mod <- lm(mpg ~ ns(horsepower, knots = c(80, 120, 160, 200)),
data=Auto[x$in_id,])
pred <- predict(mod,
newdata=Auto[-x$in_id,])
truth <- Auto[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth,
response = pred
)
mae <- mlr3measures::mae(truth = truth,
response = pred
)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
mean.metric.spline3.ns4 <- colMeans(metric.spline3.ns4)# menghitung rata-rata untuk 5 folds
metric.spline3.ns9 <- map_dfr(cross.val$splits,
function(x){
mod <- lm(mpg ~ ns(horsepower, knots = c(67,72,80,88,93.5,100,110,140,157.7)),
data=Auto[x$in_id,])
pred <- predict(mod,
newdata=Auto[-x$in_id,])
truth <- Auto[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth,
response = pred
)
mae <- mlr3measures::mae(truth = truth,
response = pred
)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
mean.metric.spline3.ns9 <- colMeans(metric.spline3.ns9)# menghitung rata-rata untuk 5 folds
nilai.cv.ns <- rbind("NCS4" = mean.metric.spline3.ns4,
"NCS9" = mean.metric.spline3.ns9)
nama.model.ncs <- c("4 - NCS Knots 80, 120, 160, 200",
"9 - NCS (df=10)")
model.ncs<-data.frame(Model = nama.model.ncs, nilai.cv.ns)
model.ncsSecara empiris berdasarkan rmse dan mae, banyaknya knot terbaik adalah 9 knot, yaitu knot 67,72,80,88,93.5,100,110,140,157.7.
Visualisasi Grafik
nc4 <- ggplot(Auto, aes(x = horsepower, y = mpg)) +
geom_point(alpha = 0.55, color="black") +
stat_smooth(method = "lm", formula = y~ns(x, knots = c(80, 120, 160, 200)), col = "blue", se = F) +
theme_bw()
nc9 <- ggplot(Auto, aes(x = horsepower, y = mpg)) +
geom_point(alpha = 0.55, color="black") +
stat_smooth(method = "lm", formula = y~ns(x, knots = c(67,72,80,88,93.5,100,110,140,157.7)), col = "blue", se = F)+
theme_bw()
plot_grid(nc4, nc9, labels = c("nc4", "nc9"), label_size = 10)Secara empiris berdasarkan rmse dan mae, grafik yang memuat banyaknya 9 knot lebih dapat menggambarkan data dibanding dengan 4 knot. Oleh karena itu dipilih 9 knot.
Perbandingan Model
Secara Empiris
perbandingan.model <- rbind(cv.poly2(Auto), cv.pc(Auto)[1,-1], model.ncs[2,-1])
perbandingan.model$metode <- c("Polinomial Ordo 2","Piecewise Constant 8","NCS 9 Knot")
perbandingan.modelBerarti, model terbaik adalah NCS dengan 9 knot.
poly2 <- ggplot(Auto,aes(x=horsepower, y=mpg), add=TRUE) +
geom_point(alpha=0.55, color="black") +
stat_smooth(method = "lm",
formula = y~poly(x,2,raw=T),
lty = 1, col = "blue",se = F)+
theme_bw()
pc8 <- ggplot(Auto,aes(x=horsepower, y=mpg), add=TRUE) +
geom_point(alpha=0.55, color="black") +
stat_smooth(method = "lm",
formula = y~cut(x,8),
lty = 1, col = "blue",se = F)+
theme_bw()
nc9 <- ggplot(Auto, aes(x = horsepower, y = mpg), add=TRUE) +
geom_point(alpha = 0.55, color="black") +
stat_smooth(method = "lm", formula = y~ns(x, knots = c(67,72,80,88,93.5,100,110,140,157.7)), col = "blue", se = F)+
theme_bw()
plot_grid(poly2, pc8, nc9, labels = c("poly2","pc8", "nc9"), label_size = 6)Regresi Data Amerika
Regresi Polinomial
cross validation untuk menghasilkan pemodelan mpg vs horsepower optimal dengan regresi polinomial.
cvpoly <- rbind (cv.poly1(AutoAmerika),cv.poly2(AutoAmerika),cv.poly3(AutoAmerika),cv.poly4(AutoAmerika))
cvpoly## rmse mae
## [1,] 4.258648 3.343302
## [2,] 3.776830 2.871345
## [3,] 3.779673 2.889474
## [4,] 3.811238 2.892780
Secara empiris jika dilihat dari rmse dan mae terlihat bahwa polinomial pangkat 2 memberikan nilai rmse dan mae yang terkecil sehingga dipilih polinomial pangkat 2.
poly1 <- ggplot(AutoAmerika,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="black") +
stat_smooth(method = "lm",
formula = y~poly(x,1,raw=T),
lty = 1, col = "blue",se = F)+
theme_bw()
poly2 <- ggplot(AutoAmerika,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="black") +
stat_smooth(method = "lm",
formula = y~poly(x,2,raw=T),
lty = 1, col = "blue",se = F)+
theme_bw()
poly3 <- ggplot(AutoAmerika,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="black") +
stat_smooth(method = "lm",
formula = y~poly(x,3,raw=T),
lty = 1, col = "blue",se = F)+
theme_bw()
poly4 <- ggplot(AutoAmerika,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="black") +
stat_smooth(method = "lm",
formula = y~poly(x,4,raw=T),
lty = 1, col = "blue",se = F)+
theme_bw()
plot_grid(poly1, poly2, poly3, poly4, labels=c("RegLin","poly2", "poly3", "poly4"), label_size = 6)Fungsi Tangga (Piecewise Constant)
Secara Empiris
cv.pc(AutoAmerika)Secara empiris berdasarkan rmse dan mae banyaknya interval yang terbaik adalah 11
Secara Visual
pc9 <- ggplot(AutoAmerika,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="black") +
stat_smooth(method = "lm",
formula = y~cut(x,9),
lty = 1, col = "blue",se = F)+
theme_bw()
pc11 <- ggplot(AutoAmerika,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="black") +
stat_smooth(method = "lm",
formula = y~cut(x,11),
lty = 1, col = "blue",se = F)+
theme_bw()
plot_grid(pc9, pc11, labels = c( "pc9", "pc11"), label_size = 6)Sebelumnya secara empiris berdasarkan rmse dan mae banyaknya interval yang terbaik adalah 11. Jika dilihat berdasarkan visual gambar terlihat bahwa untuk interval 11, kurva dari fungsi tangga ini akan semakin mulus (anak tangganya kecil-kecil karena anak tangganya banyak). Jadi, pada metode piecewise constant ini dipilih interval 11.
Natural Cubic Spline
Menentukan Banyaknya Fungsi Basis
set.seed(123)
cross.val <- vfold_cv(AutoAmerika,v=5,strata = "mpg")
df <- 1:13
best.spline3 <- map_dfr(df, function(i){
metric.spline3 <- map_dfr(cross.val$splits,
function(x){
mod <- lm(mpg ~ ns(horsepower,df=i),data=AutoAmerika[x$in_id,])
pred <- predict(mod,newdata=AutoAmerika[-x$in_id,])
truth <- AutoAmerika[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth,response = pred)
mae <- mlr3measures::mae(truth = truth,response = pred)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
metric.spline3
mean.metric.spline3 <- colMeans(metric.spline3) #rata-rata untuk 5 folds
mean.metric.spline3
}
)
best.spline3 <- cbind(df=df,best.spline3)
basis<-data.frame(rbind(best.spline3 %>% slice_min(rmse), #berdasarkan rmse
best.spline3 %>% slice_min(mae))) #berdasarkan mae
basisdiperoleh df atau banyaknya fungsi basis terpilih sebesar 8.
Menentukan Knot Berdasarkan Banyaknya Fungsi Basis Terpilih
df_8<-attr(ns(AutoAmerika$horsepower, df=8),"knots")
df_8## 12.5% 25% 37.5% 50% 62.5% 75% 87.5%
## 80 88 95 105 130 150 170
Cross Validation
set.seed(123)
cross.val <- vfold_cv(AutoAmerika,v=5,strata = "mpg")
metric.spline3.ns4 <- map_dfr(cross.val$splits,
function(x){
mod <- lm(mpg ~ ns(horsepower, knots = c(80, 120, 160, 200)),
data=AutoAmerika[x$in_id,])
pred <- predict(mod,
newdata=AutoAmerika[-x$in_id,])
truth <- AutoAmerika[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth,
response = pred
)
mae <- mlr3measures::mae(truth = truth,
response = pred
)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
mean.metric.spline3.ns4 <- colMeans(metric.spline3.ns4)# menghitung rata-rata untuk 5 folds
metric.spline3.ns7 <- map_dfr(cross.val$splits,
function(x){
mod <- lm(mpg ~ ns(horsepower, knots = c(80,88,95,105,130,150,170)),
data=AutoAmerika[x$in_id,])
pred <- predict(mod,
newdata=AutoAmerika[-x$in_id,])
truth <- AutoAmerika[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth,
response = pred
)
mae <- mlr3measures::mae(truth = truth,
response = pred
)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
mean.metric.spline3.ns7 <- colMeans(metric.spline3.ns7)# menghitung rata-rata untuk 5folds
nilai.cv.ns <- rbind("NCS4" = mean.metric.spline3.ns4,
"NCS7" = mean.metric.spline3.ns7)
nama.model.ncs <- c("4 - NCS Knots 80, 120, 160, 200",
"7 - NCS (df=8)")
model.ncs<-data.frame(Model = nama.model.ncs, nilai.cv.ns)
model.ncsSecara empiris berdasarkan rmse dan mae, banyaknya knot terbaik adalah 7 knot.
Visualisasi Grafik
nc4 <- ggplot(AutoAmerika, aes(x = horsepower, y = mpg)) +
geom_point(alpha = 0.55, color="black") +
stat_smooth(method = "lm", formula = y~ns(x, knots = c(80, 120, 160, 200)), col = "blue", se = F) +
theme_bw()
nc7 <- ggplot(AutoAmerika, aes(x = horsepower, y = mpg)) +
geom_point(alpha = 0.55, color="black") +
stat_smooth(method = "lm", formula = y~ns(x, knots = c(80,88,95,105,130,150,170)), col = "blue", se = F)+
theme_bw()
plot_grid(nc4, nc7, labels = c("nc4", "nc7"), label_size = 8)Secara empiris berdasarkan rmse dan mae, grafik yang memuat banyaknya 7 knot lebih menggambarkan data dibanding dengan 4 knot. Oleh karena itu dipilih 7 knot.
Perbandingan Model
perbandingan.model <- rbind(cv.poly2(AutoAmerika), cv.pc(AutoAmerika)[1,-1], model.ncs[2,-1])
perbandingan.model$metode <- c("Polinomial Ordo 2","Piecewise Constant 11","NCS 7 Knot")
perbandingan.modelBerarti, model terbaik untuk data Auto Amerika adalah Piecewise Constant dengan 11 interval.
poly2 <- ggplot(AutoAmerika,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="black") +
stat_smooth(method = "lm",
formula = y~poly(x,2,raw=T),
lty = 1, col = "blue",se = F)+
theme_bw()
pc11 <- ggplot(AutoAmerika,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="black") +
stat_smooth(method = "lm",
formula = y~cut(x,11),
lty = 1, col = "blue",se = F)+
theme_bw()
nc7 <- ggplot(AutoAmerika, aes(x = horsepower, y = mpg)) +
geom_point(alpha = 0.55, color="black") +
stat_smooth(method = "lm", formula = y~ns(x, knots = c(80,88,95,105,130,150,170)), col = "blue", se = F)+
theme_bw()
plot_grid(poly2, pc11, nc7, labels = c("poly2","pc11", "nc7"), label_size = 6)Regresi Data Eropa
Regresi Polinomial
cross validation untuk menghasilkan pemodelan mpg vs horsepower optimal dengan regresi polinomial.
set.seed(123)
cv.poly2 <- function(dataset){
set.seed(123)
cross_val <- vfold_cv(dataset,v=5,strata = "mpg")
metric_poly2 <- map_dfr(cross_val$splits,
function(x){
mod <- lm(mpg ~ poly(horsepower,2,raw = T),
data=dataset[x$in_id,])
pred <- predict(mod,
newdata=dataset[-x$in_id,])
truth <- dataset[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth,response = pred)
mae <- mlr3measures::mae(truth = truth,response = pred)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
poly2<- colMeans(metric_poly2) #menghitung rmse dan mae rata-rata untuk 10 folds
return(poly2)
}
cv.poly3 <- function(dataset){
set.seed(123)
cross_val <- vfold_cv(dataset,v=5,strata = "mpg")
metric_poly3 <- map_dfr(cross_val$splits,
function(x){
mod <- lm(mpg ~ poly(horsepower,3,raw = T),
data=dataset[x$in_id,])
pred <- predict(mod,
newdata=dataset[-x$in_id,])
truth <- dataset[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth,response = pred)
mae <- mlr3measures::mae(truth = truth,response = pred)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
poly3<- colMeans(metric_poly3) #menghitung rmse dan mae rata-rata untuk 5 folds
return(poly3)
}
cv.poly4 <- function(dataset){
set.seed(123)
cross_val <- vfold_cv(dataset,v=5,strata = "mpg")
metric_poly4 <- map_dfr(cross_val$splits,
function(x){
mod <- lm(mpg ~ poly(horsepower,4,raw = T),
data=dataset[x$in_id,])
pred <- predict(mod,
newdata=dataset[-x$in_id,])
truth <- dataset[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth,response = pred)
mae <- mlr3measures::mae(truth = truth,response = pred)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
poly4<- colMeans(metric_poly4) #menghitung rmse dan mae rata-rata untuk 5 folds
return(poly4)
}
cvpoly <- rbind (cv.poly2(AutoEropa),cv.poly3(AutoEropa),cv.poly4(AutoEropa))## Warning: The number of observations in each quantile is below the recommended
## threshold of 20. Stratification will be done with 3 breaks instead.
## Warning: The number of observations in each quantile is below the recommended
## threshold of 20. Stratification will be done with 3 breaks instead.
## Warning: The number of observations in each quantile is below the recommended
## threshold of 20. Stratification will be done with 3 breaks instead.
cvpoly## rmse mae
## [1,] 5.024633 3.948212
## [2,] 5.073471 4.008398
## [3,] 5.117442 4.015371
Secara empiris jika dilihat dari rmse dan mae terlihat bahwa polinomial pangkat 2 memberikan nilai rmse dan mae yang terkecil sehingga dipilih polinomial pangkat 2.
poly2 <- ggplot(AutoEropa,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="black") +
stat_smooth(method = "lm",
formula = y~poly(x,2,raw=T),
lty = 1, col = "blue",se = F)+
theme_bw()
poly3 <- ggplot(AutoEropa,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="black") +
stat_smooth(method = "lm",
formula = y~poly(x,3,raw=T),
lty = 1, col = "blue",se = F)+
theme_bw()
poly4 <- ggplot(AutoEropa,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="black") +
stat_smooth(method = "lm",
formula = y~poly(x,4,raw=T),
lty = 1, col = "blue",se = F)+
theme_bw()
plot_grid(poly2, poly3, poly4, labels=c("poly2", "poly3", "poly4"), label_size = 6)Fungsi Tangga (Piecewise Constant)
Secara Empiris
set.seed(123)
cv.pc2 <- function(dataset){
set.seed(123)
cross_val <- vfold_cv(dataset,v=5,strata = "mpg")
breaks <- 2:6
best_tangga <- map_dfr(breaks, function(i){
metric_tangga <- map_dfr(cross_val$splits,
function(x){
training <- dataset[x$in_id,]
training$horsepower <- cut(training$horsepower,i)
mod <- lm(mpg ~ horsepower,
data=training)
labs_x <- levels(mod$model[,2])
labs_x_breaks <- cbind(lower = as.numeric( sub("\\((.+),.*", "\\1", labs_x) ),
upper = as.numeric( sub("[^,]*,([^]]*)\\]", "\\1", labs_x) ))
testing <- dataset[-x$in_id,]
horsepower_new <- cut(testing$horsepower,c(labs_x_breaks[1,1],labs_x_breaks[,2]))
pred <- predict(mod,
newdata=list(horsepower=horsepower_new))
truth <- testing$mpg
data_eval <- na.omit(data.frame(truth,pred))
rmse <- mlr3measures::rmse(truth = data_eval$truth,
response = data_eval$pred
)
mae <- mlr3measures::mae(truth = data_eval$truth,
response = data_eval$pred
)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
colMeans(metric_tangga) # menghitung rata-rata untuk 5 folds
}
)
best_tangga <- cbind(breaks=breaks,best_tangga) # menampilkan hasil all breaks
basedonrmse <- as.data.frame(best_tangga %>% slice_min(rmse))
basedonmae <- as.data.frame(best_tangga %>% slice_min(mae))
return(rbind(basedonrmse,basedonmae))
}
cv.pc2(AutoEropa)## Warning: The number of observations in each quantile is below the recommended
## threshold of 20. Stratification will be done with 3 breaks instead.
Secara empiris berdasarkan rmse dan mae banyaknya interval yang terbaik adalah 5
Secara Visual
pc5 <- ggplot(AutoEropa,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="black") +
stat_smooth(method = "lm",
formula = y~cut(x,5),
lty = 1, col = "blue",se = F)+
theme_bw()
pc10 <- ggplot(AutoEropa,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="black") +
stat_smooth(method = "lm",
formula = y~cut(x,10),
lty = 1, col = "blue",se = F)+
theme_bw()
plot_grid(pc5, pc10, labels = c( "pc5", "pc10"), label_size = 6)Sebelumnya secara empiris berdasarkan rmse dan mae banyaknya interval yang terbaik adalah 5. Begitu juga secara visual.
Natural Cubic Spline
Menentukan Banyaknya Fungsi Basis
set.seed(123)
cross.val <- vfold_cv(AutoEropa,v=5,strata = "mpg")## Warning: The number of observations in each quantile is below the recommended
## threshold of 20. Stratification will be done with 3 breaks instead.
df <- 2:3
best.spline3 <- map_dfr(df, function(i){
metric.spline3 <- map_dfr(cross.val$splits,
function(x){
mod <- lm(mpg ~ ns(horsepower,df=i),data=AutoEropa[x$in_id,])
pred <- predict(mod,newdata=AutoEropa[-x$in_id,])
truth <- AutoEropa[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth,response = pred)
mae <- mlr3measures::mae(truth = truth,response = pred)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
metric.spline3
mean.metric.spline3 <- colMeans(metric.spline3) #rata-rata untuk 5 folds
mean.metric.spline3
}
)
best.spline3 <- cbind(df=df,best.spline3)
basis<-data.frame(rbind(best.spline3 %>% slice_min(rmse), #berdasarkan rmse
best.spline3 %>% slice_min(mae))) #berdasarkan mae
basisMenentukan Knot Berdasarkan Banyaknya Fungsi Basis Terpilih
df_2<-attr(ns(AutoEropa$horsepower, df=2),"knots")
df_2## 50%
## 76.5
Cross Validation
set.seed(123)
cross.val <- vfold_cv(AutoEropa,v=5,strata = "mpg")## Warning: The number of observations in each quantile is below the recommended
## threshold of 20. Stratification will be done with 3 breaks instead.
metric.spline3.ns1 <- map_dfr(cross.val$splits,
function(x){
mod <- lm(mpg ~ ns(horsepower, knots = c(76.5)),
data=AutoEropa[x$in_id,])
pred <- predict(mod,
newdata=AutoEropa[-x$in_id,])
truth <- AutoEropa[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth,
response = pred
)
mae <- mlr3measures::mae(truth = truth,
response = pred
)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
mean.metric.spline3.ns1 <- colMeans(metric.spline3.ns1)# menghitung rata-rata untuk 5 folds
metric.spline3.ns2 <- map_dfr(cross.val$splits,
function(x){
mod <- lm(mpg ~ ns(horsepower, knots = c(88, 105)),
data=AutoEropa[x$in_id,])
pred <- predict(mod,
newdata=AutoEropa[-x$in_id,])
truth <- AutoEropa[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth,
response = pred
)
mae <- mlr3measures::mae(truth = truth,
response = pred
)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
mean.metric.spline3.ns2 <- colMeans(metric.spline3.ns2)# menghitung rata-rata untuk 5 folds
nilai.cv.ns <- rbind("NCS1" = mean.metric.spline3.ns1,
"NCS2" = mean.metric.spline3.ns2)
nama.model.ncs <- c("1 - NCS Knots 76.5",
"2 - NCS Knots 80, 150")
model.ncs<-data.frame(Model = nama.model.ncs, nilai.cv.ns)
model.ncsSecara empiris berdasarkan rmse dan mae, banyaknya knot terbaik adalah 1 knot.
Visualisasi Grafik
nc1 <- ggplot(AutoEropa, aes(x = horsepower, y = mpg)) +
geom_point(alpha = 0.55, color="black") +
stat_smooth(method = "lm", formula = y~ns(x, knots = c(76.5)), col = "blue", se = F) +
theme_bw()
nc2 <- ggplot(AutoEropa, aes(x = horsepower, y = mpg)) +
geom_point(alpha = 0.55, color="black") +
stat_smooth(method = "lm", formula = y~ns(x, knots = c(80,105)), col = "blue", se = F)+
theme_bw()
plot_grid(nc1, nc2, labels = c("nc1", "nc2"), label_size = 8)Secara empiris berdasarkan rmse dan mae, grafik yang memuat banyaknya 1 knot lebih menggambarkan data dibanding dengan 2 knot. Oleh karena itu dipilih 1 knot.
Perbandingan Model
perbandingan.model <- rbind(cv.poly2(AutoEropa), cv.pc2(AutoEropa)[1,-1], model.ncs[1,-1])## Warning: The number of observations in each quantile is below the recommended
## threshold of 20. Stratification will be done with 3 breaks instead.
## Warning: The number of observations in each quantile is below the recommended
## threshold of 20. Stratification will be done with 3 breaks instead.
perbandingan.model$metode <- c("Polinomial Ordo 2","Piecewise Constant 5","NCS 1 Knot 105")
perbandingan.modelBerarti, model terbaik untuk data Auto Eropa adalah NCS 1 Knot 105.
poly2 <- ggplot(AutoEropa,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="black") +
stat_smooth(method = "lm",
formula = y~poly(x,2,raw=T),
lty = 1, col = "blue",se = F)+
theme_bw()
pc5 <- ggplot(AutoEropa,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="black") +
stat_smooth(method = "lm",
formula = y~cut(x,5),
lty = 1, col = "blue",se = F)+
theme_bw()
nc1 <- ggplot(AutoEropa, aes(x = horsepower, y = mpg)) +
geom_point(alpha = 0.55, color="black") +
stat_smooth(method = "lm", formula = y~ns(x, knots = c(76.5)), col = "blue", se = F) +
theme_bw()
plot_grid(poly2, pc5,nc1, labels = c("poly2","pc5","nc1"), label_size = 6)Regresi Data Jepang
Regresi Polinomial
cross validation untuk menghasilkan pemodelan mpg vs horsepower optimal dengan regresi polinomial.
set.seed(123)
cv.poly2 <- function(dataset){
set.seed(123)
cross_val <- vfold_cv(dataset,v=5,strata = "mpg")
metric_poly2 <- map_dfr(cross_val$splits,
function(x){
mod <- lm(mpg ~ poly(horsepower,2,raw = T),
data=dataset[x$in_id,])
pred <- predict(mod,
newdata=dataset[-x$in_id,])
truth <- dataset[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth,response = pred)
mae <- mlr3measures::mae(truth = truth,response = pred)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
poly2<- colMeans(metric_poly2) #menghitung rmse dan mae rata-rata untuk 10 folds
return(poly2)
}
cv.poly3 <- function(dataset){
set.seed(123)
cross_val <- vfold_cv(dataset,v=5,strata = "mpg")
metric_poly3 <- map_dfr(cross_val$splits,
function(x){
mod <- lm(mpg ~ poly(horsepower,3,raw = T),
data=dataset[x$in_id,])
pred <- predict(mod,
newdata=dataset[-x$in_id,])
truth <- dataset[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth,response = pred)
mae <- mlr3measures::mae(truth = truth,response = pred)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
poly3<- colMeans(metric_poly3) #menghitung rmse dan mae rata-rata untuk 5 folds
return(poly3)
}
cv.poly4 <- function(dataset){
set.seed(123)
cross_val <- vfold_cv(dataset,v=5,strata = "mpg")
metric_poly4 <- map_dfr(cross_val$splits,
function(x){
mod <- lm(mpg ~ poly(horsepower,4,raw = T),
data=dataset[x$in_id,])
pred <- predict(mod,
newdata=dataset[-x$in_id,])
truth <- dataset[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth,response = pred)
mae <- mlr3measures::mae(truth = truth,response = pred)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
poly4<- colMeans(metric_poly4) #menghitung rmse dan mae rata-rata untuk 5 folds
return(poly4)
}
cvpoly <- rbind (cv.poly2(AutoJepang),cv.poly3(AutoJepang),cv.poly4(AutoJepang))## Warning: The number of observations in each quantile is below the recommended
## threshold of 20. Stratification will be done with 3 breaks instead.
## Warning: The number of observations in each quantile is below the recommended
## threshold of 20. Stratification will be done with 3 breaks instead.
## Warning: The number of observations in each quantile is below the recommended
## threshold of 20. Stratification will be done with 3 breaks instead.
cvpoly## rmse mae
## [1,] 4.669195 3.580362
## [2,] 4.081574 3.074327
## [3,] 4.320975 3.240185
Secara empiris jika dilihat dari rmse dan mae terlihat bahwa polinomial pangkat 3 memberikan nilai rmse dan mae yang terkecil sehingga dipilih polinomial pangkat 3.
poly2 <- ggplot(AutoJepang,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="black") +
stat_smooth(method = "lm",
formula = y~poly(x,2,raw=T),
lty = 1, col = "blue",se = F)+
theme_bw()
poly3 <- ggplot(AutoJepang,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="black") +
stat_smooth(method = "lm",
formula = y~poly(x,3,raw=T),
lty = 1, col = "blue",se = F)+
theme_bw()
poly4 <- ggplot(AutoJepang,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="black") +
stat_smooth(method = "lm",
formula = y~poly(x,4,raw=T),
lty = 1, col = "blue",se = F)+
theme_bw()
plot_grid(poly2, poly3, poly4, labels=c("poly2", "poly3", "poly4"), label_size = 6)Fungsi Tangga (Piecewise Constant)
Secara Empiris
set.seed(123)
cv.pc2 <- function(dataset){
set.seed(123)
cross_val <- vfold_cv(dataset,v=5,strata = "mpg")
breaks <- 2:5
best_tangga <- map_dfr(breaks, function(i){
metric_tangga <- map_dfr(cross_val$splits,
function(x){
training <- dataset[x$in_id,]
training$horsepower <- cut(training$horsepower,i)
mod <- lm(mpg ~ horsepower,
data=training)
labs_x <- levels(mod$model[,2])
labs_x_breaks <- cbind(lower = as.numeric( sub("\\((.+),.*", "\\1", labs_x) ),
upper = as.numeric( sub("[^,]*,([^]]*)\\]", "\\1", labs_x) ))
testing <- dataset[-x$in_id,]
horsepower_new <- cut(testing$horsepower,c(labs_x_breaks[1,1],labs_x_breaks[,2]))
pred <- predict(mod,
newdata=list(horsepower=horsepower_new))
truth <- testing$mpg
data_eval <- na.omit(data.frame(truth,pred))
rmse <- mlr3measures::rmse(truth = data_eval$truth,
response = data_eval$pred
)
mae <- mlr3measures::mae(truth = data_eval$truth,
response = data_eval$pred
)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
colMeans(metric_tangga) # menghitung rata-rata untuk 5 folds
}
)
best_tangga <- cbind(breaks=breaks,best_tangga) # menampilkan hasil all breaks
basedonrmse <- as.data.frame(best_tangga %>% slice_min(rmse))
basedonmae <- as.data.frame(best_tangga %>% slice_min(mae))
return(rbind(basedonrmse,basedonmae))
}
cv.pc2(AutoJepang)## Warning: The number of observations in each quantile is below the recommended
## threshold of 20. Stratification will be done with 3 breaks instead.
Secara empiris berdasarkan rmse banyaknya interval yang terbaik adalah 4. Sedangkan berdasarkan mae banyaknya interval yang terbaik adalah 2.
Secara Visual
pc2 <- ggplot(AutoJepang,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="black") +
stat_smooth(method = "lm",
formula = y~cut(x,2),
lty = 1, col = "blue",se = F)+
theme_bw()
pc4 <- ggplot(AutoJepang,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="black") +
stat_smooth(method = "lm",
formula = y~cut(x,4),
lty = 1, col = "blue",se = F)+
theme_bw()
plot_grid(pc2, pc4, labels = c( "pc2", "pc4"), label_size = 6)Sebelumnya Secara empiris berdasarkan rmse banyaknya interval yang terbaik adalah 4. Sedangkan berdasarkan mae banyaknya interval yang terbaik adalah 2.Secara visual, interval 4 lebih baik sehingga dipilih interval 4.
Natural Cubic Spline
Menentukan Banyaknya Fungsi Basis
set.seed(123)
cross.val <- vfold_cv(AutoJepang,v=5,strata = "mpg")## Warning: The number of observations in each quantile is below the recommended
## threshold of 20. Stratification will be done with 3 breaks instead.
df <- 2:3
best.spline3 <- map_dfr(df, function(i){
metric.spline3 <- map_dfr(cross.val$splits,
function(x){
mod <- lm(mpg ~ ns(horsepower,df=i),data=AutoJepang[x$in_id,])
pred <- predict(mod,newdata=AutoJepang[-x$in_id,])
truth <- AutoJepang[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth,response = pred)
mae <- mlr3measures::mae(truth = truth,response = pred)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
metric.spline3
mean.metric.spline3 <- colMeans(metric.spline3) #rata-rata untuk 5 folds
mean.metric.spline3
}
)
best.spline3 <- cbind(df=df,best.spline3)
basis<-data.frame(rbind(best.spline3 %>% slice_min(rmse), #berdasarkan rmse
best.spline3 %>% slice_min(mae))) #berdasarkan mae
basisMenentukan Knot Berdasarkan Banyaknya Fungsi Basis Terpilih
df_3<-attr(ns(AutoJepang$horsepower, df=3),"knots")
df_3## 33.33333% 66.66667%
## 68 90
Cross Validation
set.seed(123)
cross.val <- vfold_cv(AutoJepang,v=5,strata = "mpg")## Warning: The number of observations in each quantile is below the recommended
## threshold of 20. Stratification will be done with 3 breaks instead.
metric.spline3.ns2 <- map_dfr(cross.val$splits,
function(x){
mod <- lm(mpg ~ ns(horsepower, knots = c(68,90)),
data=AutoJepang[x$in_id,])
pred <- predict(mod,
newdata=AutoJepang[-x$in_id,])
truth <- AutoJepang[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth,
response = pred
)
mae <- mlr3measures::mae(truth = truth,
response = pred
)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
mean.metric.spline3.ns2 <- colMeans(metric.spline3.ns2)# menghitung rata-rata untuk 5 folds
metric.spline3.ns3 <- map_dfr(cross.val$splits,
function(x){
mod <- lm(mpg ~ ns(horsepower, knots = c(60, 88, 105)),
data=AutoJepang[x$in_id,])
pred <- predict(mod,
newdata=AutoJepang[-x$in_id,])
truth <- AutoJepang[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth,
response = pred
)
mae <- mlr3measures::mae(truth = truth,
response = pred
)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
mean.metric.spline3.ns3 <- colMeans(metric.spline3.ns3)# menghitung rata-rata untuk 5 folds
nilai.cv.ns <- rbind("NCS2" = mean.metric.spline3.ns2,
"NCS3" = mean.metric.spline3.ns3)
nama.model.ncs <- c("2 - NCS Knots 68,90",
"3 - NCS Knots 60, 88, 105")
model.ncs<-data.frame(Model = nama.model.ncs, nilai.cv.ns)
model.ncsSecara empiris berdasarkan rmse dan mae, banyaknya knot terbaik adalah 2 knot.
Visualisasi Grafik
nc2 <- ggplot(AutoJepang, aes(x = horsepower, y = mpg)) +
geom_point(alpha = 0.55, color="black") +
stat_smooth(method = "lm", formula = y~ns(x, knots = c(68,90)), col = "blue", se = F) +
theme_bw()
nc3 <- ggplot(AutoJepang, aes(x = horsepower, y = mpg)) +
geom_point(alpha = 0.55, color="black") +
stat_smooth(method = "lm", formula = y~ns(x, knots = c(60, 88, 105)), col = "blue", se = F)+
theme_bw()
plot_grid(nc2, nc3, labels = c("nc2", "nc3"), label_size = 8)Secara empiris berdasarkan rmse dan mae, grafik yang memuat banyaknya 2 knot lebih menggambarkan data dibanding dengan 3 knot. Oleh karena itu dipilih 2 knot.
Perbandingan Model
perbandingan.model <- rbind(cv.poly3(AutoJepang), cv.pc2(AutoJepang)[1,-1], model.ncs[1,-1])## Warning: The number of observations in each quantile is below the recommended
## threshold of 20. Stratification will be done with 3 breaks instead.
## Warning: The number of observations in each quantile is below the recommended
## threshold of 20. Stratification will be done with 3 breaks instead.
perbandingan.model$metode <- c("Polinomial Ordo 3","Piecewise Constant 4","NCS 2 Knot 68,90")
perbandingan.modelBerarti, model terbaik untuk data Auto Jepang adalah Polinomial Ordo 3.
poly3 <- ggplot(AutoJepang,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="black") +
stat_smooth(method = "lm",
formula = y~poly(x,3,raw=T),
lty = 1, col = "blue",se = F)+
theme_bw()
pc4 <- ggplot(AutoJepang,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="black") +
stat_smooth(method = "lm",
formula = y~cut(x,4),
lty = 1, col = "blue",se = F)+
theme_bw()
nc2 <- ggplot(AutoJepang, aes(x = horsepower, y = mpg)) +
geom_point(alpha = 0.55, color="black") +
stat_smooth(method = "lm", formula = y~ns(x, knots = c(68,90)), col = "blue", se = F) +
theme_bw()
plot_grid(poly3,pc4,nc2, labels=c("poly3","pc4","nc2"), label_size = 6)Visualisasi Model Terbaik
nc9 <- ggplot(Auto, aes(x = horsepower, y = mpg)) +
geom_point(alpha = 0.55, color="black") +
stat_smooth(method = "lm", formula = y~ns(x, knots = c(67,72,80,88,93.5,100,110,140,157.7)), col = "blue", se = F)+
theme_bw()+
labs(title = "NCS 9 Knots (df=10)",
subtitle = "Auto Data Set - World")
pc11 <- ggplot(AutoAmerika,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="black") +
stat_smooth(method = "lm",
formula = y~cut(x,11),
lty = 1, col = "blue",se = F)+
theme_bw()+
labs(title = "Piecewise Constant 11",
subtitle = "Auto Data Set - Amerika")
nc1 <- ggplot(AutoEropa, aes(x = horsepower, y = mpg)) +
geom_point(alpha = 0.55, color="black") +
stat_smooth(method = "lm", formula = y~ns(x, knots = c(76.5)), col = "blue", se = F) +
theme_bw()+
labs(title = "Natural Cubic Splines Knots 76.5 (df=2)",
subtitle = "Auto Data Set - Eropa")
poly3 <- ggplot(AutoJepang,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="black") +
stat_smooth(method = "lm",
formula = y~poly(x,3,raw=T),
lty = 1, col = "blue",se = F)+
theme_bw()+
labs(title = "Polinomial Ordo 3",
subtitle = "Auto Data Set - Jepang")
plot_grid(nc9, pc11, nc1, poly3)Referensi
Sartono, B.. (November 1, 2020). Moving Beyond Linearity. Retrieved from https://rpubs.com/bagusco/taklinear
Dito, G.A. (April 14, 2021). Regresi Polinomial, Regresi Fungsi Tangga dan Regresi Spline di R. Retrieved from https://gerrydito.github.io/Regresi-Polinomial,-Regresi-Fungsi-Tangga-dan-Regresi-Spline/
Blog: https://profeksis.blogspot.com/, Email: mistereko@apps.ipb.ac.id, Rpubs: https://rpubs.com/profeksis↩︎