STA581 Practicum
Assignment for Meeting 8
Libraries
The following libraries are used.
library(ISLR)
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.4 v dplyr 1.0.7
## v tidyr 1.1.3 v stringr 1.4.0
## v readr 2.0.1 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(dplyr)
library(MultiKink)
library(splines)
library(rsample)
library(mlr3measures)
## In order to avoid name clashes, do not attach 'mlr3measures'. Instead, only load the namespace with `requireNamespace("mlr3measures")` and access the measures directly via `::`, e.g. `mlr3measures::auc()`.
library(ggplot2)
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
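Although mlr3measures is attached above, the metric calls later in this report go through the `::` operator, as its startup message recommends. A minimal sketch of that call with made-up numbers (not part of the original analysis):
# Toy vectors, purely illustrative
truth    <- c(10, 20, 30)
response <- c(12, 18, 33)
mlr3measures::rmse(truth = truth, response = response)
sqrt(mean((truth - response)^2))   # the same value, computed by hand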
Data
attach(Auto)
## The following object is masked from package:ggplot2:
##
## mpg
head(cbind(mpg, horsepower, acceleration, weight))
## mpg horsepower acceleration weight
## [1,] 18 130 12.0 3504
## [2,] 15 165 11.5 3693
## [3,] 18 150 11.0 3436
## [4,] 16 150 12.0 3433
## [5,] 17 140 10.5 3449
## [6,] 15 198 10.0 4341
Functions
# Functions for the 3 models
# CV function for polynomial regression
CVPoly <- function(Data, pangkat, k){
cross_val <- vfold_cv(Data, v=k)
metric_poly <- function(x=cross_val$splits, pangkat){
result <- data.frame()
for(foldth in 1:k){
x_data <- x[[foldth]]
# 1. fit the model on the training data of the current CV fold
mod <- lm(mpg ~ poly(exp, pangkat, raw = T), data=Data[x_data$in_id,])
# 2. predict on the current test fold
pred <- predict(mod, newdata=Data[-x_data$in_id,])
# 3. test-fold response values
truth <- Data[-x_data$in_id,]$mpg
# 4. compute RMSE
rmse <- mlr3measures::rmse(truth = truth, response = pred)
# 5. compute MAE
#mae <- mlr3measures::mae(truth = truth, response = pred)
# 6. compute AIC
aic <- AIC(mod)
# 7. add rmse and aic to vector
metric <- data.frame(RMSE = rmse, AIC = aic)
result <- rbind(result,metric)
}
return(colMeans(result))
}
Polinomial <- data.frame()
poli <- data.frame()
for(i in pangkat){
poli <- data.frame(RMSE = metric_poly(cross_val$splits,i)[1],
AIC = metric_poly(cross_val$splits,i)[2],
row.names = NULL)
Polinomial <- rbind(Polinomial,poli)
}
Polinomial <- cbind(pangkat, Polinomial)
return(Polinomial)
}
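Each CV function in this section indexes the folds the same way: for every split produced by vfold_cv(), `in_id` holds the row numbers of the training (analysis) set, and its complement forms the held-out (assessment) set. A minimal sketch, assuming the two-column data frame Auto_h constructed in the Data subsection below:
# Illustration only; not part of the original output
set.seed(123)
cv <- vfold_cv(Auto_h, v = 10)
split1 <- cv$splits[[1]]
nrow(Auto_h[ split1$in_id, ])   # training rows of fold 1 (about 9/10 of the data)
nrow(Auto_h[-split1$in_id, ])   # held-out rows of fold 1 (about 1/10 of the data)
# rsample's analysis(split1) and assessment(split1) return these same two subsets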
# CV function for piecewise constant regression
CVPieCons <- function(Data, breaks, k){
cross_val <- vfold_cv(Data, v=k)
best_tangga <- map_dfr(breaks, function(i) {
metric_tangga <- map_dfr(cross_val$splits, function(x){
training <- Data[x$in_id,]
training$exp <- cut(training$exp, i)
mod <- lm(mpg ~ exp, data=training)
labs_x <- levels(mod$model[,2])
labs_x_breaks <- cbind(
lower = as.numeric( sub("\\((.+),.*", "\\1", labs_x) ),
upper = as.numeric( sub("[^,]*,([^]]*)\\]", "\\1", labs_x))
)
testing <- Data[-x$in_id,]
exp_new <- cut(testing$exp, c(labs_x_breaks[1,1], labs_x_breaks[,2]))
pred <- predict(mod, newdata=list(exp=exp_new))
truth <- testing$mpg
data_eval <- na.omit(data.frame(truth,pred))
RMSE <- mlr3measures::rmse(truth = data_eval$truth, response = data_eval$pred)
AIC <- AIC(mod)
#mae <- mlr3measures::mae(truth = data_eval$truth, response = data_eval$pred)
metric <- c(RMSE,AIC)
names(metric) <- c("RMSE","AIC")
return(metric)
})
# average the metrics over the 10 folds
mean_metric_tangga <- colMeans(metric_tangga)
return(mean_metric_tangga)
})
best_tangga <- cbind(breaks=breaks,best_tangga)
return(best_tangga)
}
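Inside CVPieCons() the bins are created on the training fold with cut(), so their numeric boundaries must be recovered from the factor labels (strings such as "(100,150]") before the test fold can be cut with exactly the same breakpoints. A minimal sketch of that label parsing, using hypothetical values:
# Hypothetical predictor values, cut into 3 bins as in the training step
labs_x <- levels(cut(c(50, 90, 150, 200), 3))
lower  <- as.numeric(sub("\\((.+),.*", "\\1", labs_x))        # left endpoints
upper  <- as.numeric(sub("[^,]*,([^]]*)\\]", "\\1", labs_x))  # right endpoints
cbind(lower, upper)
# cut(test_values, breaks = c(lower[1], upper)) then reproduces the training bins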
# CV function for natural cubic splines
CVNatCubicSpl <- function(Data, df, k){
cross_val <- vfold_cv(Data, v=k)
best_spline3 <- map_dfr(df, function(i){
metric_spline3 <- map_dfr(cross_val$splits, function(x){
mod <- lm(mpg ~ ns(exp, df=i), data=Data[x$in_id,])
pred <- predict(mod, newdata=Data[-x$in_id,])
truth <- Data[-x$in_id,]$mpg
RMSE <- mlr3measures::rmse(truth = truth, response = pred)
#mae <- mlr3measures::mae(truth = truth, response = pred)
AIC <- AIC(mod)
#mae <- mlr3measures::mae(truth = data_eval$truth, response = data_eval$pred)
metric <- c(RMSE,AIC)
names(metric) <- c("RMSE","AIC")
return(metric)
})
# average the metrics over the 10 folds
mean_metric_spline3 <- colMeans(metric_spline3)
return(mean_metric_spline3)
})
best_spline3 <- cbind(df=df, best_spline3)
return(best_spline3)
}
mpg vs horsepower
Data
Auto_h <- data.frame()
mpg <- Auto$mpg
exp <- Auto$horsepower
Auto_h <- data.frame(mpg,exp)
head(Auto_h)
## mpg exp
## 1 18 130
## 2 15 165
## 3 18 150
## 4 16 150
## 5 17 140
## 6 15 198
Plot
# Plot of the mpg vs horsepower data
plot(Auto$horsepower, Auto$mpg, pch=19, col="black",
xlab="horsepower", ylab="mile per gallon")Pemodelan
set.seed(123)
# Polynomial regression results
Polinomial <- CVPoly(Auto_h, 2:8, 10)
Polinomial
## pangkat RMSE AIC
## 1 2 4.328239 2047.195
## 2 3 4.328164 2048.395
## 3 4 4.327918 2049.059
## 4 5 4.281743 2042.481
## 5 6 4.274428 2040.807
## 6 7 4.255062 2039.645
## 7 8 4.267810 2041.321
(Best_RMSE <- (Polinomial %>% arrange(RMSE))[1,])
## pangkat RMSE AIC
## 1 7 4.255062 2039.645
(Best_AIC <- (Polinomial %>% arrange(AIC))[1,])
## pangkat RMSE AIC
## 1 7 4.255062 2039.645
Based on the smallest RMSE and AIC, the best polynomial regression model is the degree-7 model. However, the plot of the degree-7 fit shows an erratic curve near the ends of the horsepower range, and the RMSE and AIC values differ only slightly across the degrees considered. The model chosen is therefore the one with the smallest RMSE and AIC among those whose curve stays well behaved at the ends, namely the degree-4 model.
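To make that boundary behaviour concrete (this check is not part of the original output), the degree-7 and degree-4 fits can be compared on a small grid near the upper end of the horsepower range, where observations are sparse; a sketch:
# Orthogonal polynomial fits, matching the formulas used in the plots below
fit7 <- lm(mpg ~ poly(exp, 7), data = Auto_h)
fit4 <- lm(mpg ~ poly(exp, 4), data = Auto_h)
edge <- data.frame(exp = seq(180, max(Auto_h$exp), length.out = 6))
round(data.frame(exp = edge$exp,
                 pred_d7 = predict(fit7, newdata = edge),
                 pred_d4 = predict(fit4, newdata = edge)), 2)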
BestPoly.h <- Polinomial[Polinomial$pangkat==4,2:3]
BestPoly.h
## RMSE AIC
## 3 4.327918 2049.059
p.poly.h <- ggplot(Auto_h, aes(x=exp, y=mpg)) +
geom_point(alpha=0.55, color="black") +
stat_smooth(method = "lm", formula = y ~ poly(x, 7), lty = 1, col = "blue",se = F) +
labs(title = "mpg vs horsepower", subtitle = "Polynomial d = 7") +
theme_bw()
p.polybest.h <- ggplot(Auto_h, aes(x=exp, y=mpg)) +
geom_point(alpha=0.55, color="black") +
stat_smooth(method = "lm", formula = y ~ poly(x, 4), lty = 1, col = "blue",se = F) +
labs(title = "mpg vs horsepower", subtitle = "Polynomial d = 4") +
theme_bw()
grid.arrange(p.poly.h, p.polybest.h, ncol=2)
set.seed(123)
# Piecewise constant results
PieCons <- CVPieCons(Auto_h, 3:10, 10)
PieCons
## breaks RMSE AIC
## 1 3 5.740349 2245.134
## 2 4 4.943248 2143.313
## 3 5 4.688531 2103.087
## 4 6 4.642824 2097.658
## 5 7 4.513490 2081.595
## 6 8 4.394890 2063.486
## 7 9 4.525875 2080.801
## 8 10 4.543401 2084.919
(Best_RMSE <- (PieCons %>% arrange(RMSE))[1,])
## breaks RMSE AIC
## 1 8 4.39489 2063.486
(Best_AIC <- (PieCons %>% arrange(AIC))[1,])
## breaks RMSE AIC
## 1 8 4.39489 2063.486
Based on the smallest RMSE and AIC, and on the shape of the fitted curve, the best piecewise constant model is the one with breaks = 8.
BestPie.h <- PieCons[PieCons$breaks==8,2:3]
BestPie.h
## RMSE AIC
## 6 4.39489 2063.486
p.pieconsbest.h <- ggplot(Auto_h, aes(x=exp, y=mpg)) +
geom_point(alpha=0.55, color="black") +
stat_smooth(method = "lm", formula = y ~ cut(x, 8), lty = 1, col = "blue",se = F) +
labs(title = "mpg vs horsepower", subtitle = "PC breaks = 8") +
theme_bw()
p.pieconsbest.h
set.seed(123)
# Natural cubic spline results
NatCubicSpl <- CVNatCubicSpl(Auto_h, 2:11, 10)
NatCubicSpl
## df RMSE AIC
## 1 2 4.307281 2043.990
## 2 3 4.316587 2046.246
## 3 4 4.291779 2041.820
## 4 5 4.274236 2039.912
## 5 6 4.283307 2039.328
## 6 7 4.250278 2035.696
## 7 8 4.273412 2037.512
## 8 9 4.253688 2036.594
## 9 10 4.225783 2034.172
## 10 11 4.271550 2038.978
(Best_RMSE <- (NatCubicSpl %>% arrange(RMSE))[1,])
## df RMSE AIC
## 1 10 4.225783 2034.172
(Best_AIC <- (NatCubicSpl %>% arrange(AIC))[1,])
## df RMSE AIC
## 1 10 4.225783 2034.172
# Ordinary (interior) knots chosen automatically by ns()
attr(ns(Auto_h$exp, df=10),"knots")
## 10% 20% 30% 40% 50% 60% 70% 80% 90%
## 67.0 72.0 80.0 88.0 93.5 100.0 110.0 140.0 157.7
Based on the smallest RMSE and AIC, the best natural cubic spline model is the one with df = 10, i.e. knots at (67, 72, 80, 88, 93.5, 100, 110, 140, 157.7).
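These automatically chosen knots are simply quantiles of the predictor: with df = 10 and no intercept term, ns() places df - 1 = 9 interior knots at the 10%, 20%, ..., 90% quantiles of horsepower. A quick check (not in the original output):
# Should reproduce the attr(..., "knots") values printed above
quantile(Auto_h$exp, probs = seq(0.1, 0.9, by = 0.1))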
p.natcubicspl.h <- ggplot(Auto_h, aes(x=exp, y=mpg)) +
geom_point(alpha=0.55, color="black") +
stat_smooth(method = "lm", formula = y ~ ns(x, df = 10), lty = 1, col = "blue",se = F) +
labs(title = "mpg vs horsepower", subtitle = "Natural Cubic Splines df = 10") +
theme_bw()
p.natcubicspl.h
However, the visualization shows that parts of the curve look erratic. Comparing models whose knots were modified and reduced starting from the df = 10 set, the best model visually is the one with knots = c(85.00, 105.00, 180.00); its curve is smoother than that of the df = 10 model.
p.natcubicspl.h1 <- ggplot(Auto_h, aes(x=exp, y=mpg)) +
geom_point(alpha=0.55, color="black") +
stat_smooth(method = "lm", formula = y ~ ns(x, knots = c(80, 100, 110, 157.7)), lty = 1, col = "blue",se = F) +
labs(title = "mpg vs horsepower", subtitle = "NcS Knots 80, 100, 110, 157.7") +
theme_bw()
p.natcubicsplbest.h <- ggplot(Auto_h, aes(x=exp, y=mpg)) +
geom_point(alpha=0.55, color="black") +
stat_smooth(method = "lm", formula = y ~ ns(x, knots = c(85.00,105.00,180.00)), lty = 1, col = "blue",se = F) +
labs(title = "mpg vs horsepower", subtitle = "Knots 85.00,105.00,180.00") +
theme_bw()
grid.arrange(p.natcubicspl.h1, p.natcubicsplbest.h, ncol=2)
set.seed(123)
cross_valh <- vfold_cv(Auto_h, v=10)
metric_spline3 <- map_dfr(cross_valh$splits, function(x){
mod <- lm(mpg ~ ns(exp, knots = c(85.00,105.00,180.00)), data=Auto_h[x$in_id,])
pred <- predict(mod, newdata=Auto_h[-x$in_id,])
truth <- Auto_h[-x$in_id,]$mpg
RMSE <- mlr3measures::rmse(truth = truth, response = pred)
#mae <- mlr3measures::mae(truth = truth, response = pred)
AIC <- AIC(mod)
#mae <- mlr3measures::mae(truth = data_eval$truth, response = data_eval$pred)
metric <- c(RMSE,AIC)
names(metric) <- c("RMSE","AIC")
return(metric)
})
(Best3.h <- (mean_metric_spline3 <- colMeans(metric_spline3)))
## RMSE AIC
## 4.318277 2047.706586
mpg vs displacement
Data
Auto_d <- data.frame()
mpg <- Auto$mpg
exp <- Auto$displacement
Auto_d <- data.frame(mpg,exp)
head(Auto_d)
## mpg exp
## 1 18 307
## 2 15 350
## 3 18 318
## 4 16 304
## 5 17 302
## 6 15 429
Plot
# Plot of the mpg vs displacement data
plot(Auto$displacement, Auto$mpg, pch=19, col="black",
     xlab="displacement", ylab="miles per gallon")
Modeling
set.seed(123)
# Polynomial regression results
Polinomial <- CVPoly(Auto_d, 2:10, 10)
Polinomial
## pangkat RMSE AIC
## 1 2 4.268993 2045.484
## 2 3 4.280184 2046.406
## 3 4 4.297514 2048.227
## 4 5 4.313349 2049.587
## 5 6 4.299478 2046.224
## 6 7 4.263486 2039.280
## 7 8 4.212933 2032.596
## 8 9 4.155368 2025.211
## 9 10 4.115678 2020.407
(Best_RMSE <- (Polinomial %>% arrange(RMSE))[1,])
## pangkat RMSE AIC
## 1 10 4.115678 2020.407
(Best_AIC <- (Polinomial %>% arrange(AIC))[1,])
## pangkat RMSE AIC
## 1 10 4.115678 2020.407
Based on the smallest RMSE and AIC, the best polynomial regression model is the degree-10 model. However, the plot of the degree-10 fit shows erratic parts of the curve, and the RMSE and AIC values differ only slightly across the degrees considered. The model chosen is therefore the one with the smallest RMSE and AIC among those with a well-behaved curve, namely the degree-4 model.
BestPoly.d <- Polinomial[Polinomial$pangkat==4,2:3]
BestPoly.d
## RMSE AIC
## 3 4.297514 2048.227
p.polybest.d <- ggplot(Auto_d, aes(x=exp, y=mpg)) +
geom_point(alpha=0.55, color="black") +
stat_smooth(method = "lm", formula = y ~ poly(x, 4), lty = 1, col = "blue",se = F) +
labs(title = "mpg vs displacement", subtitle = "Polynomial d = 4") +
theme_bw()
p.poly.d2 <- ggplot(Auto_d, aes(x=exp, y=mpg)) +
geom_point(alpha=0.55, color="black") +
stat_smooth(method = "lm", formula = y ~ poly(x, 5), lty = 1, col = "blue",se = F) +
labs(title = "mpg vs displacement", subtitle = "Polynomial d = 5") +
theme_bw()
p.poly.d3 <- ggplot(Auto_d, aes(x=exp, y=mpg)) +
geom_point(alpha=0.55, color="black") +
stat_smooth(method = "lm", formula = y ~ poly(x, 6), lty = 1, col = "blue",se = F) +
labs(title = "mpg vs displacement", subtitle = "Polynomial d = 6") +
theme_bw()
grid.arrange(p.polybest.d, p.poly.d2, p.poly.d3, ncol=3)
set.seed(123)
# Piecewise constant results
PieCons <- CVPieCons(Auto_d, 3:10, 10)
PieCons
## breaks RMSE AIC
## 1 3 4.852760 2134.269
## 2 4 4.773902 2129.232
## 3 5 4.601289 2101.715
## 4 6 4.538270 2097.144
## 5 7 4.523365 2087.522
## 6 8 4.342332 2062.307
## 7 9 4.209561 2032.645
## 8 10 4.262702 2035.780
(Best_RMSE <- (PieCons %>% arrange(RMSE))[1,])
## breaks RMSE AIC
## 1 9 4.209561 2032.645
(Best_AIC <- (PieCons %>% arrange(AIC))[1,])
## breaks RMSE AIC
## 1 9 4.209561 2032.645
Based on the smallest RMSE and AIC, and on the shape of the fitted curve, the best piecewise constant model is the one with breaks = 9.
BestPie.d <- PieCons[PieCons$breaks==9,2:3]
BestPie.d
## RMSE AIC
## 7 4.209561 2032.645
p.pieconsbest.d <- ggplot(Auto_d, aes(x=exp, y=mpg)) +
geom_point(alpha=0.55, color="black") +
stat_smooth(method = "lm", formula = y ~ cut(x, 9), lty = 1, col = "blue",se = F) +
labs(title = "mpg vs displacement", subtitle = "PC breaks = 9") +
theme_bw()
p.pieconsbest.d
set.seed(123)
# Natural cubic spline results
NatCubicSpl <- CVNatCubicSpl(Auto_d, 2:15, 10)
NatCubicSpl
## df RMSE AIC
## 1 2 4.265964 2044.248
## 2 3 4.284071 2045.618
## 3 4 4.319907 2047.561
## 4 5 4.296884 2041.463
## 5 6 4.213630 2028.309
## 6 7 4.160873 2023.425
## 7 8 4.156561 2022.296
## 8 9 4.129745 2018.862
## 9 10 4.126147 2016.601
## 10 11 4.083069 2009.308
## 11 12 4.061449 2008.255
## 12 13 4.077891 2011.270
## 13 14 4.111637 2014.261
## 14 15 4.119116 2012.548
(Best_RMSE <- (NatCubicSpl %>% arrange(RMSE))[1,])
## df RMSE AIC
## 1 12 4.061449 2008.255
(Best_AIC <- (NatCubicSpl %>% arrange(AIC))[1,])
## df RMSE AIC
## 1 12 4.061449 2008.255
# Ordinary (interior) knots chosen automatically by ns()
attr(ns(Auto_d$exp, df=12),"knots")
## 8.333333% 16.66667% 25% 33.33333% 41.66667% 50% 58.33333% 66.66667%
## 89.0000 97.0000 105.0000 119.0000 130.9167 151.0000 200.0000 232.0000
## 75% 83.33333% 91.66667%
## 275.7500 318.0000 351.0000
Based on the smallest RMSE and AIC, the best natural cubic spline model is the one with df = 12, i.e. knots at (89.0000, 97.0000, 105.0000, 119.0000, 130.9167, 151.0000, 200.0000, 232.0000, 275.7500, 318.0000, 351.0000).
p.natcubicspl.d <- ggplot(Auto_d, aes(x=exp, y=mpg)) +
geom_point(alpha=0.55, color="black") +
stat_smooth(method = "lm", formula = y ~ ns(x, df = 12), lty = 1, col = "blue",se = F) +
labs(title = "mpg vs displacement", subtitle = "Natural Cubic Splines df = 12") +
theme_bw()
p.natcubicspl.d
However, the visualization shows that parts of the curve look erratic. With knots = c(120, 140, 231.15, 400) the fitted curve is smoother than that of the df = 12 model, so the model with knots = c(120, 140, 231.15, 400) is chosen as best.
p.natcubicspl.d1 <- ggplot(Auto_d, aes(x=exp, y=mpg)) +
geom_point(alpha=0.55, color="black") +
stat_smooth(method = "lm", formula = y ~ ns(x, knots = c(120,140,151,351)), lty = 1, col = "blue",se = F) +
labs(title = "mpg vs displacement", subtitle = "NCS Knots 120, 140, 151, 351") +
theme_bw()
p.natcubicsplbest.d <- ggplot(Auto_d, aes(x=exp, y=mpg)) +
geom_point(alpha=0.55, color="black") +
stat_smooth(method = "lm", formula = y ~ ns(x, knots = c(120,140,231.15,400)), lty = 1, col = "blue",se = F) +
labs(title = "mpg vs displacement", subtitle = "NCS Knots 120 140 231.15 400") +
theme_bw()
grid.arrange(p.natcubicspl.d1, p.natcubicsplbest.d, ncol=2)
set.seed(123)
cross_vald <- vfold_cv(Auto_d, v=10)
metric_spline3 <- map_dfr(cross_vald$splits, function(x){
mod <- lm(mpg ~ ns(exp, knots = c(120,140,231.15,400)), data=Auto_d[x$in_id,])
pred <- predict(mod, newdata=Auto_d[-x$in_id,])
truth <- Auto_d[-x$in_id,]$mpg
RMSE <- mlr3measures::rmse(truth = truth, response = pred)
#mae <- mlr3measures::mae(truth = truth, response = pred)
AIC <- AIC(mod)
#mae <- mlr3measures::mae(truth = data_eval$truth, response = data_eval$pred)
metric <- c(RMSE,AIC)
names(metric) <- c("RMSE","AIC")
return(metric)
})
(Best3.d <- (mean_metric_spline3 <- colMeans(metric_spline3)))
## RMSE AIC
## 4.31826 2048.79815
mpg vs weight
Data
Auto_w <- data.frame()
mpg <- Auto$mpg
exp <- Auto$weight
Auto_w <- data.frame(mpg,exp)
head(Auto_w)
## mpg exp
## 1 18 3504
## 2 15 3693
## 3 18 3436
## 4 16 3433
## 5 17 3449
## 6 15 4341
Plot
# Plot of the mpg vs weight data
plot(Auto$weight, Auto$mpg, pch=19, col="black",
     xlab="weight", ylab="miles per gallon")
Modeling
set.seed(123)
# Polynomial regression results
Polinomial <- CVPoly(Auto_w, 2:8, 10)
Polinomial
## pangkat RMSE AIC
## 1 2 4.083225 2014.527
## 2 3 4.086215 2016.491
## 3 4 4.096702 2017.995
## 4 5 4.110388 2019.112
## 5 6 4.121985 2020.423
## 6 7 4.115721 2020.775
## 7 8 4.125211 2022.722
(Best_RMSE <- (Polinomial %>% arrange(RMSE))[1,])
## pangkat RMSE AIC
## 1 2 4.083225 2014.527
(Best_AIC <- (Polinomial %>% arrange(AIC))[1,])
## pangkat RMSE AIC
## 1 2 4.083225 2014.527
Based on the smallest RMSE and AIC, and on the visualization of the degree-2 polynomial fit, the best polynomial regression model is the degree-2 model.
BestPoly.w <- Polinomial[Polinomial$pangkat==2,2:3]
BestPoly.w
## RMSE AIC
## 1 4.083225 2014.527
p.polybest.w <- ggplot(Auto_w, aes(x=exp, y=mpg)) +
geom_point(alpha=0.55, color="black") +
stat_smooth(method = "lm", formula = y ~ poly(x, 2), lty = 1, col = "blue",se = F) +
labs(title = "mpg vs weight", subtitle = "Polynomial d = 2") +
theme_bw()
p.polybest.w
set.seed(123)
# Piecewise constant results
PieCons <- CVPieCons(Auto_w, 3:10, 10)
PieCons
## breaks RMSE AIC
## 1 3 4.831174 2122.146
## 2 4 4.668478 2096.702
## 3 5 4.356145 2055.344
## 4 6 4.172338 2023.892
## 5 7 4.246013 2038.817
## 6 8 4.312464 2053.980
## 7 9 4.263321 2042.495
## 8 10 4.261454 2047.200
(Best_RMSE <- (PieCons %>% arrange(RMSE))[1,])
## breaks RMSE AIC
## 1 6 4.172338 2023.892
(Best_AIC <- (PieCons %>% arrange(AIC))[1,])
## breaks RMSE AIC
## 1 6 4.172338 2023.892
Based on the smallest RMSE and AIC, and on the shape of the fitted curve, the best piecewise constant model is the one with breaks = 6.
BestPie.w <- PieCons[PieCons$breaks==6,2:3]
BestPie.w
## RMSE AIC
## 4 4.172338 2023.892
p.pieconsbest.w <- ggplot(Auto_w, aes(x=exp, y=mpg)) +
geom_point(alpha=0.55, color="black") +
stat_smooth(method = "lm", formula = y ~ cut(x, 6), lty = 1, col = "blue",se = F) +
labs(title = "mpg vs weight", subtitle = "PC breaks = 6") +
theme_bw()
p.pieconsbest.w
set.seed(123)
# Natural cubic spline results
NatCubicSpl <- CVNatCubicSpl(Auto_w, 2:10, 10)
NatCubicSpl
## df RMSE AIC
## 1 2 4.085148 2014.337
## 2 3 4.088267 2016.159
## 3 4 4.112470 2017.890
## 4 5 4.117601 2019.397
## 5 6 4.109858 2016.819
## 6 7 4.094677 2013.657
## 7 8 4.105949 2017.100
## 8 9 4.114397 2018.312
## 9 10 4.114696 2017.134
(Best_RMSE <- (NatCubicSpl %>% arrange(RMSE))[1,])
## df RMSE AIC
## 1 2 4.085148 2014.337
(Best_AIC <- (NatCubicSpl %>% arrange(AIC))[1,])
## df RMSE AIC
## 1 7 4.094677 2013.657
# Ordinary (interior) knots chosen automatically by ns()
attr(ns(Auto_w$exp, df=7),"knots")
## 14.28571% 28.57143% 42.85714% 57.14286% 71.42857% 85.71429%
## 2074.857 2285.429 2635.000 2986.571 3446.143 4096.286
Based on the RMSE and AIC values (df = 2 has the smallest RMSE, df = 7 the smallest AIC), the natural cubic spline chosen as best is the df = 7 model, i.e. knots at (2074.857, 2285.429, 2635.000, 2986.571, 3446.143, 4096.286).
p.natcubicspl.w <- ggplot(Auto_w, aes(x=exp, y=mpg)) +
geom_point(alpha=0.55, color="black") +
stat_smooth(method = "lm", formula = y ~ ns(x, df = 7), lty = 1, col = "blue",se = F) +
labs(title = "mpg vs weight", subtitle = "NCS df = 7") +
theme_bw()
p.natcubicspl.w
However, the visualization shows that parts of the curve look erratic. With knots = c(2285.429, 2635.000, 2986.571, 3446.143, 4096.286), i.e. the df = 7 knots without the first one, the fitted curve is smoother than that of the df = 7 model, so this is the model chosen as best.
p.natcubicsplbest.w <- ggplot(Auto_w, aes(x=exp, y=mpg)) +
geom_point(alpha=0.55, color="black") +
stat_smooth(method = "lm", formula = y ~ ns(x, knots = c(2285.429, 2635.000, 2986.571, 3446.143, 4096.286)), lty = 1, col = "blue",se = F) +
labs(title = "mpg vs weight", subtitle = "NCS Knots 2285.429, 2635.000, 2986.571, 3446.143, 4096.286") +
theme_bw()
p.natcubicsplbest.w
set.seed(123)
cross_valw <- vfold_cv(Auto_w, v=10)
metric_spline3 <- map_dfr(cross_valw$splits, function(x){
mod <- lm(mpg ~ ns(exp, knots = c(2285.429, 2635.000, 2986.571, 3446.143, 4096.286)), data=Auto_w[x$in_id,])
pred <- predict(mod, newdata=Auto_w[-x$in_id,])
truth <- Auto_w[-x$in_id,]$mpg
RMSE <- mlr3measures::rmse(truth = truth, response = pred)
#mae <- mlr3measures::mae(truth = truth, response = pred)
AIC <- AIC(mod)
#mae <- mlr3measures::mae(truth = data_eval$truth, response = data_eval$pred)
metric <- c(RMSE,AIC)
names(metric) <- c("RMSE","AIC")
return(metric)
})
(Best3.w <- (mean_metric_spline3 <- colMeans(metric_spline3)))
## RMSE AIC
## 4.126551 2021.601414
Model Comparison
mpg vs horsepower
grid.arrange(p.polybest.h, p.pieconsbest.h, p.natcubicsplbest.h, ncol=3)
data.h <- rbind(BestPoly.h,BestPie.h,Best3.h)
data.h$nama_metode <- c("Polynomial d = 4", "PC breaks = 8", "NCS Knots = 85 105 180")
data.h %>% arrange(RMSE)
## RMSE AIC nama_metode
## 31 4.318277 2047.707 NCS Knots = 85 105 180
## 3 4.327918 2049.059 Polynomial d = 4
## 6 4.394890 2063.486 PC breaks = 8
mpg vs displacement
grid.arrange(p.polybest.d, p.pieconsbest.d, p.natcubicsplbest.d, ncol=3)
data.d <- rbind(BestPoly.d,BestPie.d,Best3.d)
data.d$nama_metode <- c("Polynomial d = 4", "PC breaks = 9", "NCS Knots = 120 140 231.15 400")
data.d %>% arrange(RMSE)
## RMSE AIC nama_metode
## 7 4.209561 2032.645 PC breaks = 9
## 3 4.297514 2048.227 Polynomial d = 4
## 31 4.318260 2048.798 NCS Knots = 120 140 231.15 400
mpg vs weight
grid.arrange(p.polybest.w, p.pieconsbest.w, p.natcubicsplbest.w, ncol=3)
data.w <- rbind(BestPoly.w,BestPie.w,Best3.w)
data.w$nama_metode <- c("Polynomial d = 2", "PC breaks = 6", "NCS Knots 2285.429, 2635.000, 2986.571, 3446.143, 4096.286")
data.w %>% arrange(RMSE)
## RMSE AIC nama_metode
## 1 4.083225 2014.527 Polynomial d = 2
## 2 4.126551 2021.601 NCS Knots 2285.429, 2635.000, 2986.571, 3446.143, 4096.286
## 3 4.172338 2023.892 PC breaks = 6
Conclusion
The earlier plots show a nonlinear relationship between mpg and each of horsepower, displacement, and weight. The best models for mpg vs horsepower, mpg vs displacement, and mpg vs weight are, respectively, a natural cubic spline with knots at 85, 105, 180; a piecewise constant fit with breaks = 9; and a degree-2 polynomial.
grid.arrange(p.natcubicsplbest.h, p.pieconsbest.d, p.polybest.w, ncol=3)