Moving Beyond Linearity
library(tidyverse)## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.4 v dplyr 1.0.7
## v tidyr 1.1.3 v stringr 1.4.0
## v readr 2.0.1 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(splines)
library(rsample)
library(ISLR)A. Data MPG DAN Horsepower
#input data
library(ISLR)
data <- Auto
datampg <- data %>% select(mpg, horsepower)
head(datampg)## mpg horsepower
## 1 18 130
## 2 15 165
## 3 18 150
## 4 16 150
## 5 17 140
## 6 15 198
melihat sebaran data di scatter plot antara variabel respon dan variabel penjelas
ggplot(datampg,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="black") +
theme_bw()berdasarkan data terlihat hubungan antara x dan y tidak linear, maka akan dicoba menggunakan regresi polynomial, fungsi tangga, dan regresi spline.
regresi polynomial
set.seed(123)
cross_val <- vfold_cv(datampg,v=10,strata = "mpg")
df <- 2:4
best_poly <- map_dfr(df, function(i){
metric_poly2 <- map_dfr(cross_val$splits,
function(x){
mod <- lm(mpg ~ poly(horsepower,df = i),
data=datampg[x$in_id,])
pred <- predict(mod,
newdata=datampg[-x$in_id,])
truth <- datampg[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth,
response = pred
)
mae <- mlr3measures::mae(truth = truth,
response = pred
)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
metric_poly2
# menghitung rata-rata untuk 10 folds
mean_metric_poly2 <- colMeans(metric_poly2)
mean_metric_poly2
}
)best_poly <- cbind(df=df,best_poly)
# menampilkan hasil all breaks
best_poly## df rmse mae
## 1 2 4.351129 3.266460
## 2 3 4.355430 3.266945
## 3 4 4.367009 3.280452
#berdasarkan rmse
best_poly %>% slice_min(rmse)## df rmse mae
## 1 2 4.351129 3.26646
#berdasarkan mae
best_poly %>% slice_min(mae)## df rmse mae
## 1 2 4.351129 3.26646
model_poly_df2 <- best_poly %>% slice_min(mae)regresi spline
set.seed(123)
cross_val <- vfold_cv(datampg,v=10,strata = "mpg")
df <- 3:10
best_spline3 <- map_dfr(df, function(i){
metric_spline3 <- map_dfr(cross_val$splits,
function(x){
mod <- lm(mpg ~ bs(horsepower,df=i),
data=datampg[x$in_id,])
pred <- predict(mod,
newdata=datampg[-x$in_id,])
truth <- datampg[-x$in_id,]$horsepower
rmse <- mlr3measures::rmse(truth = truth,
response = pred
)
mae <- mlr3measures::mae(truth = truth,
response = pred
)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
metric_spline3
# menghitung rata-rata untuk 10 folds
mean_metric_spline3 <- colMeans(metric_spline3)
mean_metric_spline3
}
)## Warning in bs(horsepower, degree = 3L, knots = numeric(0), Boundary.knots =
## c(46, : some 'x' values beyond boundary knots may cause ill-conditioned bases
## Warning in bs(horsepower, degree = 3L, knots = c(`50%` = 92.5), Boundary.knots =
## c(46, : some 'x' values beyond boundary knots may cause ill-conditioned bases
## Warning in bs(horsepower, degree = 3L, knots = c(`33.33333%` = 84, `66.66667%` =
## 110: some 'x' values beyond boundary knots may cause ill-conditioned bases
## Warning in bs(horsepower, degree = 3L, knots = c(`25%` = 75, `50%` = 92.5, :
## some 'x' values beyond boundary knots may cause ill-conditioned bases
## Warning in bs(horsepower, degree = 3L, knots = c(`20%` = 72, `40%` = 88, : some
## 'x' values beyond boundary knots may cause ill-conditioned bases
## Warning in bs(horsepower, degree = 3L, knots = c(`16.66667%` = 70, `33.33333%` =
## 84, : some 'x' values beyond boundary knots may cause ill-conditioned bases
## Warning in bs(horsepower, degree = 3L, knots = c(`14.28571%` = 68, `28.57143%`
## = 78.8571428571428, : some 'x' values beyond boundary knots may cause ill-
## conditioned bases
## Warning in bs(horsepower, degree = 3L, knots = c(`12.5%` = 67, `25%` = 75, :
## some 'x' values beyond boundary knots may cause ill-conditioned bases
best_spline3 <- cbind(df=df,best_spline3)
# menampilkan hasil all breaks
best_spline3## df rmse mae
## 1 3 92.41658 81.02794
## 2 4 92.42895 81.03525
## 3 5 92.42668 81.03694
## 4 6 92.42253 81.03160
## 5 7 92.42530 81.02817
## 6 8 92.43221 81.03391
## 7 9 92.43452 81.03576
## 8 10 92.42940 81.02872
#berdasarkan rmse
best_spline3 %>% slice_min(rmse)## df rmse mae
## 1 3 92.41658 81.02794
#berdasarkan mae
best_spline3 %>% slice_min(mae)## df rmse mae
## 1 3 92.41658 81.02794
model_spline_df10 <- best_spline3 %>% slice_min(mae)fungsi tangga
# cross validation
set.seed(123)
cross_val <- vfold_cv(datampg,v=10,strata = "mpg")
breaks <- 2:10
best_tangga <- map_dfr(breaks, function(i){
metric_tangga <- map_dfr(cross_val$splits,
function(x){
training <- datampg[x$in_id,]
training$horsepower <- cut(training$horsepower,i)
mod <- lm(mpg ~ horsepower,
data=training)
labs_x <- levels(mod$model[,2])
labs_x_breaks <- cbind(lower = as.numeric( sub("\\((.+),.*", "\\1", labs_x) ),
upper = as.numeric( sub("[^,]*,([^]]*)\\]", "\\1", labs_x) ))
testing <- datampg[-x$in_id,]
horsepower_new <- cut(testing$horsepower,c(labs_x_breaks[1,1],labs_x_breaks[,2]))
pred <- predict(mod,
newdata=list(horsepower=horsepower_new))
truth <- testing$mpg
data_eval <- na.omit(data.frame(truth,pred))
rmse <- mlr3measures::rmse(truth = data_eval$truth,
response = data_eval$pred
)
mae <- mlr3measures::mae(truth = data_eval$truth,
response = data_eval$pred
)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
metric_tangga
# menghitung rata-rata untuk 10 folds
mean_metric_tangga <- colMeans(metric_tangga)
mean_metric_tangga
})
best_tangga <- cbind(df=breaks,best_tangga)
# menampilkan hasil all breaks
best_tangga## df rmse mae
## 1 2 6.147626 4.790234
## 2 3 5.775792 4.521829
## 3 4 4.985270 3.774549
## 4 5 4.712623 3.571614
## 5 6 4.644383 3.548375
## 6 7 4.554116 3.379980
## 7 8 4.437597 3.405433
## 8 9 4.549668 3.471999
## 9 10 4.589172 3.455531
#berdasarkan rmse
best_tangga %>% slice_min(rmse)## df rmse mae
## 1 8 4.437597 3.405433
#berdasarkan mae
best_tangga %>% slice_min(mae)## df rmse mae
## 1 7 4.554116 3.37998
model_tangga_k7 <- best_tangga %>% slice_min(mae)
model_tangga_k8 <- best_tangga %>% slice_min(rmse)regresi natural spline
set.seed(123)
cross_val <- vfold_cv(datampg,v=10,strata = "mpg")
df <- 3:10
best_spline3 <- map_dfr(df, function(i){
metric_spline3 <- map_dfr(cross_val$splits,
function(x){
mod <- lm(mpg ~ ns(horsepower,df=i),
data=datampg[x$in_id,])
pred <- predict(mod,
newdata=datampg[-x$in_id,])
truth <- datampg[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth,
response = pred
)
mae <- mlr3measures::mae(truth = truth,
response = pred
)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
metric_spline3
# menghitung rata-rata untuk 10 folds
mean_metric_spline3 <- colMeans(metric_spline3)
mean_metric_spline3
}
)best_spline3 <- cbind(df=df,best_spline3)
# menampilkan hasil all breaks
best_spline3## df rmse mae
## 1 3 4.348006 3.261538
## 2 4 4.329461 3.257926
## 3 5 4.313312 3.246412
## 4 6 4.310011 3.237990
## 5 7 4.294671 3.229174
## 6 8 4.293369 3.232221
## 7 9 4.281877 3.211860
## 8 10 4.260316 3.192010
#berdasarkan rmse
best_spline3 %>% slice_min(rmse)## df rmse mae
## 1 10 4.260316 3.19201
#berdasarkan mae
best_spline3 %>% slice_min(mae)## df rmse mae
## 1 10 4.260316 3.19201
model_spline_df10n <- best_spline3 %>% slice_min(mae)perbandingan model
nilai_metric <- rbind(model_poly_df2,
model_tangga_k7,
model_tangga_k8,
model_spline_df10,
model_spline_df10n)
nama_model <- c("Regresi Polynomial (df=2)",
"Fungsi Tangga (k=7)",
"Fungsi Tangga (k=8)",
"Regresi Spline (df=3)",
"Regresi Natural Spline (df=10)"
)
data.frame(nama_model,nilai_metric)## nama_model df rmse mae
## 1 Regresi Polynomial (df=2) 2 4.351129 3.266460
## 2 Fungsi Tangga (k=7) 7 4.554116 3.379980
## 3 Fungsi Tangga (k=8) 8 4.437597 3.405433
## 4 Regresi Spline (df=3) 3 92.416576 81.027940
## 5 Regresi Natural Spline (df=10) 10 4.260316 3.192010
berikut bentuk kurva model terbaik untuk masing - masing metode . polynom df=2
ggplot(datampg,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="black") +
stat_smooth(method = "lm",
formula = y~poly(x,2,raw=T),
lty = 1, col = "blue",se = F)+
theme_bw()Fungsi Tangga (k=7)
ggplot(datampg,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="black") +
stat_smooth(method = "lm",
formula = y~cut(x,7),
lty = 1, col = "blue",se = F)+
theme_bw()Fungsi Tangga (k=8)
ggplot(datampg,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="black") +
stat_smooth(method = "lm",
formula = y~cut(x,8),
lty = 1, col = "blue",se = F)+
theme_bw()Regresi Spline (df=3)
ggplot(datampg,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="black")+
stat_smooth(method = "lm",
formula = y~bs(x,3),
lty = 1,se=F)Regresi Natural Spline (df=10)
ggplot(datampg,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="black")+
stat_smooth(method = "lm",
formula = y~ns(x,10),
lty = 1,se=F)Berdasarkan hasil analisis model Regresi Polynomial (df=2) merupakan model terbaik karena memiliki nilai RMSE dan MAE yang cukup kecil dan bentuk plotnya yang lebih mulus dibandingkan dengan model-model yang lain
B. Data MPG dan WEIGHT
#input data
library(ISLR)
data <- Auto
dataw <- data %>% select(mpg, weight)
head(dataw)## mpg weight
## 1 18 3504
## 2 15 3693
## 3 18 3436
## 4 16 3433
## 5 17 3449
## 6 15 4341
melihat sebaran data di scatter plot
ggplot(dataw,aes(x=weight, y=mpg)) +
geom_point(alpha=0.55, color="black") +
theme_bw()berdasarkan data terlihat hubungan antara x dan y tidak linear, maka akan dicoba menggunakan regresi polynomial, fungsi tangga, dan regresi spline, dan regresi natural spline.
regresi polynomial
set.seed(123)
cross_val <- vfold_cv(dataw,v=10,strata = "mpg")
df <- 2:4
best_poly <- map_dfr(df, function(i){
metric_poly2 <- map_dfr(cross_val$splits,
function(x){
mod <- lm(mpg ~ poly(weight,df = i),
data=dataw[x$in_id,])
pred <- predict(mod,
newdata=dataw[-x$in_id,])
truth <- dataw[-x$in_id,]$weight
rmse <- mlr3measures::rmse(truth = truth,
response = pred
)
mae <- mlr3measures::mae(truth = truth,
response = pred
)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
metric_poly2
# menghitung rata-rata untuk 10 folds
mean_metric_poly2 <- colMeans(metric_poly2)
mean_metric_poly2
}
)best_poly <- cbind(df=df,best_poly)
# menampilkan hasil all breaks
best_poly## df rmse mae
## 1 2 3074.796 2954.365
## 2 3 3074.796 2954.367
## 3 4 3074.796 2954.368
#berdasarkan rmse
best_poly %>% slice_min(rmse)## df rmse mae
## 1 4 3074.796 2954.368
#berdasarkan mae
best_poly %>% slice_min(mae)## df rmse mae
## 1 2 3074.796 2954.365
model_poly_df2 <- best_poly %>% slice_min(mae)fungsi tangga
# cross validation
set.seed(123)
cross_val <- vfold_cv(dataw,v=10,strata = "mpg")
breaks <- 2:10
best_tangga <- map_dfr(breaks, function(i){
metric_tangga <- map_dfr(cross_val$splits,
function(x){
training <- dataw[x$in_id,]
training$weight <- cut(training$weight,i)
mod <- lm(mpg ~ weight,
data=training)
labs_x <- levels(mod$model[,2])
labs_x_breaks <- cbind(lower = as.numeric( sub("\\((.+),.*", "\\1", labs_x) ),
upper = as.numeric( sub("[^,]*,([^]]*)\\]", "\\1", labs_x) ))
testing <- dataw[-x$in_id,]
weight_new <- cut(testing$weight,c(labs_x_breaks[1,1],labs_x_breaks[,2]))
pred <- predict(mod,
newdata=list(weight=weight_new))
truth <- testing$mpg
data_eval <- na.omit(data.frame(truth,pred))
rmse <- mlr3measures::rmse(truth = data_eval$truth,
response = data_eval$pred
)
mae <- mlr3measures::mae(truth = data_eval$truth,
response = data_eval$pred
)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
metric_tangga
# menghitung rata-rata untuk 10 folds
mean_metric_tangga <- colMeans(metric_tangga)
mean_metric_tangga
})
best_tangga <- cbind(df=breaks,best_tangga)
# menampilkan hasil all breaks
best_tangga## df rmse mae
## 1 2 5.570088 4.345368
## 2 3 4.868847 3.657505
## 3 4 4.706593 3.633584
## 4 5 4.389515 3.221556
## 5 6 4.201264 3.112221
## 6 7 4.279395 3.191726
## 7 8 4.370964 3.249765
## 8 9 4.341743 3.182396
## 9 10 4.329114 3.176390
#berdasarkan rmse
best_tangga %>% slice_min(rmse)## df rmse mae
## 1 6 4.201264 3.112221
#berdasarkan mae
best_tangga %>% slice_min(mae)## df rmse mae
## 1 6 4.201264 3.112221
model_tangga_k7 <- best_tangga %>% slice_min(mae)
model_tangga_k8 <- best_tangga %>% slice_min(rmse)regresi spline
set.seed(123)
cross_val <- vfold_cv(dataw,v=10,strata = "mpg")
df <- 3:10
best_spline3 <- map_dfr(df, function(i){
metric_spline3 <- map_dfr(cross_val$splits,
function(x){
mod <- lm(mpg ~ bs(weight,df=i),
data=dataw[x$in_id,])
pred <- predict(mod,
newdata=dataw[-x$in_id,])
truth <- dataw[-x$in_id,]$weight
rmse <- mlr3measures::rmse(truth = truth,
response = pred
)
mae <- mlr3measures::mae(truth = truth,
response = pred
)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
metric_spline3
# menghitung rata-rata untuk 10 folds
mean_metric_spline3 <- colMeans(metric_spline3)
mean_metric_spline3
}
)## Warning in bs(weight, degree = 3L, knots = numeric(0), Boundary.knots =
## c(1613, : some 'x' values beyond boundary knots may cause ill-conditioned bases
## Warning in bs(weight, degree = 3L, knots = numeric(0), Boundary.knots =
## c(1649, : some 'x' values beyond boundary knots may cause ill-conditioned bases
## Warning in bs(weight, degree = 3L, knots = c(`50%` = 2792.5), Boundary.knots =
## c(1613, : some 'x' values beyond boundary knots may cause ill-conditioned bases
## Warning in bs(weight, degree = 3L, knots = c(`50%` = 2835), Boundary.knots =
## c(1649, : some 'x' values beyond boundary knots may cause ill-conditioned bases
## Warning in bs(weight, degree = 3L, knots = c(`33.33333%` = 2391, `66.66667%` =
## 3302: some 'x' values beyond boundary knots may cause ill-conditioned bases
## Warning in bs(weight, degree = 3L, knots = c(`33.33333%` = 2406, `66.66667%`
## = 3380.66666666667: some 'x' values beyond boundary knots may cause ill-
## conditioned bases
## Warning in bs(weight, degree = 3L, knots = c(`25%` = 2227.5, `50%` = 2792.5, :
## some 'x' values beyond boundary knots may cause ill-conditioned bases
## Warning in bs(weight, degree = 3L, knots = c(`25%` = 2234, `50%` = 2835, : some
## 'x' values beyond boundary knots may cause ill-conditioned bases
## Warning in bs(weight, degree = 3L, knots = c(`20%` = 2155, `40%` = 2577.8, :
## some 'x' values beyond boundary knots may cause ill-conditioned bases
## Warning in bs(weight, degree = 3L, knots = c(`20%` = 2166.8, `40%` = 2604, :
## some 'x' values beyond boundary knots may cause ill-conditioned bases
## Warning in bs(weight, degree = 3L, knots = c(`16.66667%` = 2125, `33.33333%` =
## 2391, : some 'x' values beyond boundary knots may cause ill-conditioned bases
## Warning in bs(weight, degree = 3L, knots = c(`16.66667%` = 2130, `33.33333%` =
## 2406, : some 'x' values beyond boundary knots may cause ill-conditioned bases
## Warning in bs(weight, degree = 3L, knots = c(`14.28571%` = 2076.42857142857, :
## some 'x' values beyond boundary knots may cause ill-conditioned bases
## Warning in bs(weight, degree = 3L, knots = c(`14.28571%` = 2102.28571428571, :
## some 'x' values beyond boundary knots may cause ill-conditioned bases
## Warning in bs(weight, degree = 3L, knots = c(`12.5%` = 2049.375, `25%` =
## 2227.5, : some 'x' values beyond boundary knots may cause ill-conditioned bases
## Warning in bs(weight, degree = 3L, knots = c(`12.5%` = 2051, `25%` = 2234, :
## some 'x' values beyond boundary knots may cause ill-conditioned bases
best_spline3 <- cbind(df=df,best_spline3)
# menampilkan hasil all breaks
best_spline3## df rmse mae
## 1 3 3074.796 2954.367
## 2 4 3074.795 2954.368
## 3 5 3074.793 2954.365
## 4 6 3074.798 2954.371
## 5 7 3074.797 2954.372
## 6 8 3074.800 2954.372
## 7 9 3074.798 2954.367
## 8 10 3074.787 2954.355
#berdasarkan rmse
best_spline3 %>% slice_min(rmse)## df rmse mae
## 1 10 3074.787 2954.355
#berdasarkan mae
best_spline3 %>% slice_min(mae)## df rmse mae
## 1 10 3074.787 2954.355
model_spline_df10 <- best_spline3 %>% slice_min(mae)regresi natural spline
set.seed(123)
cross_val <- vfold_cv(dataw,v=10,strata = "mpg")
df <- 3:10
best_spline3 <- map_dfr(df, function(i){
metric_spline3 <- map_dfr(cross_val$splits,
function(x){
mod <- lm(mpg ~ ns(weight,df=i),
data=dataw[x$in_id,])
pred <- predict(mod,
newdata=dataw[-x$in_id,])
truth <- dataw[-x$in_id,]$weight
rmse <- mlr3measures::rmse(truth = truth,
response = pred
)
mae <- mlr3measures::mae(truth = truth,
response = pred
)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
metric_spline3
# menghitung rata-rata untuk 10 folds
mean_metric_spline3 <- colMeans(metric_spline3)
mean_metric_spline3
}
)best_spline3 <- cbind(df=df,best_spline3)
# menampilkan hasil all breaks
best_spline3## df rmse mae
## 1 3 3074.798 2954.369
## 2 4 3074.795 2954.367
## 3 5 3074.793 2954.365
## 4 6 3074.797 2954.367
## 5 7 3074.796 2954.368
## 6 8 3074.789 2954.362
## 7 9 3074.794 2954.365
## 8 10 3074.786 2954.358
#berdasarkan rmse
best_spline3 %>% slice_min(rmse)## df rmse mae
## 1 10 3074.786 2954.358
#berdasarkan mae
best_spline3 %>% slice_min(mae)## df rmse mae
## 1 10 3074.786 2954.358
model_spline_df10n <- best_spline3 %>% slice_min(mae)perbandingan model
nilai_metric <- rbind(model_poly_df2,
model_tangga_k7,
model_spline_df10,
model_spline_df10n)
nama_model <- c("Regresi Polynomial (df=2)",
"Fungsi Tangga (k=6)",
"Regresi spline (df=10)",
"Regresi Natural Spline (df=10)"
)
data.frame(nama_model,nilai_metric)## nama_model df rmse mae
## 1 Regresi Polynomial (df=2) 2 3074.796402 2954.365281
## 2 Fungsi Tangga (k=6) 6 4.201264 3.112221
## 3 Regresi spline (df=10) 10 3074.787064 2954.354640
## 4 Regresi Natural Spline (df=10) 10 3074.785702 2954.357867
bentuk kurvanya sebagai berikut. polynom df=2
ggplot(dataw,aes(x=weight, y=mpg)) +
geom_point(alpha=0.55, color="black") +
stat_smooth(method = "lm",
formula = y~poly(x,2,raw=T),
lty = 1, col = "blue",se = F)+
theme_bw() Fungsi Tangga (k=6)
ggplot(dataw,aes(x=weight, y=mpg)) +
geom_point(alpha=0.55, color="black") +
stat_smooth(method = "lm",
formula = y~cut(x,6),
lty = 1, col = "blue",se = F)+
theme_bw()Regresi Spline (df=8)
ggplot(dataw,aes(x=weight, y=mpg)) +
geom_point(alpha=0.55, color="black")+
stat_smooth(method = "lm",
formula = y~bs(x,8),
lty = 1,se=F)Regresi Naturan Spline (df=10)
ggplot(dataw,aes(x=weight, y=mpg)) +
geom_point(alpha=0.55, color="black")+
stat_smooth(method = "lm",
formula = y~ns(x,10),
lty = 1,se=F)berdasarkan hasil analisis model dari fungsi tangga dengan nilai knots = 6 merupakan model terbaik karena memiliki nilai RMSE dan MAE terkecil dan bentuk dari kurvanya cenderung lebih mulus
C. Data MPG dan Displecement
#input data
library(ISLR)
data <- Auto
datad <- data %>% select(mpg, displacement)
head(datad)## mpg displacement
## 1 18 307
## 2 15 350
## 3 18 318
## 4 16 304
## 5 17 302
## 6 15 429
melihat sebaran data di scatter plot
ggplot(datad,aes(x=displacement, y=mpg)) +
geom_point(alpha=0.55, color="black") +
theme_bw()berdasarkan data terlihat hubungan antara x dan y tidak linear, maka akan dicoba menggunakan regresi polynomial, fungsi tangga, dan regresi spline.
regresi polynomial
set.seed(123)
cross_val <- vfold_cv(datad,v=10,strata = "mpg")
df <- 2:4
best_poly <- map_dfr(df, function(i){
metric_poly2 <- map_dfr(cross_val$splits,
function(x){
mod <- lm(mpg ~ poly(displacement,df = i),
data=datad[x$in_id,])
pred <- predict(mod,
newdata=datad[-x$in_id,])
truth <- datad[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth,
response = pred
)
mae <- mlr3measures::mae(truth = truth,
response = pred
)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
metric_poly2
# menghitung rata-rata untuk 10 folds
mean_metric_poly2 <- colMeans(metric_poly2)
mean_metric_poly2
}
)best_poly <- cbind(df=df,best_poly)
# menampilkan hasil all breaks
best_poly## df rmse mae
## 1 2 4.283164 3.159670
## 2 3 4.284465 3.165588
## 3 4 4.288491 3.164067
#berdasarkan rmse
best_poly %>% slice_min(rmse)## df rmse mae
## 1 2 4.283164 3.15967
#berdasarkan mae
best_poly %>% slice_min(mae)## df rmse mae
## 1 2 4.283164 3.15967
model_poly_df2 <- best_poly %>% slice_min(mae)fungsi tangga
# cross validation
set.seed(123)
cross_val <- vfold_cv(datad,v=10,strata = "mpg")
breaks <- 2:10
best_tangga <- map_dfr(breaks, function(i){
metric_tangga <- map_dfr(cross_val$splits,
function(x){
training <- datad[x$in_id,]
training$displacement <- cut(training$displacement,i)
mod <- lm(mpg ~ displacement,
data=training)
labs_x <- levels(mod$model[,2])
labs_x_breaks <- cbind(lower = as.numeric( sub("\\((.+),.*", "\\1", labs_x) ),
upper = as.numeric( sub("[^,]*,([^]]*)\\]", "\\1", labs_x) ))
testing <- datad[-x$in_id,]
displacement_new <- cut(testing$displacement,c(labs_x_breaks[1,1],labs_x_breaks[,2]))
pred <- predict(mod,
newdata=list(displacement=displacement_new))
truth <- testing$mpg
data_eval <- na.omit(data.frame(truth,pred))
rmse <- mlr3measures::rmse(truth = data_eval$truth,
response = data_eval$pred
)
mae <- mlr3measures::mae(truth = data_eval$truth,
response = data_eval$pred
)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
metric_tangga
# menghitung rata-rata untuk 10 folds
mean_metric_tangga <- colMeans(metric_tangga)
mean_metric_tangga
})
best_tangga <- cbind(df=breaks,best_tangga)
# menampilkan hasil all breaks
best_tangga## df rmse mae
## 1 2 5.915956 4.627152
## 2 3 4.865081 3.676866
## 3 4 4.813915 3.619919
## 4 5 4.661119 3.496662
## 5 6 4.561127 3.389387
## 6 7 4.552850 3.372940
## 7 8 4.363903 3.230513
## 8 9 4.218350 3.109206
## 9 10 4.194571 3.097485
#berdasarkan rmse
best_tangga %>% slice_min(rmse)## df rmse mae
## 1 10 4.194571 3.097485
#berdasarkan mae
best_tangga %>% slice_min(mae)## df rmse mae
## 1 10 4.194571 3.097485
model_tangga_k7 <- best_tangga %>% slice_min(mae)
model_tangga_k8 <- best_tangga %>% slice_min(rmse)regresi spline
set.seed(123)
cross_val <- vfold_cv(datad,v=10,strata = "mpg")
df <- 4:10
best_spline3 <- map_dfr(df, function(i){
metric_spline3 <- map_dfr(cross_val$splits,
function(x){
mod <- lm(mpg ~ bs(displacement,df=i),
data=datad[x$in_id,])
pred <- predict(mod,
newdata=datad[-x$in_id,])
truth <- datad[-x$in_id,]$displacement
rmse <- mlr3measures::rmse(truth = truth,
response = pred
)
mae <- mlr3measures::mae(truth = truth,
response = pred
)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
metric_spline3
# menghitung rata-rata untuk 10 folds
mean_metric_spline3 <- colMeans(metric_spline3)
mean_metric_spline3
}
)## Warning in bs(displacement, degree = 3L, knots = c(`50%` = 148.5),
## Boundary.knots = c(70, : some 'x' values beyond boundary knots may cause ill-
## conditioned bases
## Warning in bs(displacement, degree = 3L, knots = c(`33.33333%` = 119,
## `66.66667%` = 232: some 'x' values beyond boundary knots may cause ill-
## conditioned bases
## Warning in bs(displacement, degree = 3L, knots = c(`25%` = 104.75, `50%` =
## 148.5, : some 'x' values beyond boundary knots may cause ill-conditioned bases
## Warning in bs(displacement, degree = 3L, knots = c(`20%` = 98, `40%` = 121.4, :
## some 'x' values beyond boundary knots may cause ill-conditioned bases
## Warning in bs(displacement, degree = 3L, knots = c(`16.66667%` = 97, `33.33333%`
## = 119, : some 'x' values beyond boundary knots may cause ill-conditioned bases
## Warning in bs(displacement, degree = 3L, knots = c(`14.28571%` = 97, `28.57143%`
## = 108.571428571429, : some 'x' values beyond boundary knots may cause ill-
## conditioned bases
## Warning in bs(displacement, degree = 3L, knots = c(`12.5%` = 91, `25%` =
## 104.75, : some 'x' values beyond boundary knots may cause ill-conditioned bases
best_spline3 <- cbind(df=df,best_spline3)
# menampilkan hasil all breaks
best_spline3## df rmse mae
## 1 4 203.4562 170.8607
## 2 5 203.4477 170.8566
## 3 6 203.4635 170.8661
## 4 7 203.4620 170.8728
## 5 8 203.4630 170.8735
## 6 9 203.4628 170.8710
## 7 10 203.4672 170.8799
#berdasarkan rmse
best_spline3 %>% slice_min(rmse)## df rmse mae
## 1 5 203.4477 170.8566
#berdasarkan mae
best_spline3 %>% slice_min(mae)## df rmse mae
## 1 5 203.4477 170.8566
model_spline_df10 <- best_spline3 %>% slice_min(mae)regresi natural spline
set.seed(123)
cross_val <- vfold_cv(datad,v=10,strata = "mpg")
df <- 3:10
best_spline3 <- map_dfr(df, function(i){
metric_spline3 <- map_dfr(cross_val$splits,
function(x){
mod <- lm(mpg ~ ns(displacement,df=i),
data=datad[x$in_id,])
pred <- predict(mod,
newdata=datad[-x$in_id,])
truth <- datad[-x$in_id,]$displacement
rmse <- mlr3measures::rmse(truth = truth,
response = pred
)
mae <- mlr3measures::mae(truth = truth,
response = pred
)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
metric_spline3
# menghitung rata-rata untuk 10 folds
mean_metric_spline3 <- colMeans(metric_spline3)
mean_metric_spline3
}
)best_spline3 <- cbind(df=df,best_spline3)
# menampilkan hasil all breaks
best_spline3## df rmse mae
## 1 3 203.4609 170.8690
## 2 4 203.4506 170.8558
## 3 5 203.4583 170.8610
## 4 6 203.4630 170.8632
## 5 7 203.4539 170.8633
## 6 8 203.4566 170.8635
## 7 9 203.4734 170.8807
## 8 10 203.4635 170.8657
#berdasarkan rmse
best_spline3 %>% slice_min(rmse)## df rmse mae
## 1 4 203.4506 170.8558
#berdasarkan mae
best_spline3 %>% slice_min(mae)## df rmse mae
## 1 4 203.4506 170.8558
model_spline_df10n <- best_spline3 %>% slice_min(mae)perbandingan model
nilai_metric <- rbind(model_poly_df2,
model_tangga_k7,
model_spline_df10,
model_spline_df10n)
nama_model <- c("Regresi Polynomial (df=2)",
"Fungsi Tangga (k=10)",
"Regresi spline (df=5)",
"Regresi Natural Spline (df=4)"
)
data.frame(nama_model,nilai_metric)## nama_model df rmse mae
## 1 Regresi Polynomial (df=2) 2 4.283164 3.159670
## 2 Fungsi Tangga (k=10) 10 4.194571 3.097485
## 3 Regresi spline (df=5) 5 203.447721 170.856585
## 4 Regresi Natural Spline (df=4) 4 203.450620 170.855830
berikut bentuk kurva model terbaik untuk masing - masing metode . polynom df=2
ggplot(datad,aes(x=displacement, y=mpg)) +
geom_point(alpha=0.55, color="black") +
stat_smooth(method = "lm",
formula = y~poly(x,2,raw=T),
lty = 1, col = "blue",se = F)+
theme_bw()Fungsi Tangga (k=10)
ggplot(datad,aes(x=displacement, y=mpg)) +
geom_point(alpha=0.55, color="black") +
stat_smooth(method = "lm",
formula = y~cut(x,10),
lty = 1, col = "blue",se = F)+
theme_bw()Regresi Spline (df=5)
ggplot(datad,aes(x=displacement, y=mpg)) +
geom_point(alpha=0.55, color="black")+
stat_smooth(method = "lm",
formula = y~bs(x,5),
lty = 1,se=F)Regresi Natural Spline (df=4)
ggplot(datad,aes(x=displacement, y=mpg)) +
geom_point(alpha=0.55, color="black")+
stat_smooth(method = "lm",
formula = y~ns(x,4),
lty = 1,se=F)berdasarkan hasil analisis model, model dari regresi polinomial dengan df=2 merupakan model terbaik karena memiliki nilai RMSE dan MAE yang kecil dan bentuk dari kurvanya cenderung lebih mulus.