library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.0.5
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.4 v dplyr 1.0.7
## v tidyr 1.1.3 v stringr 1.4.0
## v readr 1.4.0 v forcats 0.5.1
## Warning: package 'ggplot2' was built under R version 4.0.5
## Warning: package 'tibble' was built under R version 4.0.5
## Warning: package 'tidyr' was built under R version 4.0.5
## Warning: package 'readr' was built under R version 4.0.3
## Warning: package 'dplyr' was built under R version 4.0.5
## Warning: package 'stringr' was built under R version 4.0.5
## Warning: package 'forcats' was built under R version 4.0.5
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(ggplot2)
library(cowplot)
## Warning: package 'cowplot' was built under R version 4.0.5
library(splines)
library(rsample)
## Warning: package 'rsample' was built under R version 4.0.5
library(mlr3measures)
## Warning: package 'mlr3measures' was built under R version 4.0.5
## In order to avoid name clashes, do not attach 'mlr3measures'. Instead, only load the namespace with `requireNamespace("mlrmeasures")` and access the measures directly via `::`, e.g. `mlr3measures::auc()`.
library(glmnet)
## Warning: package 'glmnet' was built under R version 4.0.5
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
## Loaded glmnet 4.1-2
library(gridExtra)
## Warning: package 'gridExtra' was built under R version 4.0.3
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
library(caret)
## Warning: package 'caret' was built under R version 4.0.3
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following objects are masked from 'package:mlr3measures':
##
## precision, recall, sensitivity, specificity
## The following object is masked from 'package:purrr':
##
## lift
library(boot)
## Warning: package 'boot' was built under R version 4.0.3
##
## Attaching package: 'boot'
## The following object is masked from 'package:lattice':
##
## melanoma
Dataset yang akan digunakan yaitu data Auto yang berasal dari package ISLR
library(ISLR)
## Warning: package 'ISLR' was built under R version 4.0.5
attach(Auto)
## The following object is masked from package:ggplot2:
##
## mpg
head(Auto)
## mpg cylinders displacement horsepower weight acceleration year origin
## 1 18 8 307 130 3504 12.0 70 1
## 2 15 8 350 165 3693 11.5 70 1
## 3 18 8 318 150 3436 11.0 70 1
## 4 16 8 304 150 3433 12.0 70 1
## 5 17 8 302 140 3449 10.5 70 1
## 6 15 8 429 198 4341 10.0 70 1
## name
## 1 chevrolet chevelle malibu
## 2 buick skylark 320
## 3 plymouth satellite
## 4 amc rebel sst
## 5 ford torino
## 6 ford galaxie 500
Soal
Gunakan model-model non-linier yang sudah dipelajari pada data Auto dengan peubah respon mpg dan pilih tiga kolom untuk dijadikan peubah prediktor.
- Apakah ada bukti untuk hubungan non-linier untuk peubah-peubah yang anda pilih? Buat visualisasi untuk Mendukung jawaban Anda.
- Tentukan model non-linear terbaik untuk masing pasangan peubah (Secara visual dan/atau secara empiris)
Jawaban
Data
Dataset yang akan digunakan yaitu data Auto yang berasal dari package ISLR. Beberapa variabel data yang terdapat di data Auto yaitu:
mpg: miles per galloncylinders: Number of cylinders between 4 and 8displacement: Engine displacement (cu. inches)horsepower: Engine horsepowerweight: Vehicle weight (lbs.)acceleration: Time to accelerate from 0 to 60 mph (sec.)year: Model year (modulo 100)origin: Origin of car (1. American, 2. European, 3. Japanese)name: Vehicle name
Data terdiri dari 392 observasi
Tujuan
Melakukan cross validation untuk menghasilkan pemodelan antara tingkat efisiensi penggunaan bahan bakar (direpresentasikan olehmpg atau miles per gallon) optimal berdasarkan data
acceleration,displacement, danhorsepowerdaripackage ISLRdataAutodengan tiga metode:- Polynomial regression
- Piecewise constant
- Natural cubic splines
Pemodelan dilakukan untuk masing-masing variabel penjelasnya
Output
Mendapatkan model non-linier yang optimal untuk masing-masing variabel penjelasnya
mpg vs newdat
Hubungan non-linier untuk peubah-peubah dapat dilihat dari grafiknya dimana jika grafik antara satu peubah dengan peubah lainnya membentuk garis lengkung maka hubungannya non-linear. Untuk yang pertama akan dilakukan pemodelan mpg vs newdat dimana newdat adalah penjumlahan dari tiga kolom variabel prediktornya yaitu acceleration+displacement+horsepower.
Auto$newdat <- Auto$acceleration + Auto$horsepower + Auto$displacement
tibble(Auto)
## # A tibble: 392 x 10
## mpg cylinders displacement horsepower weight acceleration year origin
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 18 8 307 130 3504 12 70 1
## 2 15 8 350 165 3693 11.5 70 1
## 3 18 8 318 150 3436 11 70 1
## 4 16 8 304 150 3433 12 70 1
## 5 17 8 302 140 3449 10.5 70 1
## 6 15 8 429 198 4341 10 70 1
## 7 14 8 454 220 4354 9 70 1
## 8 14 8 440 215 4312 8.5 70 1
## 9 14 8 455 225 4425 10 70 1
## 10 15 8 390 190 3850 8.5 70 1
## # ... with 382 more rows, and 2 more variables: name <fct>, newdat <dbl>
Viasualisasi dari datanya
datanewdat <- ggplot(Auto,aes(x=newdat, y=mpg)) +
geom_point(alpha=0.55, color="coral") +
ggtitle("scatterplot mpg vs acceleration+horsepower+displacement")+
xlab("acceleration+horsepower+displacement") + ylab("Miles per Gallon") +
theme_bw()
datanewdat
Regresi Polinomial
Dari data Auto yang akan dimodelkan dimana variabel responnya yaitu mpg(bahan bakar) dan variabel prediktornya yaitu jumlah dari acceleration, displacement, dan horsepower. Karena hasil dari pemodelan regresi polinomial bergantung dengan derajat polinomialnya. Jika \(d\) yang digunakan semakin besar dan kurvanya menjadi liar maka untuk menjawabnyaini akan dicari terlebih dahulu derajat terbaiknya.
set.seed(1)
cross_val <- vfold_cv(Auto, v=5, strata = "mpg")
derajat <- 1:5
polynom <- map_dfr(derajat, function(i){
metric_polynom <- map_dfr(cross_val$splits, function(x){
mod <- lm(mpg ~ poly(newdat, derajat=i, raw = T), data=Auto[x$in_id,])
pred <- predict(mod, newdata=Auto[-x$in_id,])
truth <- Auto[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth, response = pred)
mae <- mlr3measures::mae(truth = truth, response = pred)
metric <- c(rmse, mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
metric_polynom
}
)
polynom <- cbind(derajat = derajat, polynom)
tibble(polynom)
## # A tibble: 25 x 3
## derajat rmse mae
## <int> <dbl> <dbl>
## 1 1 4.75 3.72
## 2 2 4.97 3.89
## 3 3 4.40 3.35
## 4 4 4.01 3.11
## 5 5 4.41 3.24
## 6 1 4.13 3.15
## 7 2 4.51 3.41
## 8 3 3.80 2.77
## 9 4 3.42 2.57
## 10 5 4.40 3.09
## # ... with 15 more rows
#berdasarkan rmse
polynom %>% slice_min(rmse)
## derajat rmse mae
## 1 4 3.41539 2.566989
#berdasarkan mae
polynom %>% slice_min(mae)
## derajat rmse mae
## 1 4 3.41539 2.566989
#menghitung rata-rata untuk 10 folds
mean_polynom <- colMeans(polynom)
mean_polynom
## derajat rmse mae
## 3.000000 4.128459 3.078510
polynom$derajat <- as.factor(polynom$derajat)
polyrmse<-ggplot(data=polynom, aes(x=derajat, y=rmse, group=1)) +
geom_line(linetype = "dashed")+
geom_point()+
ggtitle("K-Fold Cross Validation") +
xlab("Degree of Polynomial") +
ylab("Root Mean Square Error")
polynom$derajat <- as.factor(polynom$derajat)
polymae<-ggplot(data=polynom, aes(x=derajat, y=mae, group=1)) +
geom_line(linetype = "dashed")+
geom_point()+
ggtitle("K-Fold Cross Validation") +
xlab("Degree of Polynomial") +
ylab("Mean Absolute Error")
plot_grid(polyrmse,polymae, ncol = 2)
Dari nilai RMSE dan MAE yang didapat bahwa derajat terbaik yang diperoleh yaitu 4.
polinomial4 <- ggplot(Auto, aes(x=newdat, y=mpg)) +
geom_point(alpha=0.55, color="navy") +
stat_smooth(method = "lm", formula = y~poly(x, 4), lty = 1, col = "blue",se = F) +
ggtitle("scatterplot Regresi polinomial d=4")+
xlab("acceleration+horsepower+displacement") + ylab("Miles per Gallon") +
theme_bw()
polinomial4
mod_polinomial4 = lm(mpg ~ poly(newdat,4),data=Auto)
summary(mod_polinomial4)
##
## Call:
## lm(formula = mpg ~ poly(newdat, 4), data = Auto)
##
## Residuals:
## Min 1Q Median 3Q Max
## -14.2652 -2.2384 -0.4524 1.9580 19.0812
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 23.4459 0.2032 115.379 < 2e-16 ***
## poly(newdat, 4)1 -125.8884 4.0233 -31.290 < 2e-16 ***
## poly(newdat, 4)2 39.4562 4.0233 9.807 < 2e-16 ***
## poly(newdat, 4)3 -10.4207 4.0233 -2.590 0.00996 **
## poly(newdat, 4)4 6.4330 4.0233 1.599 0.11065
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.023 on 387 degrees of freedom
## Multiple R-squared: 0.737, Adjusted R-squared: 0.7343
## F-statistic: 271.1 on 4 and 387 DF, p-value: < 2.2e-16
Piecewise constant
set.seed(1)
cross_val <- vfold_cv(Auto,v=5,strata = "mpg")
nbreak <- 2:6 #atau knotsnya 1-5
best_tangga <- map_dfr(nbreak, function(i){
metric_tangga <- map_dfr(cross_val$splits, function(x){
training <- Auto[x$in_id,]
training$newdat <- cut(training$newdat,i)
mod <- lm(mpg ~ newdat, data=training)
labs_x <- levels(mod$model[,2])
labs_x_breaks <- cbind(lower = as.numeric( sub("\\((.+),.*", "\\1", labs_x) ),
upper = as.numeric( sub("[^,]*,([^]]*)\\]", "\\1", labs_x)))
testing <- Auto[-x$in_id,]
newdat_new <- cut(testing$newdat,c(labs_x_breaks[1,1],labs_x_breaks[,2]))
pred <- predict(mod, newdata=list(newdat=newdat_new))
truth <- testing$mpg
data_eval <- na.omit(data.frame(truth,pred))
rmse <- mlr3measures::rmse(truth = data_eval$truth,response = data_eval$pred)
mae <- mlr3measures::mae(truth = data_eval$truth,response = data_eval$pred)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
metric_tangga
# menghitung rata-rata untuk 10 folds
mean_metric_tangga <- colMeans(metric_tangga)
mean_metric_tangga
}
)
best_tangga <- cbind(breaks=nbreak,best_tangga)
# menampilkan hasil all breaks
best_tangga
## breaks rmse mae
## 1 2 5.945472 4.637213
## 2 3 5.006265 3.729575
## 3 4 4.743063 3.554744
## 4 5 4.690053 3.504499
## 5 6 4.443978 3.277245
#berdasarkan rmse
best_tangga %>% slice_min(rmse)
## breaks rmse mae
## 1 6 4.443978 3.277245
#berdasarkan mae
best_tangga %>% slice_min(mae)
## breaks rmse mae
## 1 6 4.443978 3.277245
best_tangga$breaks <- as.factor(best_tangga$breaks)
tanggarmse <- ggplot(data=best_tangga, aes(x=breaks, y=rmse, group=1)) +
geom_line(linetype = "dashed")+
geom_point()+
ggtitle("K-Fold Cross Validation") +
xlab("Breaks") +
ylab("Root Mean Square Error")
best_tangga$breaks <- as.factor(best_tangga$breaks)
tanggamae<-ggplot(data=best_tangga, aes(x=breaks, y=mae, group=1)) +
geom_line(linetype = "dashed")+
geom_point()+
ggtitle("K-Fold Cross Validation") +
xlab("Breaks") +
ylab("Mean Absolute Error")
plot_grid(tanggarmse,tanggamae, ncol = 2)
Dari nilai RMSE dan MAE diperoleh bahwa breaks terbaik yang diperoleh yaitu 6 atau memiliki 5 knots.
Piecewise6 <- ggplot(Auto,aes(x=newdat, y=mpg)) +
geom_point(alpha=0.55, color="navy") +
stat_smooth(method = "lm",
formula = y~cut(x,6),
lty = 1, col = "blue",se = F)+
ggtitle("scatterplot Piecewise constant (Knots=5)")+
xlab("acceleration+horsepower+displacement") + ylab("Miles per Gallon") +
theme_bw()
Piecewise6
mod_tangga6 = lm(mpg ~ cut(newdat,6),data=Auto)
summary(mod_tangga6) #knots = 5
##
## Call:
## lm(formula = mpg ~ cut(newdat, 6), data = Auto)
##
## Residuals:
## Min 1Q Median 3Q Max
## -12.5599 -2.5599 -0.2405 2.0062 18.9917
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 30.5599 0.3633 84.11 <2e-16 ***
## cut(newdat, 6)(229,321] -6.2288 0.5959 -10.45 <2e-16 ***
## cut(newdat, 6)(321,414] -11.5515 0.6749 -17.12 <2e-16 ***
## cut(newdat, 6)(414,506] -14.4099 0.7708 -18.70 <2e-16 ***
## cut(newdat, 6)(506,599] -16.9371 0.7570 -22.37 <2e-16 ***
## cut(newdat, 6)(599,692] -16.9349 1.3226 -12.80 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.405 on 386 degrees of freedom
## Multiple R-squared: 0.6855, Adjusted R-squared: 0.6814
## F-statistic: 168.3 on 5 and 386 DF, p-value: < 2.2e-16
Natural Cubic Spline
set.seed(1)
cross_val <- vfold_cv(Auto,v=5,strata = "mpg")
df <- 2:8 #knots 1-7
best_spline <- map_dfr(df, function(i){
metric_spline <- map_dfr(cross_val$splits, function(x){
mod <- lm(mpg ~ ns(newdat,df=i), data=Auto[x$in_id,])
pred <- predict(mod, newdata=Auto[-x$in_id,])
truth <- Auto[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth, response = pred)
mae <- mlr3measures::mae(truth = truth, response = pred)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
metric_spline
}
)
best_spline <- cbind(df=df, best_spline)
tibble(best_spline)
## # A tibble: 35 x 3
## df rmse mae
## <int> <dbl> <dbl>
## 1 2 4.09 3.12
## 2 3 4.48 3.36
## 3 4 3.78 2.77
## 4 5 3.39 2.53
## 5 6 4.41 3.09
## 6 7 4.04 3.11
## 7 8 4.46 3.36
## 8 2 3.74 2.77
## 9 3 3.45 2.59
## 10 4 4.39 3.07
## # ... with 25 more rows
#berdasarkan rmse
best_spline %>% slice_min(rmse)
## df rmse mae
## 1 5 3.386443 2.530711
#berdasarkan mae
best_spline %>% slice_min(mae)
## df rmse mae
## 1 5 3.386443 2.530711
best_spline$df <- as.factor(best_spline$df)
natrmse<-ggplot(data=best_spline, aes(x=df, y=rmse, group=1)) +
geom_line(linetype = "dashed")+
geom_point()+
ggtitle("K-Fold Cross Validation") +
xlab("DF") + ylab("Root Mean Square Error")
natmae<-ggplot(data=best_spline, aes(x=df, y=mae, group=1)) +
geom_line(linetype = "dashed")+
geom_point()+
ggtitle("K-Fold Cross Validation") +
xlab("DF") + ylab("Mean Absolute Error")
plot_grid(natrmse,natmae)
Dari nilai RMSE dan MAE yang terkecil diperoleh bahwa untuk model Natural Cubic Spline meiliki df 5.
dim(ns(Auto$newdat, df=5))
## [1] 392 5
#nilai knots yang ditentukan oleh komputer
attr(ns(Auto$newdat, df=5),"knots")
## 20% 40% 60% 80%
## 188.46 232.00 330.86 465.50
natcubspline5 <- ggplot(Auto,aes(x=newdat, y=mpg)) +
geom_point(alpha=0.55, color="coral")+
stat_smooth(method = "lm",
formula = y~ns(x, knots = c(188.46, 232.00, 330.86, 465.50)),
lty = 1,se=F)+
ggtitle("scatterplot Natural Cubic Spline (df=5)")+
xlab("acceleration+horsepower+displacement") + ylab("Miles per Gallon") +
theme_bw()
natcubspline5
mod_spline5 = lm(mpg ~ ns(newdat, knots = c(72, 88, 100, 140)),data=Auto)
summary(mod_spline5)
##
## Call:
## lm(formula = mpg ~ ns(newdat, knots = c(72, 88, 100, 140)), data = Auto)
##
## Residuals:
## Min 1Q Median 3Q Max
## -14.0190 -2.3493 -0.4644 2.1332 19.4469
##
## Coefficients: (3 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 13.6230 0.6853 19.878 <2e-16
## ns(newdat, knots = c(72, 88, 100, 140))1 -89.3544 268.3772 -0.333 0.739
## ns(newdat, knots = c(72, 88, 100, 140))2 134.9217 270.5723 0.499 0.618
## ns(newdat, knots = c(72, 88, 100, 140))3 NA NA NA NA
## ns(newdat, knots = c(72, 88, 100, 140))4 NA NA NA NA
## ns(newdat, knots = c(72, 88, 100, 140))5 NA NA NA NA
##
## (Intercept) ***
## ns(newdat, knots = c(72, 88, 100, 140))1
## ns(newdat, knots = c(72, 88, 100, 140))2
## ns(newdat, knots = c(72, 88, 100, 140))3
## ns(newdat, knots = c(72, 88, 100, 140))4
## ns(newdat, knots = c(72, 88, 100, 140))5
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.03 on 389 degrees of freedom
## Multiple R-squared: 0.7347, Adjusted R-squared: 0.7333
## F-statistic: 538.6 on 2 and 389 DF, p-value: < 2.2e-16
Perbandingan ketiga model
nilai_rmse <- rbind(3.41539,
4.443978,
3.386443
)
nilai_mae <- rbind(2.566989,
3.277245,
2.530711
)
nama_model <- c("Regresi polinomial (d=4)",
"Piecewise constant (breaks=6)",
"Natural cubic spline(df=5)"
)
tibble(hasil <- data.frame(nama_model, nilai_rmse, nilai_mae))
## # A tibble: 3 x 3
## nama_model nilai_rmse nilai_mae
## <chr> <dbl> <dbl>
## 1 Regresi polinomial (d=4) 3.42 2.57
## 2 Piecewise constant (breaks=6) 4.44 3.28
## 3 Natural cubic spline(df=5) 3.39 2.53
Visualisasi ketiga model:
datanewdat <- ggplot(Auto,aes(x=newdat, y=mpg)) +
geom_point(alpha=0.55, color="coral") +
ggtitle("scatterplot mpg vs acceleration+horsepower+displacement")+
xlab("acceleration+horsepower+displacement") + ylab("Miles per Gallon") +
theme_bw()
polinomial4 <- ggplot(Auto, aes(x=newdat, y=mpg)) +
geom_point(alpha=0.55, color="navy") +
stat_smooth(method = "lm", formula = y~poly(x, 4), lty = 1, col = "blue",se = F) +
ggtitle("scatterplot Regresi polinomial d=4")+
xlab("acceleration+horsepower+displacement") + ylab("Miles per Gallon") +
theme_bw()
Piecewise6 <- ggplot(Auto,aes(x=newdat, y=mpg)) +
geom_point(alpha=0.55, color="navy") +
stat_smooth(method = "lm",
formula = y~cut(x,6),
lty = 1, col = "blue",se = F)+
ggtitle("scatterplot Piecewise constant (Knots=5)")+
xlab("acceleration+horsepower+displacement") + ylab("Miles per Gallon") +
theme_bw()
natcubspline5 <- ggplot(Auto,aes(x=newdat, y=mpg)) +
geom_point(alpha=0.55, color="coral")+
stat_smooth(method = "lm",
formula = y~ns(x, knots = c(188.46, 232.00, 330.86, 465.50)),
lty = 1,se=F)+
ggtitle("scatterplot Natural Cubic Spline (df=5)")+
xlab("acceleration+horsepower+displacement") + ylab("Miles per Gallon") +
theme_bw()
plot_grid(datanewdat, polinomial4, Piecewise6, natcubspline5)
Dari nilai
RMSE dan MAE yang terkecil dan dari visualisasi dari ketiga model diketahui bahwa model terbaik untuk mpg vs acceleration+horsepower+displacement adalah Natural Cubic Spline dengan df=5.
mpg vs acceleration
Hubungan non-linier untuk peubah kedua juga dilihat dari grafiknya dimana jika grafik antara satu peubah dengan peubah lainnya membentuk garis lengkung maka hubungannya non-linear. Data untuk variabel prediksi yang digunakan adalah data acceleration.
Visualisasi data
dataacceleration <- ggplot(Auto,aes(x=acceleration, y=mpg)) +
geom_point(alpha=0.55, color="coral") +
ggtitle("scatterplot mpg vs acceleration")+
xlab("acceleration") + ylab("Miles per Gallon") +
theme_bw()
dataacceleration
Regresi Polinomial
Dari data Auto yang akan dimodelkan dimana variabel responnya yaitu mpg(bahan bakar) dan variabel prediktornya yaitu acceleration. Karena hasil dari pemodelan regresi polinomial bergantung dengan derajat polinomialnya. Jika \(d\) yang digunakan semakin besar dan kurvanya menjadi liar maka untuk menjawab bagian a ini akan dicari terlebih dahulu derajat terbaiknya.
set.seed(1)
cross_val <- vfold_cv(Auto, v=5, strata = "mpg")
derajat <- 1:5
polynomac <- map_dfr(derajat, function(i){
metric_polynomac <- map_dfr(cross_val$splits, function(x){
mod <- lm(mpg ~ poly(acceleration, derajat=i, raw = T), data=Auto[x$in_id,])
pred <- predict(mod, newdata=Auto[-x$in_id,])
truth <- Auto[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth, response = pred)
mae <- mlr3measures::mae(truth = truth, response = pred)
metric <- c(rmse, mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
metric_polynomac
}
)
polynomac <- cbind(derajat = derajat, polynomac)
tibble(polynomac)
## # A tibble: 25 x 3
## derajat rmse mae
## <int> <dbl> <dbl>
## 1 1 6.38 5.42
## 2 2 7.60 6.06
## 3 3 7.64 6.24
## 4 4 7.02 5.87
## 5 5 6.73 5.51
## 6 1 6.71 5.46
## 7 2 7.57 6.00
## 8 3 7.56 6.20
## 9 4 6.86 5.69
## 10 5 6.62 5.43
## # ... with 15 more rows
#berdasarkan rmse
polynomac %>% slice_min(rmse)
## derajat rmse mae
## 1 1 6.378561 5.423425
#berdasarkan mae
polynomac %>% slice_min(mae)
## derajat rmse mae
## 1 1 6.508779 5.225846
#menghitung rata-rata untuk 10 folds
mean_polynomac <- colMeans(polynomac)
mean_polynomac
## derajat rmse mae
## 3.000000 7.069045 5.757471
polynomac$derajat <- as.factor(polynomac$derajat)
polyacrmse<-ggplot(data=polynomac, aes(x=derajat, y=rmse, group=1)) +
geom_line(linetype = "dashed")+
geom_point()+
ggtitle("K-Fold Cross Validation") +
xlab("Degree of Polynomial") +
ylab("Root Mean Square Error")
polynomac$derajat <- as.factor(polynomac$derajat)
polyacmae<-ggplot(data=polynomac, aes(x=derajat, y=mae, group=1)) +
geom_line(linetype = "dashed")+
geom_point()+
ggtitle("K-Fold Cross Validation") +
xlab("Degree of Polynomial") +
ylab("Mean Absolute Error")
plot_grid(polyacrmse,polyacmae, ncol = 2)
Berdasarkan nilai RMSE dan MAE terkecil dan grafik hasil k-fold crossppvalidation diperoleh nilai derajatnya yaitu 1.
polinomialac1 <- ggplot(Auto, aes(x=acceleration, y=mpg)) +
geom_point(alpha=0.55, color="navy") +
stat_smooth(method = "lm", formula = y~poly(x, 1), lty = 1, col = "blue",se = F) +
ggtitle("scatterplot Regresi polinomial d=1")+
xlab("acceleration") + ylab("Miles per Gallon") +
theme_bw()
polinomialac1
mod_polinomialac1 = lm(mpg ~ poly(acceleration,1),data=Auto)
summary(mod_polinomialac1)
##
## Call:
## lm(formula = mpg ~ poly(acceleration, 1), data = Auto)
##
## Residuals:
## Min 1Q Median 3Q Max
## -17.989 -5.616 -1.199 4.801 23.239
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 23.4459 0.3576 65.564 <2e-16 ***
## poly(acceleration, 1) 65.3340 7.0802 9.228 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.08 on 390 degrees of freedom
## Multiple R-squared: 0.1792, Adjusted R-squared: 0.1771
## F-statistic: 85.15 on 1 and 390 DF, p-value: < 2.2e-16
Piecewise constant
set.seed(1)
cross_val <- vfold_cv(Auto,v=5,strata = "mpg")
nbreak <- 2:6 #atau knotsnya 1-5
best_tanggaac <- map_dfr(nbreak, function(i){
metric_tanggaac <- map_dfr(cross_val$splits, function(x){
training <- Auto[x$in_id,]
training$acceleration <- cut(training$acceleration,i)
mod <- lm(mpg ~ acceleration, data=training)
labs_x <- levels(mod$model[,2])
labs_x_breaks <- cbind(lower = as.numeric( sub("\\((.+),.*", "\\1", labs_x) ),
upper = as.numeric( sub("[^,]*,([^]]*)\\]", "\\1", labs_x)))
testing <- Auto[-x$in_id,]
acceleration_new <- cut(testing$acceleration,c(labs_x_breaks[1,1],labs_x_breaks[,2]))
pred <- predict(mod, newdata=list(acceleration=acceleration_new))
truth <- testing$mpg
data_eval <- na.omit(data.frame(truth,pred))
rmse <- mlr3measures::rmse(truth = data_eval$truth,response = data_eval$pred)
mae <- mlr3measures::mae(truth = data_eval$truth,response = data_eval$pred)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
metric_tanggaac
# menghitung rata-rata untuk 10 folds
mean_metric_tanggaac <- colMeans(metric_tanggaac)
mean_metric_tanggaac
}
)
best_tanggaac <- cbind(breaks=nbreak,best_tanggaac)
# menampilkan hasil all breaks
best_tanggaac
## breaks rmse mae
## 1 2 7.524261 6.324496
## 2 3 7.002617 5.641529
## 3 4 7.251226 5.863229
## 4 5 7.278926 5.986462
## 5 6 6.999822 5.636786
#berdasarkan rmse
best_tanggaac %>% slice_min(rmse)
## breaks rmse mae
## 1 6 6.999822 5.636786
#berdasarkan mae
best_tanggaac %>% slice_min(mae)
## breaks rmse mae
## 1 6 6.999822 5.636786
best_tanggaac$breaks <- as.factor(best_tanggaac$breaks)
tanggaacrmse <- ggplot(data=best_tanggaac, aes(x=breaks, y=rmse, group=1)) +
geom_line(linetype = "dashed")+
geom_point()+
ggtitle("K-Fold Cross Validation") +
xlab("Breaks") +
ylab("Root Mean Square Error")
best_tanggaac$breaks <- as.factor(best_tanggaac$breaks)
tanggaacmae<-ggplot(data=best_tanggaac, aes(x=breaks, y=mae, group=1)) +
geom_line(linetype = "dashed")+
geom_point()+
ggtitle("K-Fold Cross Validation") +
xlab("Breaks") +
ylab("Mean Absolute Error")
plot_grid(tanggaacrmse,tanggaacmae, ncol = 2)
Berdasarkan nilai RMSE dan MAE terkecil diperoleh nilai breaks terbaik yaitu 6.
Visualisasi
Piecewiseac6 <- ggplot(Auto,aes(x=acceleration, y=mpg)) +
geom_point(alpha=0.55, color="red") +
stat_smooth(method = "lm",
formula = y~cut(x,6),
lty = 1, col = "blue",se = F)+
ggtitle("scatterplot Piecewise constant (Knots=5)")+
xlab("acceleration") + ylab("Miles per Gallon") +
theme_bw()
Piecewiseac6
Hasil pemodelan:
mod_tanggaac6 = lm(mpg ~ cut(acceleration,6),data=Auto)
summary(mod_tanggaac6) #knots = 5
##
## Call:
## lm(formula = mpg ~ cut(acceleration, 6), data = Auto)
##
## Residuals:
## Min 1Q Median 3Q Max
## -16.6725 -5.0975 -0.6837 4.3191 20.9275
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 14.818 2.100 7.055 8.02e-12 ***
## cut(acceleration, 6)(10.8,13.6] 2.932 2.232 1.314 0.19
## cut(acceleration, 6)(13.6,16.4] 9.866 2.171 4.544 7.40e-06 ***
## cut(acceleration, 6)(16.4,19.2] 10.854 2.211 4.910 1.35e-06 ***
## cut(acceleration, 6)(19.2,22] 12.441 2.492 4.993 9.01e-07 ***
## cut(acceleration, 6)(22,24.8] 15.896 3.368 4.720 3.31e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.966 on 386 degrees of freedom
## Multiple R-squared: 0.2137, Adjusted R-squared: 0.2035
## F-statistic: 20.98 on 5 and 386 DF, p-value: < 2.2e-16
Natural Cubic Spline
set.seed(1)
cross_val <- vfold_cv(Auto,v=5,strata = "mpg")
df <- 2:5 #knots 1-4
best_splineac <- map_dfr(df, function(i){
metric_splineac <- map_dfr(cross_val$splits, function(x){
mod <- lm(mpg ~ ns(acceleration,df=i), data=Auto[x$in_id,])
pred <- predict(mod, newdata=Auto[-x$in_id,])
truth <- Auto[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth, response = pred)
mae <- mlr3measures::mae(truth = truth, response = pred)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
metric_splineac
}
)
best_splineac <- cbind(df=df, best_splineac)
tibble(best_splineac)
## # A tibble: 20 x 3
## df rmse mae
## <int> <dbl> <dbl>
## 1 2 6.65 5.41
## 2 3 7.56 5.98
## 3 4 7.54 6.19
## 4 5 6.84 5.67
## 5 2 6.60 5.42
## 6 3 6.72 5.41
## 7 4 7.56 5.97
## 8 5 7.54 6.22
## 9 2 6.86 5.73
## 10 3 6.66 5.50
## 11 4 6.47 5.18
## 12 5 7.66 5.89
## 13 2 7.44 6.14
## 14 3 6.88 5.77
## 15 4 6.55 5.35
## 16 5 6.70 5.26
## 17 2 7.66 5.82
## 18 3 7.46 6.11
## 19 4 6.88 5.77
## 20 5 6.49 5.28
#berdasarkan rmse
best_splineac %>% slice_min(rmse)
## df rmse mae
## 1 4 6.474905 5.183302
#berdasarkan mae
best_splineac %>% slice_min(mae)
## df rmse mae
## 1 4 6.474905 5.183302
best_splineac$df <- as.factor(best_splineac$df)
natacrmse<-ggplot(data=best_splineac, aes(x=df, y=rmse, group=1)) +
geom_line(linetype = "dashed")+
geom_point()+
ggtitle("K-Fold Cross Validation") +
xlab("DF") + ylab("Root Mean Square Error")
natacmae<-ggplot(data=best_splineac, aes(x=df, y=mae, group=1)) +
geom_line(linetype = "dashed")+
geom_point()+
ggtitle("K-Fold Cross Validation") +
xlab("DF") + ylab("Mean Absolute Error")
plot_grid(natacrmse,natacmae)
Berdasarkan nilai RMSE dan MAE terkecil diperoleh nilai df terbaik yaitu 4.
Bisa juga dengan fungsi df
dim(ns(Auto$acceleration, df=4))
## [1] 392 4
#nilai knots yang ditentukan oleh komputer
attr(ns(Auto$acceleration, df=4),"knots")
## 25% 50% 75%
## 13.775 15.500 17.025
natcubsplineac4 <- ggplot(Auto,aes(x=acceleration, y=mpg)) +
geom_point(alpha=0.55, color="green")+
stat_smooth(method = "lm",
formula = y~ns(x, knots = c(13.775, 15.500, 17.025)),
lty = 1,se=F)+
ggtitle("scatterplot Natural Cubic Spline (df=4)")+
xlab("acceleration") + ylab("Miles per Gallon") +
theme_bw()
natcubsplineac4
mod_splineac4 = lm(mpg ~ ns(acceleration, knots = c(13.775, 15.500, 17.025)),data=Auto)
summary(mod_splineac4)
##
## Call:
## lm(formula = mpg ~ ns(acceleration, knots = c(13.775, 15.5, 17.025)),
## data = Auto)
##
## Residuals:
## Min 1Q Median 3Q Max
## -16.7104 -5.3898 -0.7676 4.7105 23.3111
##
## Coefficients:
## Estimate Std. Error t value
## (Intercept) 13.222 2.962 4.464
## ns(acceleration, knots = c(13.775, 15.5, 17.025))1 13.286 2.845 4.670
## ns(acceleration, knots = c(13.775, 15.5, 17.025))2 9.512 2.298 4.139
## ns(acceleration, knots = c(13.775, 15.5, 17.025))3 18.845 6.719 2.805
## ns(acceleration, knots = c(13.775, 15.5, 17.025))4 19.184 3.419 5.611
## Pr(>|t|)
## (Intercept) 1.06e-05 ***
## ns(acceleration, knots = c(13.775, 15.5, 17.025))1 4.15e-06 ***
## ns(acceleration, knots = c(13.775, 15.5, 17.025))2 4.28e-05 ***
## ns(acceleration, knots = c(13.775, 15.5, 17.025))3 0.00529 **
## ns(acceleration, knots = c(13.775, 15.5, 17.025))4 3.84e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.951 on 387 degrees of freedom
## Multiple R-squared: 0.2149, Adjusted R-squared: 0.2068
## F-statistic: 26.48 on 4 and 387 DF, p-value: < 2.2e-16
Perbandingan ketiga model
nilai_rmseAC <- rbind(6.508779,
6.999822,
6.474905
)
nilai_maeAC <- rbind(5.225846,
5.636786,
5.183302
)
nama_modelAC <- c("Regresi polinomial (d=1)",
"Piecewise constant (breaks=6)",
"Natural cubic spline(df=4)"
)
tibble(hasil <- data.frame(nama_modelAC, nilai_rmseAC, nilai_maeAC))
## # A tibble: 3 x 3
## nama_modelAC nilai_rmseAC nilai_maeAC
## <chr> <dbl> <dbl>
## 1 Regresi polinomial (d=1) 6.51 5.23
## 2 Piecewise constant (breaks=6) 7.00 5.64
## 3 Natural cubic spline(df=4) 6.47 5.18
dataacceleration <- ggplot(Auto,aes(x=acceleration, y=mpg)) +
geom_point(alpha=0.55, color="coral") +
ggtitle("scatterplot mpg vs acceleration")+
xlab("acceleration") + ylab("Miles per Gallon") +
theme_bw()
polinomialac1 <- ggplot(Auto, aes(x=acceleration, y=mpg)) +
geom_point(alpha=0.55, color="navy") +
stat_smooth(method = "lm", formula = y~poly(x, 1), lty = 1, col = "blue",se = F) +
ggtitle("scatterplot Regresi polinomial d=1")+
xlab("acceleration") + ylab("Miles per Gallon") +
theme_bw()
Piecewiseac6 <- ggplot(Auto,aes(x=acceleration, y=mpg)) +
geom_point(alpha=0.55, color="red") +
stat_smooth(method = "lm",
formula = y~cut(x,6),
lty = 1, col = "blue",se = F)+
ggtitle("scatterplot Pie. constant (Knots=5)")+
xlab("acceleration") + ylab("Miles per Gallon") +
theme_bw()
natcubsplineac4 <- ggplot(Auto,aes(x=acceleration, y=mpg)) +
geom_point(alpha=0.55, color="green")+
stat_smooth(method = "lm",
formula = y~ns(x, knots = c(13.775, 15.500, 17.025)),
lty = 1,se=F)+
ggtitle("scatterplot Nat. Cubic Spline (df=4)")+
xlab("acceleration") + ylab("Miles per Gallon") +
theme_bw()
plot_grid(dataacceleration, polinomialac1, Piecewiseac6, natcubsplineac4)
Dari nilai
RMSE dan MAE yang terkecil dan dari visualisasi dari ketiga model diketahui bahwa model terbaik untuk mpg vs acceleration adalah Natural Cubic Spline dengan df=4.
mpg vs displacement
Hubungan non-linier untuk peubah kedua juga dilihat dari grafiknya dimana jika grafik antara satu peubah dengan peubah lainnya membentuk garis lengkung maka hubungannya non-linear. Data untuk variabel prediksi yang digunakan adalah data displacement.
Visualisasi data
datadisplacement <- ggplot(Auto,aes(x=displacement , y=mpg)) +
geom_point(alpha=0.55, color="coral") +
ggtitle("scatterplot mpg vs displacement")+
xlab("displacement") + ylab("Miles per Gallon") +
theme_bw()
datadisplacement
Regresi Polinomial
Dari data Auto yang akan dimodelkan dimana variabel responnya yaitu mpg(bahan bakar) dan variabel prediktornya yaitu displacement. Karena hasil dari pemodelan regresi polinomial bergantung dengan derajat polinomialnya. Jika \(d\) yang digunakan semakin besar dan kurvanya menjadi liar maka untuk menjawab bagian a ini akan dicari terlebih dahulu derajat terbaiknya.
set.seed(1)
cross_val <- vfold_cv(Auto, v=5, strata = "mpg")
derajat <- 1:5
polynomdi <- map_dfr(derajat, function(i){
metric_polynomdi <- map_dfr(cross_val$splits, function(x){
mod <- lm(mpg ~ poly(displacement, derajat=i, raw = T), data=Auto[x$in_id,])
pred <- predict(mod, newdata=Auto[-x$in_id,])
truth <- Auto[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth, response = pred)
mae <- mlr3measures::mae(truth = truth, response = pred)
metric <- c(rmse, mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
metric_polynomdi
}
)
polynomdi <- cbind(derajat = derajat, polynomdi)
tibble(polynomdi)
## # A tibble: 25 x 3
## derajat rmse mae
## <int> <dbl> <dbl>
## 1 1 4.77 3.68
## 2 2 5.07 3.95
## 3 3 4.58 3.41
## 4 4 4.06 3.20
## 5 5 4.62 3.35
## 6 1 4.42 3.26
## 7 2 4.79 3.59
## 8 3 4.25 3.07
## 9 4 3.61 2.71
## 10 5 4.70 3.22
## # ... with 15 more rows
#berdasarkan rmse
polynomdi %>% slice_min(rmse)
## derajat rmse mae
## 1 4 3.593897 2.710126
#berdasarkan mae
polynomdi %>% slice_min(mae)
## derajat rmse mae
## 1 4 3.612463 2.706115
#menghitung rata-rata untuk 10 folds
mean_polynomdi <- colMeans(polynomdi)
mean_polynomdi
## derajat rmse mae
## 3.000000 4.405920 3.243694
polynomdi$derajat <- as.factor(polynomdi$derajat)
polydirmse<-ggplot(data=polynomdi, aes(x=derajat, y=rmse, group=1)) +
geom_line(linetype = "dashed")+
geom_point()+
ggtitle("K-Fold Cross Validation") +
xlab("Degree of Polynomial") +
ylab("Root Mean Square Error")
polynomdi$derajat <- as.factor(polynomdi$derajat)
polydimae<-ggplot(data=polynomdi, aes(x=derajat, y=mae, group=1)) +
geom_line(linetype = "dashed")+
geom_point()+
ggtitle("K-Fold Cross Validation") +
xlab("Degree of Polynomial") +
ylab("Mean Absolute Error")
plot_grid(polydirmse,polydimae, ncol = 2)
Berdasarkan nilai RMSE dan MAE terkecil dan grafik hasil k-fold crossppvalidation diperoleh nilai derajatnya yaitu 4.
polinomialdi4 <- ggplot(Auto, aes(x=displacement, y=mpg)) +
geom_point(alpha=0.55, color="navy") +
stat_smooth(method = "lm", formula = y~poly(x, 4), lty = 1, col = "blue",se = F) +
ggtitle("scatterplot Regresi polinomial d=4")+
xlab("displacement") + ylab("Miles per Gallon") +
theme_bw()
polinomialdi4
mod_polinomialdi4 = lm(mpg ~ poly(displacement,4),data=Auto)
summary(mod_polinomialdi4)
##
## Call:
## lm(formula = mpg ~ poly(displacement, 4), data = Auto)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.7755 -2.3666 -0.2723 2.1005 20.4053
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 23.4459 0.2207 106.217 < 2e-16 ***
## poly(displacement, 4)1 -124.2585 4.3704 -28.432 < 2e-16 ***
## poly(displacement, 4)2 31.0895 4.3704 7.114 5.5e-12 ***
## poly(displacement, 4)3 -4.4655 4.3704 -1.022 0.308
## poly(displacement, 4)4 0.7747 4.3704 0.177 0.859
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.37 on 387 degrees of freedom
## Multiple R-squared: 0.6897, Adjusted R-squared: 0.6865
## F-statistic: 215 on 4 and 387 DF, p-value: < 2.2e-16
Piecewise constant
set.seed(1)
cross_val <- vfold_cv(Auto,v=5,strata = "mpg")
nbreak <- 2:6 #atau knotsnya 1-5
best_tanggadi <- map_dfr(nbreak, function(i){
metric_tanggadi <- map_dfr(cross_val$splits, function(x){
training <- Auto[x$in_id,]
training$displacement <- cut(training$displacement,i)
mod <- lm(mpg ~ displacement, data=training)
labs_x <- levels(mod$model[,2])
labs_x_breaks <- cbind(lower = as.numeric( sub("\\((.+),.*", "\\1", labs_x) ),
upper = as.numeric( sub("[^,]*,([^]]*)\\]", "\\1", labs_x)))
testing <- Auto[-x$in_id,]
displacement_new <- cut(testing$displacement,c(labs_x_breaks[1,1],labs_x_breaks[,2]))
pred <- predict(mod, newdata=list(displacement=displacement_new))
truth <- testing$mpg
data_eval <- na.omit(data.frame(truth,pred))
rmse <- mlr3measures::rmse(truth = data_eval$truth,response = data_eval$pred)
mae <- mlr3measures::mae(truth = data_eval$truth,response = data_eval$pred)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
metric_tanggadi
# menghitung rata-rata untuk 10 folds
mean_metric_tanggadi <- colMeans(metric_tanggadi)
mean_metric_tanggadi
}
)
best_tanggadi <- cbind(breaks=nbreak,best_tanggadi)
# menampilkan hasil all breaks
best_tanggadi
## breaks rmse mae
## 1 2 5.935135 4.637255
## 2 3 4.944349 3.679885
## 3 4 4.849446 3.626666
## 4 5 4.707274 3.511335
## 5 6 4.658795 3.445986
#berdasarkan rmse
best_tanggadi %>% slice_min(rmse)
## breaks rmse mae
## 1 6 4.658795 3.445986
#berdasarkan mae
best_tanggadi %>% slice_min(mae)
## breaks rmse mae
## 1 6 4.658795 3.445986
best_tanggadi$breaks <- as.factor(best_tanggadi$breaks)
tanggadirmse <- ggplot(data=best_tanggadi, aes(x=breaks, y=rmse, group=1)) +
geom_line(linetype = "dashed")+
geom_point()+
ggtitle("K-Fold Cross Validation") +
xlab("Breaks") +
ylab("Root Mean Square Error")
best_tanggadi$breaks <- as.factor(best_tanggadi$breaks)
tanggadimae<-ggplot(data=best_tanggadi, aes(x=breaks, y=mae, group=1)) +
geom_line(linetype = "dashed")+
geom_point()+
ggtitle("K-Fold Cross Validation") +
xlab("Breaks") +
ylab("Mean Absolute Error")
plot_grid(tanggadirmse,tanggadimae, ncol = 2)
Berdasarkan nilai RMSE dan MAE terkecil diperoleh nilai breaks terbaik yaitu 6.
Visualisasi
Piecewisedi6 <- ggplot(Auto,aes(x=displacement, y=mpg)) +
geom_point(alpha=0.55, color="red") +
stat_smooth(method = "lm",
formula = y~cut(x,6),
lty = 1, col = "blue",se = F)+
ggtitle("scatterplot Piecewise constant (Knots=5)")+
xlab("displacement") + ylab("Miles per Gallon") +
theme_bw()
Piecewisedi6
Hasil pemodelan:
mod_tanggadi6 = lm(mpg ~ cut(displacement,6),data=Auto)
summary(mod_tanggadi6) #knots = 5
##
## Call:
## lm(formula = mpg ~ cut(displacement, 6), data = Auto)
##
## Residuals:
## Min 1Q Median 3Q Max
## -11.7726 -2.6598 -0.0436 2.2274 22.0133
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 29.7726 0.3653 81.495 < 2e-16 ***
## cut(displacement, 6)(132,197] -4.3260 0.7147 -6.053 3.38e-09 ***
## cut(displacement, 6)(197,262] -10.7320 0.6713 -15.986 < 2e-16 ***
## cut(displacement, 6)(262,326] -13.7859 0.7873 -17.510 < 2e-16 ***
## cut(displacement, 6)(326,390] -15.1108 0.8816 -17.140 < 2e-16 ***
## cut(displacement, 6)(390,455] -16.1135 1.0623 -15.169 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.679 on 386 degrees of freedom
## Multiple R-squared: 0.6453, Adjusted R-squared: 0.6407
## F-statistic: 140.4 on 5 and 386 DF, p-value: < 2.2e-16
Natural Cubic Spline
set.seed(1)
cross_val <- vfold_cv(Auto,v=5,strata = "mpg")
df <- 2:5 #knots 1-4
best_splinedi <- map_dfr(df, function(i){
metric_splinedi <- map_dfr(cross_val$splits, function(x){
mod <- lm(mpg ~ ns(displacement,df=i), data=Auto[x$in_id,])
pred <- predict(mod, newdata=Auto[-x$in_id,])
truth <- Auto[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth, response = pred)
mae <- mlr3measures::mae(truth = truth, response = pred)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
metric_splinedi
}
)
best_splinedi <- cbind(df=df, best_splinedi)
tibble(best_splinedi)
## # A tibble: 20 x 3
## df rmse mae
## <int> <dbl> <dbl>
## 1 2 4.40 3.25
## 2 3 4.79 3.60
## 3 4 4.24 3.07
## 4 5 3.60 2.71
## 5 2 4.70 3.24
## 6 3 4.39 3.26
## 7 4 4.79 3.62
## 8 5 4.24 3.06
## 9 2 3.60 2.72
## 10 3 4.70 3.24
## 11 4 4.41 3.26
## 12 5 4.85 3.67
## 13 2 4.25 3.06
## 14 3 3.61 2.72
## 15 4 4.70 3.24
## 16 5 4.34 3.31
## 17 2 4.97 3.77
## 18 3 4.12 2.99
## 19 4 3.46 2.58
## 20 5 4.70 3.30
Dari perhitungan berdasarkan nilai RMSE dan MAE terkecil diperoleh df=4
#berdasarkan rmse
best_splinedi %>% slice_min(rmse)
## df rmse mae
## 1 4 3.462453 2.578798
#berdasarkan mae
best_splinedi %>% slice_min(mae)
## df rmse mae
## 1 4 3.462453 2.578798
best_splinedi$df <- as.factor(best_splinedi$df)
natdirmse<-ggplot(data=best_splinedi, aes(x=df, y=rmse, group=1)) +
geom_line(linetype = "dashed")+
geom_point()+
ggtitle("K-Fold Cross Validation") +
xlab("DF") + ylab("Root Mean Square Error")
natdimae<-ggplot(data=best_splinedi, aes(x=df, y=mae, group=1)) +
geom_line(linetype = "dashed")+
geom_point()+
ggtitle("K-Fold Cross Validation") +
xlab("DF") + ylab("Mean Absolute Error")
plot_grid(natdirmse,natdimae)
Dari perhitungan berdasarkan nilai RMSE dan MAE terkecil diperoleh df=4.
atau bisa juga dengan fungsi df
dim(ns(Auto$displacement, df=4))
## [1] 392 4
#nilai knots yang ditentukan oleh komputer
attr(ns(Auto$displacement, df=4),"knots")
## 25% 50% 75%
## 105.00 151.00 275.75
natcubsplinedi4 <- ggplot(Auto,aes(x=displacement, y=mpg)) +
geom_point(alpha=0.55, color="green")+
stat_smooth(method = "lm",
formula = y~ns(x, knots = c(105.00, 151.00, 275.75)),
lty = 1,se=F)+
ggtitle("scatterplot Natural Cubic Spline (df=4)")+
xlab("displacement") + ylab("Miles per Gallon") +
theme_bw()
natcubsplinedi4
mod_splinedi4 = lm(mpg ~ ns(displacement, knots = c(105.00, 151.00, 275.75)),data=Auto)
summary(mod_splinedi4)
##
## Call:
## lm(formula = mpg ~ ns(displacement, knots = c(105, 151, 275.75)),
## data = Auto)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.7432 -2.3595 -0.2352 2.0952 20.4131
##
## Coefficients:
## Estimate Std. Error t value
## (Intercept) 33.999 1.070 31.761
## ns(displacement, knots = c(105, 151, 275.75))1 -12.157 1.268 -9.588
## ns(displacement, knots = c(105, 151, 275.75))2 -17.094 1.346 -12.696
## ns(displacement, knots = c(105, 151, 275.75))3 -24.535 2.502 -9.808
## ns(displacement, knots = c(105, 151, 275.75))4 -18.385 1.341 -13.706
## Pr(>|t|)
## (Intercept) <2e-16 ***
## ns(displacement, knots = c(105, 151, 275.75))1 <2e-16 ***
## ns(displacement, knots = c(105, 151, 275.75))2 <2e-16 ***
## ns(displacement, knots = c(105, 151, 275.75))3 <2e-16 ***
## ns(displacement, knots = c(105, 151, 275.75))4 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.368 on 387 degrees of freedom
## Multiple R-squared: 0.6901, Adjusted R-squared: 0.6869
## F-statistic: 215.4 on 4 and 387 DF, p-value: < 2.2e-16
Perbandingan ketiga model
nilai_rmseDI <- rbind(6.508779,
6.999822,
3.462453
)
nilai_maeDI <- rbind(5.225846,
5.636786,
2.578798
)
nama_modelDI <- c("Regresi polinomial (d=4)",
"Piecewise constant (breaks=6)",
"Natural cubic spline(df=4)"
)
tibble(hasil <- data.frame(nama_modelDI, nilai_rmseDI, nilai_maeDI))
## # A tibble: 3 x 3
## nama_modelDI nilai_rmseDI nilai_maeDI
## <chr> <dbl> <dbl>
## 1 Regresi polinomial (d=4) 6.51 5.23
## 2 Piecewise constant (breaks=6) 7.00 5.64
## 3 Natural cubic spline(df=4) 3.46 2.58
datadisplacement <- ggplot(Auto,aes(x=displacement , y=mpg)) +
geom_point(alpha=0.55, color="coral") +
ggtitle("scatterplot mpg vs displacement")+
xlab("displacement") + ylab("Miles per Gallon") +
theme_bw()
polinomialdi4 <- ggplot(Auto, aes(x=displacement, y=mpg)) +
geom_point(alpha=0.55, color="navy") +
stat_smooth(method = "lm", formula = y~poly(x, 4), lty = 1, col = "blue",se = F) +
ggtitle("scatterplot Regresi polinomial d=4")+
xlab("displacement") + ylab("Miles per Gallon") +
theme_bw()
Piecewisedi6 <- ggplot(Auto,aes(x=displacement, y=mpg)) +
geom_point(alpha=0.55, color="red") +
stat_smooth(method = "lm",
formula = y~cut(x,6),
lty = 1, col = "blue",se = F)+
ggtitle("scatterplot Pie. constant (Knots=5)")+
xlab("displacement") + ylab("Miles per Gallon") +
theme_bw()
natcubsplinedi4 <- ggplot(Auto,aes(x=displacement, y=mpg)) +
geom_point(alpha=0.55, color="green")+
stat_smooth(method = "lm",
formula = y~ns(x, knots = c(105.00, 151.00, 275.75)),
lty = 1,se=F)+
ggtitle("scatterplot Natural Cubic Spline (df=4)")+
xlab("displacement") + ylab("Miles per Gallon") +
theme_bw()
plot_grid(datadisplacement, polinomialdi4, Piecewisedi6, natcubsplinedi4)
Dari nilai RMSE dan MAE yang terkecil dan dari visualisasi dari ketiga model diketahui bahwa model terbaik untuk
mpg vs displacement adalah Natural Cubic Spline dengan df=4.
mpg vs horsepower
Hubungan non-linier untuk peubah kedua juga dilihat dari grafiknya dimana jika grafik antara satu peubah dengan peubah lainnya membentuk garis lengkung maka hubungannya non-linear. Data untuk variabel prediksi yang digunakan adalah data horsepower.
Visualisasi data
datahorsepower <- ggplot(Auto,aes(x=horsepower , y=mpg)) +
geom_point(alpha=0.55, color="coral") +
ggtitle("scatterplot mpg vs horsepower")+
xlab("horsepower") + ylab("Miles per Gallon") +
theme_bw()
datahorsepower
Regresi Polinomial
Dari data Auto yang akan dimodelkan dimana variabel responnya yaitu mpg(bahan bakar) dan variabel prediktornya yaitu horsepower. Karena hasil dari pemodelan regresi polinomial bergantung dengan derajat polinomialnya. Jika \(d\) yang digunakan semakin besar dan kurvanya menjadi liar maka untuk menjawab bagian a ini akan dicari terlebih dahulu derajat terbaiknya.
set.seed(1)
cross_val <- vfold_cv(Auto, v=5, strata = "mpg")
derajat <- 1:5
polynomho <- map_dfr(derajat, function(i){
metric_polynomho <- map_dfr(cross_val$splits, function(x){
mod <- lm(mpg ~ poly(horsepower, derajat=i, raw = T), data=Auto[x$in_id,])
pred <- predict(mod, newdata=Auto[-x$in_id,])
truth <- Auto[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth, response = pred)
mae <- mlr3measures::mae(truth = truth, response = pred)
metric <- c(rmse, mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
metric_polynomho
}
)
polynomho <- cbind(derajat = derajat, polynomho)
tibble(polynomho)
## # A tibble: 25 x 3
## derajat rmse mae
## <int> <dbl> <dbl>
## 1 1 5.15 4.05
## 2 2 5.49 4.29
## 3 3 4.81 3.88
## 4 4 4.62 3.58
## 5 5 4.43 3.41
## 6 1 4.49 3.31
## 7 2 5.19 3.85
## 8 3 4.09 3.14
## 9 4 4.01 2.95
## 10 5 4.06 3.11
## # ... with 15 more rows
#berdasarkan rmse
polynomho %>% slice_min(rmse)
## derajat rmse mae
## 1 4 3.912997 2.886134
#berdasarkan mae
polynomho %>% slice_min(mae)
## derajat rmse mae
## 1 4 3.912997 2.886134
#menghitung rata-rata untuk 10 folds
mean_polynomho <- colMeans(polynomho)
mean_polynomho
## derajat rmse mae
## 3.000000 4.466881 3.380643
polynomho$derajat <- as.factor(polynomho$derajat)
polyhormse<-ggplot(data=polynomho, aes(x=derajat, y=rmse, group=1)) +
geom_line(linetype = "dashed")+
geom_point()+
ggtitle("K-Fold Cross Validation") +
xlab("Degree of Polynomial") +
ylab("Root Mean Square Error")
polynomho$derajat <- as.factor(polynomho$derajat)
polyhomae<-ggplot(data=polynomho, aes(x=derajat, y=mae, group=1)) +
geom_line(linetype = "dashed")+
geom_point()+
ggtitle("K-Fold Cross Validation") +
xlab("Degree of Polynomial") +
ylab("Mean Absolute Error")
plot_grid(polyhormse,polyhomae, ncol = 2)
Berdasarkan nilai RMSE dan MAE terkecil dan grafik hasil k-fold crossppvalidation diperoleh nilai derajatnya yaitu 4.
visualisasi
polinomialho4 <- ggplot(Auto, aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="navy") +
stat_smooth(method = "lm", formula = y~poly(x, 4), lty = 1, col = "blue",se = F) +
ggtitle("scatterplot Regresi polinomial d=4")+
xlab("horsepower") + ylab("Miles per Gallon") +
theme_bw()
polinomialho4
mod_polinomialho4 = lm(mpg ~ poly(horsepower,4),data=Auto)
summary(mod_polinomialho4)
##
## Call:
## lm(formula = mpg ~ poly(horsepower, 4), data = Auto)
##
## Residuals:
## Min 1Q Median 3Q Max
## -14.8820 -2.5802 -0.1682 2.2100 16.1434
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 23.4459 0.2209 106.161 <2e-16 ***
## poly(horsepower, 4)1 -120.1377 4.3727 -27.475 <2e-16 ***
## poly(horsepower, 4)2 44.0895 4.3727 10.083 <2e-16 ***
## poly(horsepower, 4)3 -3.9488 4.3727 -0.903 0.367
## poly(horsepower, 4)4 -5.1878 4.3727 -1.186 0.236
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.373 on 387 degrees of freedom
## Multiple R-squared: 0.6893, Adjusted R-squared: 0.6861
## F-statistic: 214.7 on 4 and 387 DF, p-value: < 2.2e-16
Piecewise constant
set.seed(1)
cross_val <- vfold_cv(Auto,v=5,strata = "mpg")
nbreak <- 2:6 #atau knotsnya 1-5
best_tanggaho <- map_dfr(nbreak, function(i){
metric_tanggaho <- map_dfr(cross_val$splits, function(x){
training <- Auto[x$in_id,]
training$horsepower <- cut(training$horsepower,i)
mod <- lm(mpg ~ horsepower, data=training)
labs_x <- levels(mod$model[,2])
labs_x_breaks <- cbind(lower = as.numeric( sub("\\((.+),.*", "\\1", labs_x) ),
upper = as.numeric( sub("[^,]*,([^]]*)\\]", "\\1", labs_x)))
testing <- Auto[-x$in_id,]
horsepower_new <- cut(testing$horsepower,c(labs_x_breaks[1,1],labs_x_breaks[,2]))
pred <- predict(mod, newdata=list(horsepower=horsepower_new))
truth <- testing$mpg
data_eval <- na.omit(data.frame(truth,pred))
rmse <- mlr3measures::rmse(truth = data_eval$truth,response = data_eval$pred)
mae <- mlr3measures::mae(truth = data_eval$truth,response = data_eval$pred)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
metric_tanggaho
# menghitung rata-rata untuk 10 folds
mean_metric_tanggaho <- colMeans(metric_tanggaho)
mean_metric_tanggaho
}
)
best_tanggaho <- cbind(breaks=nbreak,best_tanggaho)
# menampilkan hasil all breaks
best_tanggaho
## breaks rmse mae
## 1 2 6.154858 4.792849
## 2 3 5.773146 4.523309
## 3 4 4.993096 3.803813
## 4 5 4.718585 3.577538
## 5 6 4.662807 3.547665
#berdasarkan rmse
best_tanggaho %>% slice_min(rmse)
## breaks rmse mae
## 1 6 4.662807 3.547665
#berdasarkan mae
best_tanggaho %>% slice_min(mae)
## breaks rmse mae
## 1 6 4.662807 3.547665
best_tanggaho$breaks <- as.factor(best_tanggaho$breaks)
tanggahormse <- ggplot(data=best_tanggaho, aes(x=breaks, y=rmse, group=1)) +
geom_line(linetype = "dashed")+
geom_point()+
ggtitle("K-Fold Cross Validation") +
xlab("Breaks") +
ylab("Root Mean Square Error")
best_tanggaho$breaks <- as.factor(best_tanggaho$breaks)
tanggahomae<-ggplot(data=best_tanggaho, aes(x=breaks, y=mae, group=1)) +
geom_line(linetype = "dashed")+
geom_point()+
ggtitle("K-Fold Cross Validation") +
xlab("Breaks") +
ylab("Mean Absolute Error")
plot_grid(tanggahormse,tanggahomae, ncol = 2)
Berdasarkan nilai RMSE dan MAE terkecil diperoleh nilai breaks terbaik yaitu 6.
Visualisasi
Piecewiseho6 <- ggplot(Auto,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="red") +
stat_smooth(method = "lm",
formula = y~cut(x,6),
lty = 1, col = "blue",se = F)+
ggtitle("scatterplot Piecewise constant (Knots=5)")+
xlab("horsepower") + ylab("Miles per Gallon") +
theme_bw()
Piecewiseho6
Hasil pemodelan:
mod_tanggaho6 = lm(mpg ~ cut(horsepower,6),data=Auto)
summary(mod_tanggaho6) #knots = 5
##
## Call:
## lm(formula = mpg ~ cut(horsepower, 6), data = Auto)
##
## Residuals:
## Min 1Q Median 3Q Max
## -17.0252 -2.9344 -0.1298 2.5172 14.5748
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 32.0252 0.4604 69.56 <2e-16 ***
## cut(horsepower, 6)(76.7,107] -8.0908 0.5947 -13.60 <2e-16 ***
## cut(horsepower, 6)(107,138] -12.2742 0.8109 -15.14 <2e-16 ***
## cut(horsepower, 6)(138,169] -16.9697 0.7850 -21.62 <2e-16 ***
## cut(horsepower, 6)(169,199] -18.3824 1.1187 -16.43 <2e-16 ***
## cut(horsepower, 6)(199,230] -19.3889 1.4821 -13.08 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.672 on 386 degrees of freedom
## Multiple R-squared: 0.6462, Adjusted R-squared: 0.6416
## F-statistic: 141 on 5 and 386 DF, p-value: < 2.2e-16
Natural Cubic Spline
set.seed(1)
cross_val <- vfold_cv(Auto,v=5,strata = "mpg")
df <- 2:5 #knots 1-4
best_splineho <- map_dfr(df, function(i){
metric_splineho <- map_dfr(cross_val$splits, function(x){
mod <- lm(mpg ~ ns(horsepower,df=i), data=Auto[x$in_id,])
pred <- predict(mod, newdata=Auto[-x$in_id,])
truth <- Auto[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth, response = pred)
mae <- mlr3measures::mae(truth = truth, response = pred)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
metric_splineho
}
)
best_splineho <- cbind(df=df, best_splineho)
tibble(best_splineho)
## # A tibble: 20 x 3
## df rmse mae
## <int> <dbl> <dbl>
## 1 2 4.48 3.32
## 2 3 5.17 3.83
## 3 4 4.05 3.10
## 4 5 3.98 2.92
## 5 2 4.05 3.11
## 6 3 4.48 3.32
## 7 4 5.18 3.84
## 8 5 4.08 3.12
## 9 2 3.99 2.93
## 10 3 4.05 3.12
## 11 4 4.58 3.34
## 12 5 5.11 3.82
## 13 2 4.04 3.07
## 14 3 3.90 2.87
## 15 4 4.02 3.11
## 16 5 4.72 3.43
## 17 2 5.05 3.78
## 18 3 3.97 3.04
## 19 4 3.96 2.91
## 20 5 3.98 3.04
#berdasarkan rmse
best_splineho %>% slice_min(rmse)
## df rmse mae
## 1 3 3.902638 2.868569
#berdasarkan mae
best_splineho %>% slice_min(mae)
## df rmse mae
## 1 3 3.902638 2.868569
best_splineho$df <- as.factor(best_splineho$df)
nathormse<-ggplot(data=best_splineho, aes(x=df, y=rmse, group=1)) +
geom_line(linetype = "dashed")+
geom_point()+
ggtitle("K-Fold Cross Validation") +
xlab("DF") + ylab("Root Mean Square Error")
nathomae<-ggplot(data=best_splineho, aes(x=df, y=mae, group=1)) +
geom_line(linetype = "dashed")+
geom_point()+
ggtitle("K-Fold Cross Validation") +
xlab("DF") + ylab("Mean Absolute Error")
plot_grid(nathormse,nathomae)
Dari perhitungan berdasarkan nilai
RMSE dan MAE terkecil diperoleh df=3.
atau bisa juga dengan fungsi df
dim(ns(Auto$horsepower, df=3))
## [1] 392 3
#nilai knots yang ditentukan oleh komputer
attr(ns(Auto$horsepower, df=3),"knots")
## 33.33333% 66.66667%
## 84 110
Visualisasi
natcubsplineho3 <- ggplot(Auto,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="green")+
stat_smooth(method = "lm",
formula = y~ns(x, knots = c(84, 110)),
lty = 1,se=F)+
ggtitle("scatterplot Natural Cubic Spline (df=3)")+
xlab("horsepower") + ylab("Miles per Gallon") +
theme_bw()
natcubsplineho3
mod_splineho3 = lm(mpg ~ ns(horsepower, knots = c(84, 110)),data=Auto)
summary(mod_splineho3)
##
## Call:
## lm(formula = mpg ~ ns(horsepower, knots = c(84, 110)), data = Auto)
##
## Residuals:
## Min 1Q Median 3Q Max
## -14.8308 -2.4068 -0.1672 2.2524 15.9184
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 38.276 1.039 36.85 <2e-16 ***
## ns(horsepower, knots = c(84, 110))1 -21.311 1.054 -20.22 <2e-16 ***
## ns(horsepower, knots = c(84, 110))2 -35.274 2.313 -15.25 <2e-16 ***
## ns(horsepower, knots = c(84, 110))3 -19.669 1.435 -13.70 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.362 on 388 degrees of freedom
## Multiple R-squared: 0.6901, Adjusted R-squared: 0.6877
## F-statistic: 288 on 3 and 388 DF, p-value: < 2.2e-16
Perbandingan ketiga model
nilai_rmseHO <- rbind(3.912997,
4.662807,
3.902638
)
nilai_maeHO <- rbind(2.886134,
3.547665,
2.868569
)
nama_modelHO <- c("Regresi polinomial (d=4)",
"Piecewise constant (breaks=6)",
"Natural cubic spline(df=3)"
)
tibble(hasil <- data.frame(nama_modelHO, nilai_rmseHO, nilai_maeHO))
## # A tibble: 3 x 3
## nama_modelHO nilai_rmseHO nilai_maeHO
## <chr> <dbl> <dbl>
## 1 Regresi polinomial (d=4) 3.91 2.89
## 2 Piecewise constant (breaks=6) 4.66 3.55
## 3 Natural cubic spline(df=3) 3.90 2.87
datahorsepower <- ggplot(Auto,aes(x=horsepower , y=mpg)) +
geom_point(alpha=0.55, color="coral") +
ggtitle("scatterplot mpg vs horsepower")+
xlab("horsepower") + ylab("Miles per Gallon") +
theme_bw()
polinomialho4 <- ggplot(Auto, aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="navy") +
stat_smooth(method = "lm", formula = y~poly(x, 4), lty = 1, col = "blue",se = F) +
ggtitle("scatterplot Regresi polinomial d=4")+
xlab("horsepower") + ylab("Miles per Gallon") +
theme_bw()
Piecewiseho6 <- ggplot(Auto,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="red") +
stat_smooth(method = "lm",
formula = y~cut(x,6),
lty = 1, col = "blue",se = F)+
ggtitle("scatterplot Piecewise constant (Knots=5)")+
xlab("horsepower") + ylab("Miles per Gallon") +
theme_bw()
natcubsplineho3 <- ggplot(Auto,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="green")+
stat_smooth(method = "lm",
formula = y~ns(x, knots = c(84, 110)),
lty = 1,se=F)+
ggtitle("scatterplot Natural Cubic Spline (df=3)")+
xlab("horsepower") + ylab("Miles per Gallon") +
theme_bw()
plot_grid(datahorsepower, polinomialho4, Piecewiseho6, natcubsplineho3)
Dari nilai RMSE dan MAE yang terkecil dan dari visualisasi dari ketiga model diketahui bahwa model terbaik untuk mpg vs acceleration adalah Natural Cubic Spline dengan df=3.
Kesimpulan
- Pemodelan terbaik hasil dari cross validation dari masing-masing model belum tentu merupakan model yang bisa diterima. Sehingga, perlu pandangan analis untuk melakukan evaluasi terhadap model-model yang diperoleh.
Referensi
Sartono, B. (November 1, 2020). Moving Beyond Linearity. Retrieved from https://rpubs.com/bagusco/taklinear
Dito, G.A. (April 14, 2021). Regresi Polinomial, Regresi Fungsi Tangga dan Regresi Spline di R. Retrieved from https://gerrydito.github.io/Regresi-Polinomial,-Regresi-Fungsi-Tangga-dan-Regresi-Spline/
TERIMAKASIH