Library
library(MultiKink)
library(tidyverse)
library(ggplot2)
library(dplyr)
library(purrr)
library(rsample)
library(ISLR)
library(splines)
Nonlinear Regression Data Auto
Data
Data yang digunakan merupakan Auto Data Set dari package
ISLR dengan 392 amatan. Data terdiri dari tiga kolom yaitu,
mpg, horsepower, dan origin.
mpg: miles per galon
horsepower: Engine horsepower
origin: Origin of car (1. American, 2. European, 3. Japanese)
AutoData = Auto %>% select(mpg, horsepower, origin)
tibble(AutoData)
Jika kita gambarkan dalam bentuk scatterplot
ggplot(AutoData,aes(x=mpg, y=horsepower)) +
geom_point(alpha=0.55, color="black") +
theme_bw()
Berdasarkan pola hubungan yang terlihat pada scatterplot, kita akan mencoba untuk mencari model yang bisa merepresentasikan pola hubungan tersebut dengan baik.
Regresi Linear
reglinear = lm(horsepower~mpg,data=AutoData)
summarylinear <- summary(reglinear)
summarylinear
##
## Call:
## lm(formula = horsepower ~ mpg, data = AutoData)
##
## Residuals:
## Min 1Q Median 3Q Max
## -64.892 -15.716 -2.094 13.108 96.947
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 194.4756 3.8732 50.21 <2e-16 ***
## mpg -3.8389 0.1568 -24.49 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 24.19 on 390 degrees of freedom
## Multiple R-squared: 0.6059, Adjusted R-squared: 0.6049
## F-statistic: 599.7 on 1 and 390 DF, p-value: < 2.2e-16
summarylinear$r.squared
## [1] 0.6059483
tabeldata <- rbind(data.frame("y_aktual"=AutoData$horsepower,
"y_pred"=predict(reglinear),
"residuals"=residuals(reglinear)))
head(tabeldata)
ggplot(AutoData,aes(x=mpg, y=horsepower)) +
geom_point(alpha=0.55, color="black") +
stat_smooth(method = "lm",
formula = y~x,lty = 1,
col = "blue",se = F)+
theme_bw()
Regresi Polinomial
Derajat 2 (ordo 2)
modelpolinomial = lm(horsepower ~ poly(mpg,2,raw = T),
data=AutoData)
summary(modelpolinomial)
##
## Call:
## lm(formula = horsepower ~ poly(mpg, 2, raw = T), data = AutoData)
##
## Residuals:
## Min 1Q Median 3Q Max
## -72.52 -11.24 -0.11 9.47 92.98
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 302.06824 9.25689 32.63 <2e-16 ***
## poly(mpg, 2, raw = T)1 -13.32406 0.77456 -17.20 <2e-16 ***
## poly(mpg, 2, raw = T)2 0.18804 0.01513 12.43 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 20.49 on 389 degrees of freedom
## Multiple R-squared: 0.718, Adjusted R-squared: 0.7165
## F-statistic: 495.1 on 2 and 389 DF, p-value: < 2.2e-16
ggplot(AutoData,aes(x=mpg, y=horsepower)) +
geom_point(alpha=0.55, color="black") +
stat_smooth(method = "lm",
formula = y~poly(x,2,raw=T),
lty = 1, col = "blue",se = F)+
theme_bw()
ggplot(AutoData,aes(x=mpg, y=horsepower)) +
geom_point(alpha=0.55, color="black") +
stat_smooth(method = "lm",
formula = y~poly(x,2,raw=T),
lty = 1, col = "blue",se = T)+
theme_bw()
Derajat 3 (ordo 3)
modelpolinomial3 = lm(horsepower~ poly(mpg,3,raw = T),data=AutoData)
summary(modelpolinomial3)
##
## Call:
## lm(formula = horsepower ~ poly(mpg, 3, raw = T), data = AutoData)
##
## Residuals:
## Min 1Q Median 3Q Max
## -71.935 -11.251 -1.338 9.324 95.459
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 429.581461 23.823018 18.032 < 2e-16 ***
## poly(mpg, 3, raw = T)1 -30.131244 3.006538 -10.022 < 2e-16 ***
## poly(mpg, 3, raw = T)2 0.866793 0.118533 7.313 1.51e-12 ***
## poly(mpg, 3, raw = T)3 -0.008506 0.001474 -5.770 1.62e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 19.69 on 388 degrees of freedom
## Multiple R-squared: 0.7403, Adjusted R-squared: 0.7382
## F-statistic: 368.6 on 3 and 388 DF, p-value: < 2.2e-16
ggplot(AutoData,aes(x=mpg, y=horsepower)) +
geom_point(alpha=0.55, color="black") +
stat_smooth(method = "lm",
formula = y~poly(x,3,raw=T),
lty = 1, col = "blue",se = T)+
theme_bw()
Regresi Fungsi Tangga
Fungsi Tangga (7)
modeltangga = lm(horsepower ~ cut(mpg,7),data=AutoData)
summary(modeltangga)
##
## Call:
## lm(formula = horsepower ~ cut(mpg, 7), data = AutoData)
##
## Residuals:
## Min 1Q Median 3Q Max
## -52.48 -12.96 -2.25 10.31 105.52
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 169.692 2.960 57.32 <2e-16 ***
## cut(mpg, 7)(14.4,19.7] -45.208 3.669 -12.32 <2e-16 ***
## cut(mpg, 7)(19.7,25.1] -74.908 3.734 -20.06 <2e-16 ***
## cut(mpg, 7)(25.1,30.5] -88.317 3.885 -22.73 <2e-16 ***
## cut(mpg, 7)(30.5,35.9] -96.865 4.186 -23.14 <2e-16 ***
## cut(mpg, 7)(35.9,41.2] -100.442 5.268 -19.07 <2e-16 ***
## cut(mpg, 7)(41.2,46.6] -111.978 8.594 -13.03 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 21.35 on 385 degrees of freedom
## Multiple R-squared: 0.6972, Adjusted R-squared: 0.6924
## F-statistic: 147.7 on 6 and 385 DF, p-value: < 2.2e-16
ggplot(AutoData,aes(x=mpg, y=horsepower)) +
geom_point(alpha=0.55, color="black") +
stat_smooth(method = "lm",
formula = y~cut(x,7),
lty = 1, col = "blue",se = F)+
theme_bw()
Fungsi Tangga (10)
modeltangga10= lm(mpg ~ cut(horsepower,10),data=AutoData)
summary(modeltangga10)
##
## Call:
## lm(formula = mpg ~ cut(horsepower, 10), data = AutoData)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.5663 -2.8183 -0.2674 2.3709 16.0337
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 33.8963 0.8803 38.503 < 2e-16 ***
## cut(horsepower, 10)(64.4,82.8] -3.3300 0.9976 -3.338 0.000927 ***
## cut(horsepower, 10)(82.8,101] -10.0780 0.9744 -10.343 < 2e-16 ***
## cut(horsepower, 10)(101,120] -13.0463 1.1183 -11.666 < 2e-16 ***
## cut(horsepower, 10)(120,138] -16.0113 1.3495 -11.864 < 2e-16 ***
## cut(horsepower, 10)(138,156] -18.6289 1.1090 -16.798 < 2e-16 ***
## cut(horsepower, 10)(156,175] -19.8040 1.5442 -12.825 < 2e-16 ***
## cut(horsepower, 10)(175,193] -20.5392 1.5065 -13.633 < 2e-16 ***
## cut(horsepower, 10)(193,212] -22.0963 2.2271 -9.921 < 2e-16 ***
## cut(horsepower, 10)(212,230] -20.5213 1.8414 -11.145 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.574 on 382 degrees of freedom
## Multiple R-squared: 0.6644, Adjusted R-squared: 0.6565
## F-statistic: 84.03 on 9 and 382 DF, p-value: < 2.2e-16
ggplot(AutoData,aes(x=mpg, y=horsepower)) +
geom_point(alpha=0.55, color="black") +
stat_smooth(method = "lm",
formula = y~cut(x,10),
lty = 1, col = "blue",se = F)+
theme_bw()
Perbandingan Model Nonlinear Regression
Membandingkan model
MSE = function(pred,actual){
mean((pred-actual)^2)
}
model <- data.frame("Model" = c("Linear", "Polynomial (Ordo=2)","Polynomial (Ordo=3)", "Tangga breaks=7", "Tangga breaks=10"), "AIC" = c(AIC(reglinear),AIC(modelpolinomial),AIC(modelpolinomial3),AIC(modeltangga),AIC(modeltangga10)), "MSE" = c(MSE(predict(reglinear),AutoData$mpg),
MSE(predict(modelpolinomial),AutoData$mpg),
MSE(predict(modelpolinomial3),AutoData$mpg),
MSE(predict(modeltangga),AutoData$mpg),
MSE(predict(modeltangga10),AutoData$mpg)))
knitr::kable(model)
| Model | AIC | MSE |
|---|---|---|
| Linear | 3614.324 | 7987.55223 |
| Polynomial (Ordo=2) | 3485.217 | 8153.09082 |
| Polynomial (Ordo=3) | 3454.949 | 8186.02560 |
| Tangga breaks=7 | 3521.117 | 8103.01147 |
| Tangga breaks=10 | 2316.374 | 20.39147 |
Berdasarkan nilai AIC dan MSE, model terbaik saat ini ialah model tangga dengan jumlah 10 selang karena memiliki nilai AIC dan MSE paling rendah dibandingkan model lainnya.
Regresi Spline
#nilai knots yang ditentukan oleh komputer
attr(bs(AutoData$mpg, df=6),"knots")
## 25% 50% 75%
## 17.00 22.75 29.00
b-spline
Menggunakan knot yang ditentukan fungsi komputer
knotvalue = attr(bs(AutoData$mpg, df=6),"knots")
modelspline1 = lm(horsepower ~ bs(mpg, knots =knotvalue ),data=AutoData)
summary(modelspline1)
##
## Call:
## lm(formula = horsepower ~ bs(mpg, knots = knotvalue), data = AutoData)
##
## Residuals:
## Min 1Q Median 3Q Max
## -74.043 -10.103 -0.818 8.075 95.778
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 195.012 13.181 14.795 < 2e-16 ***
## bs(mpg, knots = knotvalue)1 2.792 18.879 0.148 0.882
## bs(mpg, knots = knotvalue)2 -80.488 12.649 -6.363 5.62e-10 ***
## bs(mpg, knots = knotvalue)3 -103.774 14.645 -7.086 6.62e-12 ***
## bs(mpg, knots = knotvalue)4 -126.115 14.600 -8.638 < 2e-16 ***
## bs(mpg, knots = knotvalue)5 -127.143 18.836 -6.750 5.45e-11 ***
## bs(mpg, knots = knotvalue)6 -141.413 18.226 -7.759 7.77e-14 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 19.56 on 385 degrees of freedom
## Multiple R-squared: 0.7457, Adjusted R-squared: 0.7417
## F-statistic: 188.1 on 6 and 385 DF, p-value: < 2.2e-16
ggplot(AutoData,aes(x=mpg, y=horsepower)) +
geom_point(alpha=0.55, color="black") +
stat_smooth(method = "lm",
formula = y~bs(x, knots = knotvalue),
lty = 1,se = F)
Natural Spline
modelsplinenatural = lm(horsepower ~ ns(mpg, knots = knotvalue),data=AutoData)
summary(modelsplinenatural)
##
## Call:
## lm(formula = horsepower ~ ns(mpg, knots = knotvalue), data = AutoData)
##
## Residuals:
## Min 1Q Median 3Q Max
## -72.461 -10.731 -1.433 8.759 95.650
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 218.000 6.786 32.13 <2e-16 ***
## ns(mpg, knots = knotvalue)1 -130.511 6.470 -20.17 <2e-16 ***
## ns(mpg, knots = knotvalue)2 -109.368 6.183 -17.69 <2e-16 ***
## ns(mpg, knots = knotvalue)3 -237.134 15.815 -14.99 <2e-16 ***
## ns(mpg, knots = knotvalue)4 -115.116 8.615 -13.36 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 19.62 on 387 degrees of freedom
## Multiple R-squared: 0.7429, Adjusted R-squared: 0.7403
## F-statistic: 279.6 on 4 and 387 DF, p-value: < 2.2e-16
ggplot(AutoData,aes(x=mpg, y=horsepower)) +
geom_point(alpha=0.55, color="black")+
stat_smooth(method = "lm",
formula = y~ns(x, knots = knotvalue),
lty = 1,se=F)
## Smoothing Spline
Fungsi smooth.spline() dapat digunakan untuk melakukan
smoothing spline pada R
modelsmooth<- with(data = AutoData,smooth.spline(horsepower,mpg))
modelsmooth
## Call:
## smooth.spline(x = horsepower, y = mpg)
##
## Smoothing Parameter spar= 0.2648644 lambda= 5.101567e-07 (12 iterations)
## Equivalent Degrees of Freedom (Df): 42.14987
## Penalized Criterion (RSS): 1117.657
## GCV: 18.64132
Fungsi smooth.spline secara otomatis memilih paramter
lambda terbaik dengan menggunakan Cross-Validation (CV) dan Generalized
Cross-Validation (GCV). Banyaknya folds (lipatan) pada CV dan GCV ini
adalah sejumlah observasinya.
predictiondata <- broom::augment(modelsmooth)
ggplot(predictiondata,aes(x=x,y=y))+
geom_point(alpha=0.55, color="black")+
geom_line(aes(y=.fitted),col="blue",
lty=1)+
xlab("mpg")+
ylab("horsepower")+
theme_bw()
Pengaruh parameter lambda terhadap tingkat smoothness kurva bisa dilihat dari ilustrasi berikut:
modelsmoothlambda <- data.frame(lambda=seq(0,5,by=0.5)) %>%
group_by(lambda) %>%
do(broom::augment(with(data = AutoData,smooth.spline(mpg,horsepower,lambda = .$lambda))))
p <- ggplot(modelsmoothlambda,
aes(x=x,y=y))+
geom_line(aes(y=.fitted),
col="blue",
lty=1
)+
facet_wrap(~lambda)
p
Jika kita tentukan df=7, maka hasil kurva model
smooth.spline akan lebih merepresentasikan data
modelsmooth2 <- with(data = AutoData,smooth.spline(horsepower,mpg,df=7))
modelsmooth2
## Call:
## smooth.spline(x = horsepower, y = mpg, df = 7)
##
## Smoothing Parameter spar= 0.7927002 lambda= 0.003320238 (12 iterations)
## Equivalent Degrees of Freedom (Df): 6.999082
## Penalized Criterion (RSS): 2424.819
## GCV: 18.84972
pred_data <- broom::augment(modelsmooth2)
ggplot(pred_data,aes(x=x,y=y))+
geom_point(alpha=0.55, color="black")+
geom_line(aes(y=.fitted),col="blue",
lty=1)+
xlab("mpg")+
ylab("horsepower")+
theme_bw()
LOESS
Kali ini kita akan mencoba melakukan pendekatan local regression
dengan fungsi loess().
modloess <- loess(horsepower~mpg,
data = AutoData)
summary(modloess)
## Call:
## loess(formula = horsepower ~ mpg, data = AutoData)
##
## Number of Observations: 392
## Equivalent Number of Parameters: 4.8
## Residual Standard Error: 19.58
## Trace of smoother matrix: 5.24 (exact)
##
## Control settings:
## span : 0.75
## degree : 2
## family : gaussian
## surface : interpolate cell = 0.2
## normalize: TRUE
## parametric: FALSE
## drop.square: FALSE
Pengaruh parameter span terhadap tingkat smoothness kurva bisa dilihat dari ilustrasi berikut:
modloessspan <- data.frame(span=seq(0.1,5,by=0.5)) %>%
group_by(span) %>%
do(broom::augment(loess(horsepower~mpg,
data = AutoData,span=.$span)))
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : pseudoinverse used at 14
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : neighborhood radius 1
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : reciprocal condition number 6.6352e-16
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : There are other near singularities as well. 1
p2 <- ggplot(modloessspan,
aes(x=mpg,y=horsepower))+
geom_line(aes(y=.fitted),
col="blue",
lty=1
)+
facet_wrap(~span)
p2
Pendekatan ini juga dapat dilakukan dengan fungsi
stat_smooth() pada package ggplot2.
library(ggplot2)
ggplot(AutoData, aes(horsepower,mpg)) +
geom_point(alpha=0.5,color="black") +
stat_smooth(method='loess',
formula=y~x,
span = 0.75,
col="blue",
lty=1,
se=F)
Evaluasi Model Menggunakan Cross Validation
Regresi Linear
set.seed(777)
crossval <- vfold_cv(AutoData,v=10,strata = "horsepower")
metriclinear <- map_dfr(crossval$splits,
function(x){
mod <- lm(horsepower ~ mpg,data=AutoData[x$in_id,])
pred <- predict(mod,newdata=AutoData[-x$in_id,])
truth <- AutoData[-x$in_id,]$horsepower
rmse <- mlr3measures::rmse(truth = truth,response = pred)
mae <- mlr3measures::mae(truth = truth,response = pred)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
metriclinear
# menghitung rata-rata untuk 10 folds
meanmetriclinear <- colMeans(metriclinear)
meanmetriclinear
## rmse mae
## 24.04699 18.43851
Polynomial Derajat 2 (ordo 2)
set.seed(777)
crossval <- vfold_cv(AutoData,v=10,strata = "horsepower")
metricpoly <- map_dfr(crossval$splits,
function(x){
mod <- lm(horsepower ~ poly(mpg,2,raw = T),data=AutoData[x$in_id,])
pred <- predict(mod,newdata=AutoData[-x$in_id,])
truth <- AutoData[-x$in_id,]$horsepower
rmse <- mlr3measures::rmse(truth = truth,response = pred)
mae <- mlr3measures::mae(truth = truth,response = pred)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
metricpoly
# menghitung rata-rata untuk 10 folds
meanmetricpoly <- colMeans(metricpoly)
meanmetricpoly
## rmse mae
## 20.35446 14.88426
Polynomial Derajat 3 (ordo 3)
set.seed(777)
crossval <- vfold_cv(AutoData,v=10,strata = "horsepower")
metricpoly3 <- map_dfr(crossval$splits,
function(x){
mod <- lm(horsepower ~ poly(mpg,3,raw = T),data=AutoData[x$in_id,])
pred <- predict(mod,newdata=AutoData[-x$in_id,])
truth <- AutoData[-x$in_id,]$horsepower
rmse <- mlr3measures::rmse(truth = truth,response = pred)
mae <- mlr3measures::mae(truth = truth,response = pred)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
metricpoly3
# menghitung rata-rata untuk 10 folds
meanmetricpoly3 <- colMeans(metricpoly3)
meanmetricpoly3
## rmse mae
## 19.48501 14.29354
Fungsi Tangga
set.seed(777)
crossval <- vfold_cv(AutoData,v=10,strata = "horsepower")
breaks <- 7:10
besttangga <- map_dfr(breaks, function(i){
metrictangga <- map_dfr(crossval$splits,
function(x){
training <- AutoData[x$in_id,]
training$mpg <- cut(training$mpg,i)
mod <- lm(horsepower ~ mpg ,data=training)
labs_x <- levels(mod$model[,2])
labs_x_breaks <- cbind(lower = as.numeric( sub("\\((.+),.*", "\\1", labs_x) ),
upper = as.numeric( sub("[^,]*,([^]]*)\\]", "\\1", labs_x) ))
testing <- AutoData[-x$in_id,]
mpg_new <- cut(testing$mpg,c(labs_x_breaks[1,1],labs_x_breaks[,2]))
pred <- predict(mod,newdata=list(mpg=mpg_new))
truth <- testing$horsepower
data_eval <- na.omit(data.frame(truth,pred))
rmse <- mlr3measures::rmse(truth = data_eval$truth,response = data_eval$pred)
mae <- mlr3measures::mae(truth = data_eval$truth,response = data_eval$pred)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
metrictangga
# menghitung rata-rata untuk 10 folds
meanmetrictangga <- colMeans(metrictangga)
meanmetrictangga
}
)
besttangga <- cbind(breaks=breaks,besttangga)
# menampilkan hasil all breaks
besttangga
#berdasarkan rmse
besttangga %>% slice_min(rmse)
#berdasarkan mae
besttangga %>% slice_min(mae)
Spline regression
set.seed(777)
crossval <- vfold_cv(AutoData,v=10,strata = "horsepower")
bestspline <- map_dfr(crossval$splits,
function(x){
mod <- lm(horsepower ~ bs(mpg,knots=knotvalue),data=AutoData[x$in_id,])
pred <- predict(mod,newdata=AutoData[-x$in_id,])
truth <- AutoData[-x$in_id,]$horsepower
rmse <- mlr3measures::rmse(truth = truth,response = pred)
mae <- mlr3measures::mae(truth = truth,response = pred)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
## Warning in bs(mpg, degree = 3L, knots = c(`25%` = 17, `50%` = 22.75, `75%` = 29:
## some 'x' values beyond boundary knots may cause ill-conditioned bases
## Warning in bs(mpg, degree = 3L, knots = c(`25%` = 17, `50%` = 22.75, `75%` = 29:
## some 'x' values beyond boundary knots may cause ill-conditioned bases
bestspline <- colMeans(bestspline)
bestspline
## rmse mae
## 19.39520 14.07474
Natural Spline Regression
set.seed(777)
crossval <- vfold_cv(AutoData,v=10,strata = "horsepower")
bestnatural <- map_dfr(crossval$splits,
function(x){
mod <- lm(horsepower ~ ns(mpg,knots=knotvalue),data=AutoData[x$in_id,])
pred <- predict(mod,newdata=AutoData[-x$in_id,])
truth <- AutoData[-x$in_id,]$horsepower
rmse <- mlr3measures::rmse(truth = truth,response = pred)
mae <- mlr3measures::mae(truth = truth,response = pred)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
bestnatural <- colMeans(bestnatural)
bestnatural
## rmse mae
## 19.41492 14.15844
LOESS
set.seed(777)
crossval <- vfold_cv(AutoData,v=10,strata = "horsepower")
span <- seq(0.1,1,length.out=50)
bestloess <- map_dfr(span, function(i){
metricloess <- map_dfr(crossval$splits,
function(x){
mod <- loess(horsepower ~mpg,span = i,
data=AutoData[x$in_id,])
pred <- predict(mod,
newdata=AutoData[-x$in_id,])
truth <- AutoData[-x$in_id,]$horsepower
data_eval <- na.omit(data.frame(pred=pred,
truth=truth))
rmse <- mlr3measures::rmse(truth = data_eval$truth,
response = data_eval$pred
)
mae <- mlr3measures::mae(truth = data_eval$truth,
response = data_eval$pred
)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
head(metricloess,20)
# menghitung rata-rata untuk 10 folds
meanmetricloess <- colMeans(metricloess)
meanmetricloess
}
)
bestloess <- cbind(span=span,bestloess)
# menampilkan hasil all breaks
head(bestloess)
#berdasarkan rmse
bestloess %>% slice_min(rmse)
Perbandingan Model
nilai_metric <- rbind(meanmetriclinear,
meanmetricpoly,
meanmetricpoly3,
besttangga[4,2:3],
bestspline,
bestnatural,
bestloess[27,2:3])
rownames(nilai_metric) <- c("Linear","Polinomial ordo=2","Polinomial ordo=3","Tangga breaks=10","Spline Regression", "Natural Spline Regression", "LOESS")
knitr::kable(nilai_metric)
| rmse | mae | |
|---|---|---|
| Linear | 24.04699 | 18.43851 |
| Polinomial ordo=2 | 20.35446 | 14.88426 |
| Polinomial ordo=3 | 19.48501 | 14.29354 |
| Tangga breaks=10 | 20.08860 | 14.34280 |
| Spline Regression | 19.39520 | 14.07474 |
| Natural Spline Regression | 19.41492 | 14.15844 |
| LOESS | 19.28214 | 13.77604 |
Simpulan
Berdasarkan hasil Cross-Validation 10 folds, model LOESS merupakan model terbaik untuk hubungan data horsepower~mpg karena memiliki nilai rmse dan mae yang paling kecil dibanding model lainnya.