DATA
Data yang digunakan yaitu data Auto
. Data ini berisi tentang informasi mengenai 392 jenis kendaraan. Berikut adalah penjelasan untuk masing-masing variabel dalam data:
- mpg: miles per gallon
- cylinders: Number of cylinders between 4 and 8
- displacement: Engine displacement (cu. inches)
- horsepower: Engine horsepower
- weight: Vehicle weight (lbs.)
- acceleration: Time to accelerate from 0 to 60 mph (sec.)
- year: Model year (modulo 100)
- origin: Origin of car (1. American, 2. European, 3. Japanese)
- name: Vehicle name
Peubah respon yang digunakan yaitu mpg
(miles per gallon) sedangkan peubah penjelasnya yaitu horsepower
.
library(tidyverse)
library(ggplot2)
library(splines)
library(rsample)
library(ISLR)
library(cowplot)
library(caret) #cross-validation
library(gridExtra) #combining graphs
library(ggplot2)
library(dplyr)
library(dplyr)
library(knitr)
library(DT)
library(kableExtra)
library(gam)
library(ggpubr)
dataauto = Auto %>% select(mpg, horsepower, origin)
head(dataauto)
## mpg horsepower origin
## 1 18 130 1
## 2 15 165 1
## 3 18 150 1
## 4 16 150 1
## 5 17 140 1
## 6 15 198 1
Pemodelan mpg vs horsepower
Plot mpg vs horsepower
ggplot(dataauto,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="navy") +
theme_bw()
Regresi Polinomial
set.seed(018)
val_silang <- vfold_cv(Auto,v=10,strata = "mpg")
derajat <- 1:5
polinomial <- map_dfr(derajat, function(i) {
metric_poli <- map_dfr(val_silang$splits,
function(x) {
mod <- lm(mpg ~ poly(horsepower,derajat=i), data=Auto[x$in_id,])
predik <- predict(mod, newdata=Auto[-x$in_id,])
truth <- Auto[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth, response = predik)
mae <- mlr3measures::mae(truth = truth, response = predik)
metric <- c(rmse,mae)
names(metric) <- c("RMSE","MAE")
return(metric)
}
)
metric_poli
mean_metric_poli<- colMeans(metric_poli) # menghitung rata-rata untuk 10 folds
mean_metric_poli
}
)
polinomial<- cbind(derajat=derajat,polinomial)
Perbandingan RMSE dan MAE
datatable(polinomial)
Plot RMSE dan MAE
polinomial$derajat <- as.factor(polinomial$derajat)
prmse<-ggplot(data=polinomial, aes(x=derajat, y=RMSE, group=1)) +
geom_line(linetype = "dashed")+
geom_point()+
ggtitle("K-Fold Cross Validation") +
xlab("Degree of Polynomial") +
ylab("Root Mean Square Error")
polinomial$derajat <- as.factor(polinomial$derajat)
pmae<-ggplot(data=polinomial, aes(x=derajat, y=MAE, group=1)) +
geom_line(linetype = "dashed")+
geom_point()+
ggtitle("K-Fold Cross Validation") +
xlab("Degree of Polynomial") +
ylab("Mean Absolute Error")
plot_grid(prmse,pmae, labels=c("RMSE","MAE"), label_size = 6)
Plot Regresi Polinomial d=2 dan d=5
regpol2 <- ggplot(Auto,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="purple") +
stat_smooth(method = "lm",
formula = y~poly(x,2,raw=T),
lty = 1, col = "black",se = F)+
theme_bw()+
ggtitle("Regresi Polinomial Derajat 2") +
xlab("Horse Power") +
ylab("mile per gallon")
regpol5 <- ggplot(Auto,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="purple") +
stat_smooth(method = "lm",
formula = y~poly(x,5,raw=T),
lty = 1, col = "black",se = F)+
theme_bw()+
ggtitle("Regresi Polinomial Derajat 5") +
xlab("Horse Power") +
ylab("mile per gallon")
plot_grid(regpol2,regpol5)
Nilai RMSE dan MAE terendah terdapat pada regresi polinomial derajat 5, namun apabila dilihat dari plot regresi pada bagian ujung-ujung plot cenderung tidak teratur untuk regresi polinomial derajat 5. Oleh karena itu, regresi polinomial terbaik yaitu regresi polinomial derajat 2. Berikut merupakan summary dari model regresi polinomial derajat 2:
poli2 = lm(mpg ~ poly(horsepower,2), data = Auto)
summary(poli2)
##
## Call:
## lm(formula = mpg ~ poly(horsepower, 2), data = Auto)
##
## Residuals:
## Min 1Q Median 3Q Max
## -14.7135 -2.5943 -0.0859 2.2868 15.8961
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 23.4459 0.2209 106.13 <2e-16 ***
## poly(horsepower, 2)1 -120.1377 4.3739 -27.47 <2e-16 ***
## poly(horsepower, 2)2 44.0895 4.3739 10.08 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.374 on 389 degrees of freedom
## Multiple R-squared: 0.6876, Adjusted R-squared: 0.686
## F-statistic: 428 on 2 and 389 DF, p-value: < 2.2e-16
Piecewise Constant
set.seed(018)
val_silang <- vfold_cv(Auto,v=10,strata = "mpg")
breakk <- 2:11 #knots 1-10
pemilihanf.tangga <- map_dfr(breakk, function(i){
metric_ftangga <- map_dfr(val_silang$splits,
function(x){
datatrain <- Auto[x$in_id,]
datatrain$horsepower <- cut(datatrain$horsepower,i)
modelt <- lm(mpg ~ horsepower, data=datatrain)
labs_x <- levels(modelt$model[,2])
labs_x_b <- cbind(lower = as.numeric( sub("\\((.+),.*", "\\1", labs_x) ),
upper = as.numeric( sub("[^,]*,([^]]*)\\]", "\\1", labs_x)))
datatesting <- Auto[-x$in_id,]
horsepower_new <- cut(datatesting$horsepower,c(labs_x_b[1,1],labs_x_b[,2]))
predik <- predict(modelt, newdata=list(horsepower=horsepower_new))
truth <- datatesting$mpg
evaluasi <- na.omit(data.frame(truth,predik))
rmse <- mlr3measures::rmse(truth = evaluasi$truth,response = evaluasi$predik)
mae <- mlr3measures::mae(truth = evaluasi$truth,response = evaluasi$predik)
metric <- c(rmse,mae)
names(metric) <- c("RMSE","MAE")
return(metric)
}
)
metric_ftangga
mean_metricftangga <- colMeans(metric_ftangga)
mean_metricftangga
})
pemilihanf.tangga <- cbind(breakk=breakk,pemilihanf.tangga)
Perbandingan RMSE dan MAE
datatable(pemilihanf.tangga)
Plot RMSE dan MAE
pemilihanf.tangga$breakk <- as.factor(pemilihanf.tangga$breakk)
trmse<-ggplot(data=pemilihanf.tangga, aes(x=breakk, y=RMSE, group=1)) +
geom_line(linetype = "dashed")+
geom_point()+
ggtitle("K-Fold Cross Validation") +
xlab("Breaks") +
ylab("Root Mean Square Error")
pemilihanf.tangga$breakk <- as.factor(pemilihanf.tangga$breakk)
tmae<-ggplot(data=pemilihanf.tangga, aes(x=breakk, y=MAE, group=1)) +
geom_line(linetype = "dashed")+
geom_point()+
ggtitle("K-Fold Cross Validation") +
xlab("Breaks") +
ylab("Mean Absolute Error")
plot_grid(trmse,tmae)
Berdasarkan nilai RMSE model piecewise constant terbaik adalah piecewise constant dengan knots=7, sedangkan berdasarkan nilai MAE model terbaik adalah piecewise constant dengan knots=6.
Plot Piecewise Constant
ft7<- ggplot(Auto,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="red") +
stat_smooth(method = "lm",
formula = y~cut(x,7),
lty = 1, col = "black",se = F)+
theme_bw()+
ggtitle("Piecewise Constant with Knots=6") +
xlab("Horse Power") + ylab("mile per gallon")
ft8<- ggplot(Auto,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="red") +
stat_smooth(method = "lm",
formula = y~cut(x,8),
lty = 1, col = "black",se = F)+
theme_bw()+
ggtitle("Piecewise Constant with Knots=7") +
xlab("Horse Power") + ylab("mile per gallon")
plot_grid(ft7,ft8)
Apabila dilihat secara visual, plot dari piecewise constant dengan knots 7 memiliki kurva yang lebih mulus dibandingkan dengan knots 6, selain itu juga tidak terdapat pola yang tidak teratur pada plot dengan knots=7. Jadi, pada metode piecewise constant ini dipilih knots=7.
Berikut merupakan summary model piecewise constant dengan knots=6:
fungsitangga7 = lm(Auto$mpg ~ cut(Auto$horsepower,7)) #6 knots
summary(fungsitangga7)
##
## Call:
## lm(formula = Auto$mpg ~ cut(Auto$horsepower, 7))
##
## Residuals:
## Min 1Q Median 3Q Max
## -17.5280 -2.5530 -0.3758 2.2881 16.7482
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 32.5280 0.5069 64.17 <2e-16 ***
## cut(Auto$horsepower, 7)(72.3,98.6] -6.8162 0.6358 -10.72 <2e-16 ***
## cut(Auto$horsepower, 7)(98.6,125] -12.1523 0.7591 -16.01 <2e-16 ***
## cut(Auto$horsepower, 7)(125,151] -16.5763 0.7957 -20.83 <2e-16 ***
## cut(Auto$horsepower, 7)(151,177] -18.5020 1.0831 -17.08 <2e-16 ***
## cut(Auto$horsepower, 7)(177,204] -19.4447 1.4187 -13.71 <2e-16 ***
## cut(Auto$horsepower, 7)(204,230] -19.6280 1.5375 -12.77 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.59 on 385 degrees of freedom
## Multiple R-squared: 0.6594, Adjusted R-squared: 0.6541
## F-statistic: 124.2 on 6 and 385 DF, p-value: < 2.2e-16
Natural Cubic Splines
set.seed(018)
val_silang <- vfold_cv(Auto,v=10,strata = "mpg")
df <- 2:6 #knots 1-5
pemilihan.ncs <- map_dfr(df, function(i) {
metric_ncs<- map_dfr(val_silang$splits,
function(x) {
modelns <- lm(mpg ~ ns(horsepower,df=i), data=Auto[x$in_id,])
predik <- predict(modelns, newdata=Auto[-x$in_id,])
truth <- Auto[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth, response = predik)
mae <- mlr3measures::mae(truth = truth, response = predik)
metric <- c(rmse,mae)
names(metric) <- c("RMSE","MAE")
return(metric)
}
)
metric_ncs
# menghitung rata-rata untuk 10 folds
mean_metric_ncs <- colMeans(metric_ncs)
mean_metric_ncs
}
)
pemilihan.ncs <- cbind(df=df,pemilihan.ncs)
Perbandingan RMSE dan MAE
datatable(pemilihan.ncs)
Plot RMSE dan MAE
pemilihan.ncs$df <- as.factor(pemilihan.ncs$df)
nrmse<-ggplot(data=pemilihan.ncs, aes(x=df, y=RMSE, group=1)) +
geom_line(linetype = "dashed")+
geom_point()+
ggtitle("K-Fold Cross Validation") +
xlab("DF") + ylab("Root Mean Square Error")
nmae<-ggplot(data=pemilihan.ncs, aes(x=df, y=MAE, group=1)) +
geom_line(linetype = "dashed")+
geom_point()+
ggtitle("K-Fold Cross Validation") +
xlab("DF") + ylab("Mean Absolute Error")
plot_grid(nrmse,nmae)
Berdasarkan hasil di atas dapat disimpulkan bahwa Model Natural Cubic Splines terbaik adalah dengan df=6. Berikut merupakan knots dalam Natural Cubic Splines (df=6):
Knots
attr(ns(Auto$horsepower, df=6),"knots")
## 16.66667% 33.33333% 50% 66.66667% 83.33333%
## 70.0 84.0 93.5 110.0 150.0
Plot Natural Cubic Splines
plotncs <- ggplot(Auto,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="purple")+
stat_smooth(method = "lm",
formula = y~ns(x, knots=c(70.0, 84.0, 93.5, 110.0,150.0)),
lty = 1,col = "black", se=F)+
theme_bw()+
ggtitle("Natural Cubic Splines with df=6") +
xlab("Horse Power") +
ylab("mile per gallon") +
geom_vline(xintercept = c(70.0, 84.0, 93.5, 110.0,150.0), col="grey50", lty=2)
plotncs
ncspline = lm(Auto$mpg ~ ns(Auto$horsepower, knots = c(70.0, 84.0, 93.5, 110.0,150.0)))
summary(ncspline)
##
## Call:
## lm(formula = Auto$mpg ~ ns(Auto$horsepower, knots = c(70, 84,
## 93.5, 110, 150)))
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.9491 -2.6183 -0.1595 2.3508 15.1349
##
## Coefficients:
## Estimate Std. Error
## (Intercept) 34.738 1.509
## ns(Auto$horsepower, knots = c(70, 84, 93.5, 110, 150))1 -8.210 1.594
## ns(Auto$horsepower, knots = c(70, 84, 93.5, 110, 150))2 -13.046 1.835
## ns(Auto$horsepower, knots = c(70, 84, 93.5, 110, 150))3 -14.577 1.886
## ns(Auto$horsepower, knots = c(70, 84, 93.5, 110, 150))4 -22.802 1.624
## ns(Auto$horsepower, knots = c(70, 84, 93.5, 110, 150))5 -22.758 3.512
## ns(Auto$horsepower, knots = c(70, 84, 93.5, 110, 150))6 -20.849 1.742
## t value Pr(>|t|)
## (Intercept) 23.021 < 2e-16 ***
## ns(Auto$horsepower, knots = c(70, 84, 93.5, 110, 150))1 -5.149 4.18e-07 ***
## ns(Auto$horsepower, knots = c(70, 84, 93.5, 110, 150))2 -7.108 5.76e-12 ***
## ns(Auto$horsepower, knots = c(70, 84, 93.5, 110, 150))3 -7.730 9.50e-14 ***
## ns(Auto$horsepower, knots = c(70, 84, 93.5, 110, 150))4 -14.039 < 2e-16 ***
## ns(Auto$horsepower, knots = c(70, 84, 93.5, 110, 150))5 -6.480 2.81e-10 ***
## ns(Auto$horsepower, knots = c(70, 84, 93.5, 110, 150))6 -11.967 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.302 on 385 degrees of freedom
## Multiple R-squared: 0.7009, Adjusted R-squared: 0.6962
## F-statistic: 150.4 on 6 and 385 DF, p-value: < 2.2e-16
Perbandingan Model
nilai_rmse <- rbind(4.32756231703042,
4.41347139979508,
4.2917765537155)
nilai_mae<-rbind(3.2639553356593,
3.41081122407494,
3.24670654206857)
Metode <- c("Regresi Polinomial (d=2)",
"Piecewise Constant (knots=7)",
"Natural Cubic Splines (df=6)")
perbandingan <- data.frame(Metode,nilai_rmse, nilai_mae)
datatable(perbandingan)
ggplot(Auto,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="black") +
stat_smooth(method = "lm",
formula = y~poly(x,2,raw=T),
lty = 1, aes(col = "Regresi Polynomial"), col = "red",se = F)+
stat_smooth(method = "lm",
formula = y~cut(x,8),
lty = 1, aes(col = "Piecewise Constant"), col = "black",se = F)+
stat_smooth(method = "lm",
formula = y~ns(x, df=6),
lty = 1,aes(col = "Natural Cubic Spline"), col = "green", se=F)+
scale_color_manual(values = c("Regresi Polynomial"="red","Piecewise Constant"="black","Natural Cubic Spline"="green"))+theme_bw()
Berdasarkan nilai RMSE dan MAE dapat disimpulkan bahwa model terbaik untuk data Auto dengan peubah respon mpg
dan peubah penjelas horsepower
adalah Natural Cubic Splines dengan df=6.
Pemodelan mpg vs horsepower (Amerika)
Data
dataAmerika = Auto[Auto$origin==1,] %>% select(mpg, horsepower, origin)
datatable(dataAmerika)
ggplot(dataAmerika,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="navy") +
theme_bw() +
labs(y="Miles per Gallon", x="Horse Power", title = "Scatter Plot mpg dan horsepower",
subtitle = "Amerika")
Regresi Polinomial
set.seed(018)
val_silang <- vfold_cv(data = dataAmerika,v=10)
derajat <- 1:5
polinomial1 <- map_dfr(derajat, function(i) {
metric_poli <- map_dfr(val_silang$splits,
function(x) {
mod <- lm(mpg ~ poly(horsepower,derajat=i), data=dataAmerika[x$in_id,])
predik <- predict(mod, newdata=dataAmerika[-x$in_id,])
truth <- dataAmerika[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth, response = predik)
mae <- mlr3measures::mae(truth = truth, response = predik)
metric <- c(rmse,mae)
names(metric) <- c("RMSE","MAE")
return(metric)
}
)
metric_poli
mean_metric_poli<- colMeans(metric_poli) # menghitung rata-rata untuk 10 folds
mean_metric_poli
}
)
polinomial1<- cbind(derajat=derajat,polinomial1)
Perbandingan RMSE dan MAE untuk setiap derajat Regresi Polinomial
datatable(polinomial1)
Plot RMSE dan MAE
polinomial1$derajat <- as.factor(polinomial1$derajat)
prmse1<-ggplot(data=polinomial1, aes(x=derajat, y=RMSE, group=1)) +
geom_line(linetype = "dashed")+
geom_point()+
ggtitle("K-Fold Cross Validation") +
xlab("Degree of Polynomial") +
ylab("Root Mean Square Error")
polinomial1$derajat <- as.factor(polinomial1$derajat)
pmae1<-ggplot(data=polinomial1, aes(x=derajat, y=MAE, group=1)) +
geom_line(linetype = "dashed")+
geom_point()+
ggtitle("K-Fold Cross Validation") +
xlab("Degree of Polynomial") +
ylab("Mean Absolute Error")
plot_grid(prmse1,pmae1)
Berdasarkan nilai RMSE dan MAE dapat disimpulkan bahwa regresi polinomial terbaik adalah dengan derajat 2.
Plot Regresi Polinomial Derajat 2
regpol12 <- ggplot(dataAmerika,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="purple") +
stat_smooth(method = "lm",
formula = y~poly(x,2,raw=T),
lty = 1, col = "black",se = F)+
theme_bw()+
ggtitle("Regresi Polinomial Derajat 2") +
xlab("Horse Power") +
ylab("mile per gallon")
regpol12
poli12 = lm(mpg ~ poly(horsepower,2), data = dataAmerika)
summary(poli12)
##
## Call:
## lm(formula = mpg ~ poly(horsepower, 2), data = dataAmerika)
##
## Residuals:
## Min 1Q Median 3Q Max
## -12.9497 -2.5745 -0.0895 2.1583 13.1866
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 20.0335 0.2442 82.04 < 2e-16 ***
## poly(horsepower, 2)1 -75.6095 3.8223 -19.78 < 2e-16 ***
## poly(horsepower, 2)2 29.4684 3.8223 7.71 3.26e-13 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.822 on 242 degrees of freedom
## Multiple R-squared: 0.6507, Adjusted R-squared: 0.6478
## F-statistic: 225.4 on 2 and 242 DF, p-value: < 2.2e-16
Piecewise Constant
set.seed(018)
val_silang <- vfold_cv(dataAmerika,v=10,strata = "mpg")
breakk <- 2:11 #knots 1-10
pemilihanf.tangga1 <- map_dfr(breakk, function(i){
metric_ftangga <- map_dfr(val_silang$splits,
function(x){
datatrain <- dataAmerika[x$in_id,]
datatrain$horsepower <- cut(datatrain$horsepower,i)
modelt <- lm(mpg ~ horsepower, data=datatrain)
labs_x <- levels(modelt$model[,2])
labs_x_b <- cbind(lower = as.numeric( sub("\\((.+),.*", "\\1", labs_x) ),
upper = as.numeric( sub("[^,]*,([^]]*)\\]", "\\1", labs_x)))
datatesting <- dataAmerika[-x$in_id,]
horsepower_new <- cut(datatesting$horsepower,c(labs_x_b[1,1],labs_x_b[,2]))
predik <- predict(modelt, newdata=list(horsepower=horsepower_new))
truth <- datatesting$mpg
evaluasi <- na.omit(data.frame(truth,predik))
rmse <- mlr3measures::rmse(truth = evaluasi$truth,response = evaluasi$predik)
mae <- mlr3measures::mae(truth = evaluasi$truth,response = evaluasi$predik)
metric <- c(rmse,mae)
names(metric) <- c("RMSE","MAE")
return(metric)
}
)
metric_ftangga
mean_metricftangga <- colMeans(metric_ftangga)
mean_metricftangga
})
pemilihanfs.tangga1 <- cbind(breakk=breakk,pemilihanf.tangga1)
Perbandingan RMSE dan MAE
datatable(pemilihanfs.tangga1)
Plot RMSE dan MAE
pemilihanfs.tangga1$breakk <- as.factor(pemilihanfs.tangga1$breakk)
trmse1<-ggplot(data=pemilihanfs.tangga1, aes(x=breakk, y=RMSE, group=1)) +
geom_line(linetype = "dashed")+
geom_point()+
ggtitle("K-Fold Cross Validation") +
xlab("Breaks") +
ylab("Root Mean Square Error")
pemilihanfs.tangga1$breakk <- as.factor(pemilihanfs.tangga1$breakk)
tmae1<-ggplot(data=pemilihanfs.tangga1, aes(x=breakk, y=MAE, group=1)) +
geom_line(linetype = "dashed")+
geom_point()+
ggtitle("K-Fold Cross Validation") +
xlab("Breaks") +
ylab("Mean Absolute Error")
plot_grid(trmse1,tmae1)
Berdasarkan nilai RMSE dan MAE dapat disimpulkan bahwa model piecewise constant terbaik adalah dengan knots=8.
Plot Piecewise Constant
ft9<- ggplot(dataAmerika,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="red") +
stat_smooth(method = "lm",
formula = y~cut(x,9),
lty = 1, col = "black",se = F)+
theme_bw()+
ggtitle("Piecewise Constant with Knots=8") +
xlab("Horse Power") + ylab("mile per gallon")
ft9
fungsitangga9 = lm(dataAmerika$mpg ~ cut(dataAmerika$horsepower,9))
summary(fungsitangga9)
##
## Call:
## lm(formula = dataAmerika$mpg ~ cut(dataAmerika$horsepower, 9))
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.946 -1.946 -0.375 2.077 13.054
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 32.5933 0.9521 34.23 < 2e-16
## cut(dataAmerika$horsepower, 9)(71.8,91.6] -7.6477 1.0519 -7.27 5.24e-12
## cut(dataAmerika$horsepower, 9)(91.6,111] -12.6916 1.0682 -11.88 < 2e-16
## cut(dataAmerika$horsepower, 9)(111,131] -13.8267 1.3465 -10.27 < 2e-16
## cut(dataAmerika$horsepower, 9)(131,151] -17.1706 1.1025 -15.57 < 2e-16
## cut(dataAmerika$horsepower, 9)(151,171] -18.3933 1.2892 -14.27 < 2e-16
## cut(dataAmerika$horsepower, 9)(171,190] -18.9010 1.3973 -13.53 < 2e-16
## cut(dataAmerika$horsepower, 9)(190,210] -21.2600 1.7813 -11.94 < 2e-16
## cut(dataAmerika$horsepower, 9)(210,230] -19.2183 1.6144 -11.90 < 2e-16
##
## (Intercept) ***
## cut(dataAmerika$horsepower, 9)(71.8,91.6] ***
## cut(dataAmerika$horsepower, 9)(91.6,111] ***
## cut(dataAmerika$horsepower, 9)(111,131] ***
## cut(dataAmerika$horsepower, 9)(131,151] ***
## cut(dataAmerika$horsepower, 9)(151,171] ***
## cut(dataAmerika$horsepower, 9)(171,190] ***
## cut(dataAmerika$horsepower, 9)(190,210] ***
## cut(dataAmerika$horsepower, 9)(210,230] ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.688 on 236 degrees of freedom
## Multiple R-squared: 0.6829, Adjusted R-squared: 0.6722
## F-statistic: 63.53 on 8 and 236 DF, p-value: < 2.2e-16
Natural Cubic Splines
set.seed(018)
val_silang <- vfold_cv(dataAmerika,v=10,strata = "mpg")
df <- 2:6
pemilihan.ncs1 <- map_dfr(df, function(i) {
metric_ncs<- map_dfr(val_silang$splits,
function(x) {
modelns <- lm(mpg ~ ns(horsepower,df=i), data=dataAmerika[x$in_id,])
predik <- predict(modelns, newdata=dataAmerika[-x$in_id,])
truth <- dataAmerika[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth, response = predik)
mae <- mlr3measures::mae(truth = truth, response = predik)
metric <- c(rmse,mae)
names(metric) <- c("RMSE","MAE")
return(metric)
}
)
metric_ncs
# menghitung rata-rata untuk 10 folds
mean_metric_ncs <- colMeans(metric_ncs)
mean_metric_ncs
}
)
pemilihan.ncs1 <- cbind(df=df,pemilihan.ncs1)
Perbandingan RMSE dan MAE
datatable(pemilihan.ncs1)
Plot RMSE
pemilihan.ncs1$df <- as.factor(pemilihan.ncs1$df)
nrmse1<-ggplot(data=pemilihan.ncs1, aes(x=df, y=RMSE, group=1)) +
geom_line(linetype = "dashed")+
geom_point()+
ggtitle("K-Fold Cross Validation") +
xlab("DF") + ylab("Root Mean Square Error")
pemilihan.ncs1$df <- as.factor(pemilihan.ncs1$df)
nmae1<-ggplot(data=pemilihan.ncs1, aes(x=df, y=MAE, group=1)) +
geom_line(linetype = "dashed")+
geom_point()+
ggtitle("K-Fold Cross Validation") +
xlab("DF") + ylab("Mean Absolute Error")
plot_grid(nrmse1,nmae1)
Berdasarkan nilai RMSE dan MAE dapat disimpulkan bahwa model natural cubic splines terbaik adalah dengan df=5.
Knots
attr(ns(dataAmerika$horsepower, df=5),"knots")
## 20% 40% 60% 80%
## 85.0 99.2 122.0 150.0
Plot Natural Cubic Splines
plotncs1 <- ggplot(dataAmerika,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="purple")+
stat_smooth(method = "lm",
formula = y~ns(x, knots = c(72,88,100,140)),
lty = 1,col = "black", se=F)+
theme_bw()+
ggtitle("Natural Cubic Splines with df=5") +
xlab("Horse Power") +
ylab("mile per gallon") +
geom_vline(xintercept = c(72,88,100,140), col="grey50", lty=2)
plotncs1
ncspline51 = lm(dataAmerika$mpg ~ ns(dataAmerika$horsepower, knots = c(72,88,100,140)))
summary(ncspline51)
##
## Call:
## lm(formula = dataAmerika$mpg ~ ns(dataAmerika$horsepower, knots = c(72,
## 88, 100, 140)))
##
## Residuals:
## Min 1Q Median 3Q Max
## -13.933 -2.229 -0.347 2.029 13.181
##
## Coefficients:
## Estimate Std. Error
## (Intercept) 32.702 2.898
## ns(dataAmerika$horsepower, knots = c(72, 88, 100, 140))1 -8.755 2.767
## ns(dataAmerika$horsepower, knots = c(72, 88, 100, 140))2 -13.890 3.190
## ns(dataAmerika$horsepower, knots = c(72, 88, 100, 140))3 -18.333 1.898
## ns(dataAmerika$horsepower, knots = c(72, 88, 100, 140))4 -22.082 6.399
## ns(dataAmerika$horsepower, knots = c(72, 88, 100, 140))5 -18.666 1.756
## t value Pr(>|t|)
## (Intercept) 11.285 < 2e-16 ***
## ns(dataAmerika$horsepower, knots = c(72, 88, 100, 140))1 -3.164 0.00176 **
## ns(dataAmerika$horsepower, knots = c(72, 88, 100, 140))2 -4.353 1.99e-05 ***
## ns(dataAmerika$horsepower, knots = c(72, 88, 100, 140))3 -9.660 < 2e-16 ***
## ns(dataAmerika$horsepower, knots = c(72, 88, 100, 140))4 -3.451 0.00066 ***
## ns(dataAmerika$horsepower, knots = c(72, 88, 100, 140))5 -10.630 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.814 on 239 degrees of freedom
## Multiple R-squared: 0.6565, Adjusted R-squared: 0.6493
## F-statistic: 91.35 on 5 and 239 DF, p-value: < 2.2e-16
Perbandingan Model
nilai_rmse <- rbind(polinomial1 %>% select(-1) %>% slice_min(RMSE),
pemilihanfs.tangga1 %>% select(-1) %>% slice_min(RMSE),
pemilihan.ncs1 %>% select(-1)%>% slice_min(RMSE)
)
Metode <- c("Regresi Polinomial (d=2)",
"Piecewise Constant (knots=8)",
"Natural Cubic Splines (df=5)")
perbandingan1 <- data.frame(Metode,nilai_rmse)
datatable(perbandingan1)
ggplot(dataAmerika,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="black") +
stat_smooth(method = "lm",
formula = y~poly(x,2,raw=T),
lty = 1, aes(col = "Regresi Polynomial"), col = "red",se = F)+
stat_smooth(method = "lm",
formula = y~cut(x,9),
lty = 1, aes(col = "Piecewise Constant"), col = "black",se = F)+
stat_smooth(method = "lm",
formula = y~ns(x, df=5),
lty = 1,aes(col = "Natural Cubic Spline"), col = "green", se=F)+
scale_color_manual(values = c("Regresi Polynomial"="coral","Piecewise Constant"="navy","Natural Cubic Spline"="green"))+theme_bw()
Berdasarkan nilai RMSE dapat disimpulkan bahwa model terbaik untuk memodelkan Data Auto dengan peubah respon mpg
dan peubah penjelas horsepower
dari kendaraan yang berasal dari Amerika adalah model Piecewise Constant dengan knots =6, sedangkan berdasarkan nilai MAE model terbaiknya adalah Natural Cubic Splines dengan df=6.
Pemodelan mpg vs horsepower (Eropa)
Data
dataEropa = Auto[Auto$origin==2,] %>% select(mpg, horsepower, origin)
datatable(dataEropa)
ggplot(dataEropa,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="red") +
theme_bw() +
labs(y="Miles per Gallon", x="Horse Power", title = "Scatter Plot mpg dan horsepower",
subtitle = "Eropa")
Regresi Polinomial
set.seed(018)
val_silang <- vfold_cv(data = dataEropa,v=10,strata = "mpg")
## Warning: The number of observations in each quantile is below the recommended
## threshold of 20. Stratification will be done with 3 breaks instead.
derajat <- 1:5
polinomial2 <- map_dfr(derajat, function(i) {
metric_poli <- map_dfr(val_silang$splits,
function(x) {
mod <- lm(mpg ~ poly(horsepower,derajat=i), data=dataEropa[x$in_id,])
predik <- predict(mod, newdata=dataEropa[-x$in_id,])
truth <- dataEropa[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth, response = predik)
mae <- mlr3measures::mae(truth = truth, response = predik)
metric <- c(rmse,mae)
names(metric) <- c("RMSE","MAE")
return(metric)
}
)
metric_poli
mean_metric_poli<- colMeans(metric_poli) # menghitung rata-rata untuk 10 folds
mean_metric_poli
}
)
polinomial2<- cbind(derajat=derajat,polinomial2)
Perbandingan RMSE dan MAE
datatable(polinomial2)
Plot RMSE dan MAE
polinomial2$derajat <- as.factor(polinomial2$derajat)
nrmse2<-ggplot(data=polinomial2, aes(x=derajat, y=RMSE, group=1)) +
geom_line(linetype = "dashed")+
geom_point()+
ggtitle("K-Fold Cross Validation") +
xlab("Degree of Polynomial") +
ylab("Root Mean Square Error")
polinomial2$derajat <- as.factor(polinomial2$derajat)
nmae2<-ggplot(data=polinomial2, aes(x=derajat, y=MAE, group=1)) +
geom_line(linetype = "dashed")+
geom_point()+
ggtitle("K-Fold Cross Validation") +
xlab("Degree of Polynomial") +
ylab("Mean Absolute Error")
plot_grid(nrmse2,nmae2)
Berdasarkan nilai RMSE dan MAE dapat disimpulkan bahwa model Regresi Polinomial terbaik adalah Regresi Polinomial derajat 1.
Plot Regresi Polinomial derajat 1
regpol12 <- ggplot(dataEropa,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="purple") +
stat_smooth(method = "lm",
formula = y~poly(x,1,raw=T),
lty = 1, col = "black",se = F)+
theme_bw()+
ggtitle("Regresi Polinomial d=1") +
xlab("Horse Power") +
ylab("mile per gallon")
regpol12
poli1 = lm(mpg ~ poly(horsepower,1), data = dataEropa)
summary(poli1)
##
## Call:
## lm(formula = mpg ~ poly(horsepower, 1), data = dataEropa)
##
## Residuals:
## Min 1Q Median 3Q Max
## -10.4946 -2.8387 -0.4171 2.2148 12.8858
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 27.6029 0.5898 46.800 < 2e-16 ***
## poly(horsepower, 1) -36.6027 4.8637 -7.526 1.87e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.864 on 66 degrees of freedom
## Multiple R-squared: 0.4618, Adjusted R-squared: 0.4537
## F-statistic: 56.64 on 1 and 66 DF, p-value: 1.869e-10
Piecewise Constant
set.seed(018)
val_silang <- vfold_cv(dataEropa,v=10, strata = "mpg")
## Warning: The number of observations in each quantile is below the recommended
## threshold of 20. Stratification will be done with 3 breaks instead.
breakk <- 2:9 #knots 1-8
pemilihanf.tangga2 <- map_dfr(breakk, function(i){
metric_ftangga <- map_dfr(val_silang$splits,
function(x){
datatrain2 <- dataEropa[x$in_id,]
datatrain2$horsepower <- cut(datatrain2$horsepower,i)
modelt <- lm(mpg ~ horsepower, data=datatrain2)
labs_x <- levels(modelt$model[,2])
labs_x_b <- cbind(lower = as.numeric( sub("\\((.+),.*", "\\1", labs_x) ),
upper = as.numeric( sub("[^,]*,([^]]*)\\]", "\\1", labs_x)))
datatesting2 <- dataEropa[-x$in_id,]
horsepower_new <- cut(datatesting2$horsepower,c(labs_x_b[1,1],labs_x_b[,2]))
predik <- predict(modelt, newdata=list(horsepower=horsepower_new))
truth <- datatesting2$mpg
evaluasi <- na.omit(data.frame(truth,predik))
rmse <- mlr3measures::rmse(truth = evaluasi$truth,response = evaluasi$predik)
mae <- mlr3measures::mae(truth = evaluasi$truth,response = evaluasi$predik)
metric <- c(rmse,mae)
names(metric) <- c("RMSE","MAE")
return(metric)
}
)
metric_ftangga
mean_metricftangga <- colMeans(metric_ftangga)
mean_metricftangga
})
pemilihanfs.tangga2 <- cbind(breakk=breakk,pemilihanf.tangga2)
Perbandingan RMSE dan MAE
datatable(pemilihanfs.tangga2)
Plot RMSE dan MAE
pemilihanfs.tangga2$breakk <- as.factor(pemilihanfs.tangga2$breakk)
trmse2<-ggplot(data=pemilihanfs.tangga2, aes(x=breakk, y=RMSE, group=1)) +
geom_line(linetype = "dashed")+
geom_point()+
ggtitle("K-Fold Cross Validation") +
xlab("Breaks") +
ylab("Root Mean Square Error")
pemilihanfs.tangga2$breakk <- as.factor(pemilihanfs.tangga2$breakk)
tmae2<-ggplot(data=pemilihanfs.tangga2, aes(x=breakk, y=MAE, group=1)) +
geom_line(linetype = "dashed")+
geom_point()+
ggtitle("K-Fold Cross Validation") +
xlab("Breaks") +
ylab("Mean Absolute Error")
plot_grid(trmse2,tmae2)
Berdasarkan nilai RMSE model piecewise constant terbaik adalah piecewise constant dengan knots=1, sedangkan berdasarkan nilai MAE model terbaik adalah piecewise constant dengan knots=6.
Plot Piecewise Constant
ft12<- ggplot(dataEropa,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="red") +
stat_smooth(method = "lm",
formula = y~cut(x,2),
lty = 1, col = "black",se = F)+
theme_bw()+
ggtitle("Piecewise Constant with Knots=1") +
xlab("Horse Power") + ylab("mile per gallon")
ft62<- ggplot(dataEropa,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="red") +
stat_smooth(method = "lm",
formula = y~cut(x,7),
lty = 1, col = "black",se = F)+
theme_bw()+
ggtitle("Piecewise Constant with Knots=6") +
xlab("Horse Power") + ylab("mile per gallon")
plot_grid(ft12,ft62)
Apabila dilihat secara visual, plot dari piecewise constant dengan knots 6 memiliki kurva yang lebih mulus dibandingkan dengan knots 1, selain itu juga tidak terdapat pola yang tidak teratur pada plot dengan knots=6. Jadi, pada metode piecewise constant ini dipilih knots=6.
fungsitangga12 = lm(dataEropa$mpg ~ cut(dataEropa$horsepower,2))
summary(fungsitangga12)
##
## Call:
## lm(formula = dataEropa$mpg ~ cut(dataEropa$horsepower, 2))
##
## Residuals:
## Min 1Q Median 3Q Max
## -10.8755 -3.8755 -0.8755 2.4745 14.4245
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 29.8755 0.7855 38.035 < 2e-16 ***
## cut(dataEropa$horsepower, 2)(89.5,133] -8.1334 1.4860 -5.473 7.35e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.498 on 66 degrees of freedom
## Multiple R-squared: 0.3122, Adjusted R-squared: 0.3018
## F-statistic: 29.96 on 1 and 66 DF, p-value: 7.352e-07
Natural Cubic Splines
set.seed(018)
val_silang <- vfold_cv(dataEropa,v=10,strata = "mpg")
## Warning: The number of observations in each quantile is below the recommended
## threshold of 20. Stratification will be done with 3 breaks instead.
df <- 2:6
pemilihan.ncs2<- map_dfr(df, function(i) {
metric_ncs<- map_dfr(val_silang$splits,
function(x) {
modelns <- lm(mpg ~ ns(horsepower,df=i), data=dataEropa[x$in_id,])
predik <- predict(modelns, newdata=dataEropa[-x$in_id,])
truth <- dataEropa[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth, response = predik)
mae <- mlr3measures::mae(truth = truth, response = predik)
metric <- c(rmse,mae)
names(metric) <- c("RMSE","MAE")
return(metric)
}
)
metric_ncs
# menghitung rata-rata untuk 10 folds
mean_metric_ncs <- colMeans(metric_ncs)
mean_metric_ncs
}
)
pemilihan.ncs2 <- cbind(df=df,pemilihan.ncs2)
Perbandingan RMSE dan MAE
datatable(pemilihan.ncs2)
Plot RMSE dan MAE
pemilihan.ncs2$df <- as.factor(pemilihan.ncs2$df)
nrmse2<-ggplot(data=pemilihan.ncs2, aes(x=df, y=RMSE, group=1)) +
geom_line(linetype = "dashed")+
geom_point()+
ggtitle("K-Fold Cross Validation") +
xlab("DF") + ylab("Root Mean Square Error")
pemilihan.ncs2$df <- as.factor(pemilihan.ncs2$df)
nmae2<-ggplot(data=pemilihan.ncs2, aes(x=df, y=MAE, group=1)) +
geom_line(linetype = "dashed")+
geom_point()+
ggtitle("K-Fold Cross Validation") +
xlab("DF") + ylab("Mean Absolute Error")
plot_grid(nrmse2,nmae2)
Berdasarkan nilai RMSE dan MAE model natural cubic splines terbaik adalah natural cubic splines dengan df=2.
Knots
attr(ns(dataEropa$horsepower, df=2),"knots")
## 50%
## 76.5
Plot Natural Cubic Splines
plotncs2 <- ggplot(dataEropa,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="purple")+
stat_smooth(method = "lm",
formula = y~ns(x, knots = 76.5),
lty = 1,col = "black", se=F)+
theme_bw()+
ggtitle("Natural Cubic Splines with DF=2") +
xlab("Horse Power") +
ylab("mile per gallon") +
geom_vline(xintercept = c(76.5), col="grey50", lty=2)
plotncs2
ncspline22 = lm(dataEropa$mpg ~ ns(dataEropa$horsepower, knots = 76.5))
summary(ncspline22)
##
## Call:
## lm(formula = dataEropa$mpg ~ ns(dataEropa$horsepower, knots = 76.5))
##
## Residuals:
## Min 1Q Median 3Q Max
## -10.6482 -2.7674 -0.3836 2.2852 12.9801
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 35.550 1.739 20.443 < 2e-16
## ns(dataEropa$horsepower, knots = 76.5)1 -21.269 3.807 -5.587 4.90e-07
## ns(dataEropa$horsepower, knots = 76.5)2 -15.549 2.720 -5.717 2.95e-07
##
## (Intercept) ***
## ns(dataEropa$horsepower, knots = 76.5)1 ***
## ns(dataEropa$horsepower, knots = 76.5)2 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.899 on 65 degrees of freedom
## Multiple R-squared: 0.4622, Adjusted R-squared: 0.4457
## F-statistic: 27.93 on 2 and 65 DF, p-value: 1.756e-09
Perbandingan Model
nilai_rmse2 <- rbind(polinomial2 %>% select(-1) %>% slice_min(RMSE),
pemilihanfs.tangga2 %>% select(-1) %>% slice_min(MAE),
pemilihan.ncs2%>% select(-1)%>% slice_min(RMSE)
)
Metode <- c("Regresi Polinomial (d=1)",
"Piecewise Constant (Knots=6)",
"Natural Cubic Splines (df=2)")
perbandingan2 <- data.frame(Metode,nilai_rmse2)
datatable(perbandingan2)
ggplot(dataEropa,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="black") +
stat_smooth(method = "lm",
formula = y~poly(x,2,raw=T),
lty = 1, aes(col = "Regresi Polynomial"), col = "red",se = F)+
stat_smooth(method = "lm",
formula = y~cut(x,5),
lty = 1, aes(col = "Piecewise Constant"), col = "black",se = F)+
stat_smooth(method = "lm",
formula = y~ns(x, df=2),
lty = 1,aes(col = "Natural Cubic Spline"), col = "green", se=F)+
scale_color_manual(values = c("Regresi Polynomial"="red","Piecewise Constant"="black","Natural Cubic Spline"="green"))+theme_bw()
Berdasarkan nilai RMSE dan MAE model terbaik untuk memodelkan data Auto dengan peubah respon mpg
dan peubah penjelas horsepower
dari kendaraan yang berasal dari Eropa adalah model Regresi polinomial derajat 1.
Pemodelan mpg vs horsepower (Jepang)
Data
dataJepang = Auto[Auto$origin==3,] %>% select(mpg, horsepower, origin)
datatable(dataJepang)
ggplot(dataJepang,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="purple") +
theme_bw() +
labs(y="Miles per Gallon", x="Horse Power", title = "Scatter Plot mpg dan horsepower",
subtitle = "Jepang")
Regresi Polinomial
set.seed(018)
val_silang <- vfold_cv(data = dataJepang,v=10,strata = "mpg")
## Warning: The number of observations in each quantile is below the recommended
## threshold of 20. Stratification will be done with 3 breaks instead.
derajat <- 1:5
polinomial3 <- map_dfr(derajat, function(i) {
metric_poli <- map_dfr(val_silang$splits,
function(x) {
mod <- lm(mpg ~ poly(horsepower,derajat=i), data=dataJepang[x$in_id,])
predik <- predict(mod, newdata=dataJepang[-x$in_id,])
truth <- dataJepang[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth, response = predik)
mae <- mlr3measures::mae(truth = truth, response = predik)
metric <- c(rmse,mae)
names(metric) <- c("RMSE","MAE")
return(metric)
}
)
metric_poli
mean_metric_poli<- colMeans(metric_poli) # menghitung rata-rata untuk 10 folds
mean_metric_poli
}
)
polinomial3<- cbind(derajat=derajat,polinomial3)
Perbandingan RMSE dan MAE
datatable(polinomial3)
Plot RMSE dan MAE
polinomial3$derajat <- as.factor(polinomial3$derajat)
prmse3<-ggplot(data=polinomial3, aes(x=derajat, y=RMSE, group=1)) +
geom_line(linetype = "dashed")+
geom_point()+
ggtitle("K-Fold Cross Validation") +
xlab("Degree of Polynomial") +
ylab("Root Mean Square Error")
polinomial3$derajat <- as.factor(polinomial3$derajat)
pmae3<-ggplot(data=polinomial3, aes(x=derajat, y=MAE, group=1)) +
geom_line(linetype = "dashed")+
geom_point()+
ggtitle("K-Fold Cross Validation") +
xlab("Degree of Polynomial") +
ylab("Mean Absolute Error")
plot_grid(prmse3,pmae3)
Berdasarkan nilai RMSE dan MAE dapat disimpulkan bahwa model regresi polinomial terbaik adalah regresi poilinomial derajat 3.
Plot Regresi Polinomial derajat 3
regpol33 <- ggplot(dataJepang,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="purple") +
stat_smooth(method = "lm",
formula = y~poly(x,3,raw=T),
lty = 1, col = "black",se = F)+
theme_bw()+
ggtitle("Regresi Polinomial d=3") +
xlab("Horse Power") +
ylab("mile per gallon")
regpol33
poli33 = lm(mpg ~ poly(horsepower,3), data = dataJepang)
summary(poli33)
##
## Call:
## lm(formula = mpg ~ poly(horsepower, 3), data = dataJepang)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.8602 -2.7115 -0.5224 2.1985 11.6985
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 30.4506 0.4536 67.138 < 2e-16 ***
## poly(horsepower, 3)1 -36.2030 4.0312 -8.981 1.62e-13 ***
## poly(horsepower, 3)2 9.1966 4.0312 2.281 0.0254 *
## poly(horsepower, 3)3 16.6991 4.0312 4.142 8.92e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.031 on 75 degrees of freedom
## Multiple R-squared: 0.5787, Adjusted R-squared: 0.5618
## F-statistic: 34.34 on 3 and 75 DF, p-value: 4.485e-14
Piecewise Constant
set.seed(018)
val_silang <- vfold_cv(dataJepang,v=10,strata = "mpg")
## Warning: The number of observations in each quantile is below the recommended
## threshold of 20. Stratification will be done with 3 breaks instead.
breakk <- 2:5
pemilihanf.tangga3 <- map_dfr(breakk, function(i){
metric_ftangga <- map_dfr(val_silang$splits,
function(x){
datatrain <- dataJepang[x$in_id,]
datatrain$horsepower <- cut(datatrain$horsepower,i)
modelt <- lm(mpg ~ horsepower, data=datatrain)
labs_x <- levels(modelt$model[,2])
labs_x_b <- cbind(lower = as.numeric( sub("\\((.+),.*", "\\1", labs_x) ),
upper = as.numeric( sub("[^,]*,([^]]*)\\]", "\\1", labs_x)))
datatesting <- dataJepang[-x$in_id,]
horsepower_new <- cut(datatesting$horsepower,c(labs_x_b[1,1],labs_x_b[,2]))
predik <- predict(modelt, newdata=list(horsepower=horsepower_new))
truth <- datatesting$mpg
evaluasi <- na.omit(data.frame(truth,predik))
rmse <- mlr3measures::rmse(truth = evaluasi$truth,response = evaluasi$predik)
mae <- mlr3measures::mae(truth = evaluasi$truth,response = evaluasi$predik)
metric <- c(rmse,mae)
names(metric) <- c("RMSE","MAE")
return(metric)
}
)
metric_ftangga
mean_metricftangga <- colMeans(metric_ftangga)
mean_metricftangga
})
pemilihanfs.tangga3 <- cbind(breakk=breakk,pemilihanf.tangga3)
Perbandingan RMSE dan MAE
datatable(pemilihanfs.tangga3)
Plot RMSE dan MAE
pemilihanfs.tangga3$breakk <- as.factor(pemilihanfs.tangga3$breakk)
trmse3<-ggplot(data=pemilihanfs.tangga3, aes(x=breakk, y=RMSE, group=1)) +
geom_line(linetype = "dashed")+
geom_point()+
ggtitle("K-Fold Cross Validation") +
xlab("Breaks") +
ylab("Root Mean Square Error")
pemilihanfs.tangga3$breakk <- as.factor(pemilihanfs.tangga3$breakk)
tmae3<-ggplot(data=pemilihanfs.tangga3, aes(x=breakk, y=MAE, group=1)) +
geom_line(linetype = "dashed")+
geom_point()+
ggtitle("K-Fold Cross Validation") +
xlab("Breaks") +
ylab("Mean Absolute Error")
plot_grid(trmse3,tmae3)
Berdasarkan nilai RMSE dan MAE dapat disimpulkan bahwa model Piecewise Constant terbaik adalah Piecewise Constant dengan knots=2.
Plot Fungsi Tangga
ft33<- ggplot(dataJepang,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="red") +
stat_smooth(method = "lm",
formula = y~cut(x,3),
lty = 1, col = "black",se = F)+
theme_bw()+
ggtitle("Piecewise Constant with Knots=2") +
xlab("Horse Power") + ylab("mile per gallon")
ft33
fungsitangga33 = lm(dataJepang$mpg ~ cut(dataJepang$horsepower,3))
summary(fungsitangga33)
##
## Call:
## lm(formula = dataJepang$mpg ~ cut(dataJepang$horsepower, 3))
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.08 -2.53 -1.08 1.97 12.52
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 34.0804 0.6381 53.409 < 2e-16
## cut(dataJepang$horsepower, 3)(78.7,105] -8.3360 1.0492 -7.945 1.40e-11
## cut(dataJepang$horsepower, 3)(105,132] -10.2804 1.8785 -5.473 5.47e-07
##
## (Intercept) ***
## cut(dataJepang$horsepower, 3)(78.7,105] ***
## cut(dataJepang$horsepower, 3)(105,132] ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.328 on 76 degrees of freedom
## Multiple R-squared: 0.508, Adjusted R-squared: 0.495
## F-statistic: 39.23 on 2 and 76 DF, p-value: 1.979e-12
Natural Cubic Splines
set.seed(018)
val_silang <- vfold_cv(dataJepang,v=10,strata = "mpg")
## Warning: The number of observations in each quantile is below the recommended
## threshold of 20. Stratification will be done with 3 breaks instead.
df <- 2:6
pemilihan.ncs3<- map_dfr(df, function(i) {
metric_ncs<- map_dfr(val_silang$splits,
function(x) {
modelns <- lm(mpg ~ ns(horsepower,df=i), data=dataJepang[x$in_id,])
predik <- predict(modelns, newdata=dataJepang[-x$in_id,])
truth <- dataJepang[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth, response = predik)
mae <- mlr3measures::mae(truth = truth, response = predik)
metric <- c(rmse,mae)
names(metric) <- c("RMSE","MAE")
return(metric)
}
)
metric_ncs
# menghitung rata-rata untuk 10 folds
mean_metric_ncs <- colMeans(metric_ncs)
mean_metric_ncs
}
)
pemilihan.ncs3 <- cbind(df=df,pemilihan.ncs3)
Perbandingan RMSE dan MAE
datatable(pemilihan.ncs3)
Plot RMSE dan MAE
pemilihan.ncs3$df <- as.factor(pemilihan.ncs3$df)
nrmse3<-ggplot(data=pemilihan.ncs3, aes(x=df, y=RMSE, group=1)) +
geom_line(linetype = "dashed")+
geom_point()+
ggtitle("K-Fold Cross Validation") +
xlab("DF") + ylab("Root Mean Square Error")
pemilihan.ncs3$df <- as.factor(pemilihan.ncs3$df)
nmae3<-ggplot(data=pemilihan.ncs3, aes(x=df, y=MAE, group=1)) +
geom_line(linetype = "dashed")+
geom_point()+
ggtitle("K-Fold Cross Validation") +
xlab("DF") + ylab("Mean Absolute Error")
plot_grid(nrmse3,nmae3)
Berdasarkan nilai RMSE dapat disimpulkan bahwa model natural cubic splines terbaik adalah natural cubic splines dengan df=3, sedangkan berdasarkan nilai MAE model terbaik adalah natural cubic splines dengan df=5.
Knots
attr(ns(dataJepang$horsepower, df=3),"knots")
## 33.33333% 66.66667%
## 68 90
attr(ns(dataJepang$horsepower, df=5),"knots")
## 20% 40% 60% 80%
## 65.0 69.2 88.0 96.0
Plot Natural Cubic Splines with df 3 and 5
plotncs3 <- ggplot(dataJepang,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="purple")+
stat_smooth(method = "lm",
formula = y~ns(x, knots=c(68,90)),
lty = 1,col = "black", se=F)+
theme_bw()+
ggtitle("Natural Cubic Splines with DF=3") +
xlab("Horse Power") +
ylab("mile per gallon") +
geom_vline(xintercept = c(68,90), col="grey50", lty=2)
plotncs5 <- ggplot(dataJepang,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="purple")+
stat_smooth(method = "lm",
formula = y~ns(x, knots = c(65.0, 69.2, 88.0, 96.0)),
lty = 1,col = "black", se=F)+
theme_bw()+
ggtitle("Natural Cubic Splines with DF=5") +
xlab("Horse Power") +
ylab("mile per gallon") +
geom_vline(xintercept = c(65.0, 69.2, 88.0, 96.0), col="grey50", lty=2)
plot_grid(plotncs3,plotncs5)
ncspline33 = lm(dataJepang$mpg ~ ns(dataJepang$horsepower, knots=c(68,90)))
summary(ncspline33)
##
## Call:
## lm(formula = dataJepang$mpg ~ ns(dataJepang$horsepower, knots = c(68,
## 90)))
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.2021 -2.8354 -0.5684 2.3362 11.6571
##
## Coefficients:
## Estimate Std. Error t value
## (Intercept) 34.058 1.825 18.658
## ns(dataJepang$horsepower, knots = c(68, 90))1 -14.970 1.898 -7.885
## ns(dataJepang$horsepower, knots = c(68, 90))2 -7.386 4.354 -1.696
## ns(dataJepang$horsepower, knots = c(68, 90))3 -7.729 2.859 -2.703
## Pr(>|t|)
## (Intercept) < 2e-16 ***
## ns(dataJepang$horsepower, knots = c(68, 90))1 1.97e-11 ***
## ns(dataJepang$horsepower, knots = c(68, 90))2 0.0940 .
## ns(dataJepang$horsepower, knots = c(68, 90))3 0.0085 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.113 on 75 degrees of freedom
## Multiple R-squared: 0.5615, Adjusted R-squared: 0.544
## F-statistic: 32.02 on 3 and 75 DF, p-value: 1.975e-13
Perbandingan Model
nilai_rmse3 <- rbind(polinomial3 %>% select(-1) %>% slice_min(RMSE),
pemilihanfs.tangga3 %>% select(-1) %>% slice_min(RMSE),
pemilihan.ncs3%>% select(-1)%>% slice_min(RMSE)
)
Metode <- c("Regresi Polinomial (d=3)",
"Piecewise Constant (knots=2)",
"Natural Cubic Splines (df=3)")
perbandingan3 <- data.frame(Metode,nilai_rmse2)
datatable(perbandingan3)
ggplot(dataJepang,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="black") +
stat_smooth(method = "lm",
formula = y~poly(x,3,raw=T),
lty = 1, aes(col = "Regresi Polynomial"), col = "red",se = F)+
stat_smooth(method = "lm",
formula = y~cut(x,3),
lty = 1, aes(col = "Piecewise Constant"), col = "black",se = F)+
stat_smooth(method = "lm",
formula = y~ns(x, df=3),
lty = 1,aes(col = "Natural Cubic Spline"), col = "green", se=F)+
scale_color_manual(values = c("Regresi Polynomial"="red","Piecewise Constant"="black","Natural Cubic Spline"="green"))+theme_bw()
Berdasarkan nilai RMSE dan MAE model terbaik untuk memodelkan data Auto dengan peubah respon mpg
dan peubah penjelas horsepower
dari kendaraan yang berasal dari Jepang adalah model Regresi polinomial derajat 3.
Visualisasi Model Terbaik
#Data Auto
plotncs6 <- ggplot(Auto,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="purple")+
stat_smooth(method = "lm",
formula = y~ns(x, knots=c(70.0, 84.0, 93.5, 110.0,150.0)),
lty = 1,col = "black", se=F)+
theme_bw()+
ggtitle("Natural Cubic Splines df=6") +
xlab("Horse Power") +
ylab("mile per gallon") +
labs(subtitle = "Data Auto")
geom_vline(xintercept = c(70.0, 84.0, 93.5, 110.0,150.0), col="grey50", lty=2)
## mapping: xintercept = ~xintercept
## geom_vline: na.rm = FALSE
## stat_identity: na.rm = FALSE
## position_identity
#Data Amerika
ft9<- ggplot(dataAmerika,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="red") +
stat_smooth(method = "lm",
formula = y~cut(x,9),
lty = 1, col = "black",se = F)+
theme_bw()+
ggtitle("Piecewise Constant k=8") +
labs(subtitle = "Data Amerika")+
xlab("Horse Power") + ylab("mile per gallon")
plotncs5 <- ggplot(dataAmerika,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="red")+
stat_smooth(method = "lm",
formula = y~ns(x, knots = c(72,88,100,140)),
lty = 1,col = "black", se=F)+
theme_bw()+
ggtitle("Natural Cubic Splines df=5") +
labs(subtitle = "Data Amerika")+
xlab("Horse Power") +
ylab("mile per gallon") +
geom_vline(xintercept = c(72,88,100,140), col="grey50", lty=2)
#Data Eropa
regpol1 <- ggplot(dataEropa,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="green") +
stat_smooth(method = "lm",
formula = y~poly(x,1,raw=T),
lty = 1, col = "black",se = F)+
theme_bw()+
ggtitle("Regresi Polinomial d=1") +
labs(subtitle = "Data Eropa")+
xlab("Horse Power") +
ylab("mile per gallon")
#Data Jepang
regpol3 <- ggplot(dataJepang,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="blue") +
stat_smooth(method = "lm",
formula = y~poly(x,3,raw=T),
lty = 1, col = "black",se = F)+
theme_bw()+
ggtitle("Regresi Polinomial d=3") +
labs(subtitle = "Data Jepang")+
xlab("Horse Power") +
ylab("mile per gallon")
plot_grid(plotncs6, ft9, plotncs5, regpol1, regpol3)
Referensi
Dito, G.A. (April 14, 2021). Regresi Polinomial, Regresi Fungsi Tangga dan Regresi Spline di R. Retrieved from https://gerrydito.github.io/Regresi-Polinomial,-Regresi-Fungsi-Tangga-dan-Regresi-Spline/
James, Gareth. et.al. (2013). An Introduction to Statistical Learning With Aplication in R. New York: Springer.
Sartono, B.. (November 1, 2020). Moving Beyond Linearity. Retrieved from https://rpubs.com/bagusco/taklinear