Package yang digunakan
library(ISLR) # Data Auto
library(ggplot2) # Visualisasi data
library(DT) # Tabel
library(rsample) # cross validation
library(tidyverse) # fs mapdfr
library(mlr3measures) # perhitungan rmse dan mae
library(splines) # spline
library(gridExtra) # Grafik
library(grid) # GrafikData
Data yang digunakan adalah data Auto pada package ISLR. Data Auto adalah sebuah data frame dengan 392 observasi dan 9 variabel. Berikut adalah data Auto :
Selanjutnya, hanya akan digunakan 2 variabel yaitu mpg (miles per gallon) dan horse power untuk melihat hubungan dari kedua variabel tersebut.
Visualisasi Data
plot(horsepower, mpg, pch=19, col="coral",
xlab="Horse Power", ylab="Miles per Gallon")Terlihat data Auto tidak mengikuti pola linier. Sehingga jika dipaksakan menggunakan model linier akan menimbulkan beberapa masalah, terutama dalam hal prediksi.
Data Auto dengan Model Linier
ggplot(Auto,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="coral") +
stat_smooth(method = "lm",
formula = y~x,lty = 1,
col = "red",se = F)+
theme_bw()Sehingga untuk mengatasi data non linier dapat digunakan beberapa model seperti :
- Regresi Polinomial
- Piecewise Constant (Fungsi Tangga)
- Natural Cubic Splines
- dll
Untuk masing-masing model akan dicari parameter yang optimum dengan cross validation
Regresi Polinomial
Akan dicari pangkat yang optimal untuk regresi polinomial dengan Cross Validation
set.seed(123)
cross_val <- vfold_cv(Auto,v=10,strata = "mpg")
order <- 2:4
best_poly <- map_dfr(order, function(i){
metric_poly <- map_dfr(cross_val$splits,
function(x){
mod <- lm(mpg ~ poly(horsepower,i),
data=Auto[x$in_id,])
pred <- predict(mod,
newdata=Auto[-x$in_id,])
truth <- Auto[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth,
response = pred
)
mae <- mlr3measures::mae(truth = truth,
response = pred
)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
metric_poly
# menghitung rata-rata untuk 10 folds
mean_metric_poly <- colMeans(metric_poly)
mean_metric_poly
}
)
best_poly <- cbind(order,best_poly)
# menampilkan hasil all breaks
datatable(best_poly)#berdasarkan rmse
datatable(best_poly %>% slice_min(rmse))Piecewise Constant
set.seed(123)
cross_val <- vfold_cv(Auto,v=10,strata = "mpg")
breaks <- 2:20
best_tangga <- map_dfr(breaks, function(i){
metric_tangga <- map_dfr(cross_val$splits,
function(x){
training <- Auto[x$in_id,]
training$horsepower <- cut(training$horsepower,i)
mod <- lm(mpg ~ horsepower,
data=training)
labs_x <- levels(mod$model[,2])
labs_x_breaks <- cbind(lower = as.numeric( sub("\\((.+),.*", "\\1", labs_x) ),
upper = as.numeric( sub("[^,]*,([^]]*)\\]", "\\1", labs_x) ))
testing <- Auto[-x$in_id,]
horsepower_new <- cut(testing$horsepower,c(labs_x_breaks[1,1],labs_x_breaks[,2]))
pred <- predict(mod,newdata=list(horsepower=horsepower_new))
truth <- testing$mpg
data_eval <- na.omit(data.frame(truth,pred))
rmse <- mlr3measures::rmse(truth = data_eval$truth,
response = data_eval$pred
)
mae <- mlr3measures::mae(truth = data_eval$truth,
response = data_eval$pred
)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
metric_tangga
# menghitung rata-rata untuk 10 folds
mean_metric_tangga <- colMeans(metric_tangga)
mean_metric_tangga
})
best_tangga <- cbind(breaks=breaks,best_tangga)
# menampilkan hasil all breaks
datatable(best_tangga)#berdasarkan rmse
datatable(best_tangga %>% slice_min(rmse))Natural Cubic Splines
set.seed(123)
cross_val <- vfold_cv(Auto,v=10,strata = "mpg")
df <- 2:20
best_spline3 <- map_dfr(df, function(i){
metric_spline3 <- map_dfr(cross_val$splits,
function(x){
mod <- lm(mpg ~ ns(horsepower,df=i),
data=Auto[x$in_id,])
pred <- predict(mod,
newdata=Auto[-x$in_id,])
truth <- Auto[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth,
response = pred
)
mae <- mlr3measures::mae(truth = truth,
response = pred
)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
metric_spline3
# menghitung rata-rata untuk 10 folds
mean_metric_spline3 <- colMeans(metric_spline3)
mean_metric_spline3
}
)
best_spline3 <- cbind(df=df,best_spline3)
# menampilkan hasil all breaks
datatable(best_spline3)#berdasarkan rmse
datatable(best_spline3 %>% slice_min(rmse))Model Terbaik
Dari ketiga model yang sudah ditentukan parameter optimalnya, akan ditentukan mana yang terbaik bagi data Auto yang berdasarkan pada nilai RMSE. Model terbaik adalah model dengan RMSE terendah.
nilai_metric <- rbind(best_poly %>% select(-1) %>% slice_min(rmse),
best_tangga %>% select(-1) %>% slice_min(rmse),
best_spline3 %>% select(-1)%>% slice_min(rmse)
)
nama_model <- c(" Reg Polinomial (degree=2) ",
"Piecewise Constant (knot=15)",
"Natural Cubic Splines (df=10)"
)
best_model <- data.frame(nama_model,nilai_metric)
datatable(best_model)Sehingga, model yang memiliki RMSE terendah adalah Model Natural Cubic Splines dengan df=10.
#berdasarkan rmse
datatable(best_model %>% slice_min(rmse))Model Final - Data Auto
Mencari letak knot dengan df=10
attr(ns(Auto$horsepower,df=10), "knots")## 10% 20% 30% 40% 50% 60% 70% 80% 90%
## 67.0 72.0 80.0 88.0 93.5 100.0 110.0 140.0 157.7
ggplot(Auto,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="coral") +
stat_smooth(method = "lm",
formula = y~ns(x, df=10),
lty = 1,se = F) +
ggtitle("Natural Cubic Splines (df=10) - Data Auto")regspline = lm(mpg ~ ns(horsepower, knots=c(67,72,80,88,93.5,100,110,140,157.7)))
prediksi = predict(regspline , data.frame(horsepower), interval="predict" )
plot(horsepower, mpg, pch=19, col="coral",
xlab="Horse Power", ylab="mile per gallon",
main="Natural Cubic Splines dengan df=10 - Data Auto")
ix <- sort(horsepower,index.return=T)$ix
lines(horsepower[ix], prediksi[ix],
main = "Polynomial Regression", col="blue",
type="l", lwd=2)
abline(v=c(67,72,80,88,93.5,100,110,140,157.7), col="grey50", lty=2)Subsetting Data
Data Auto akan dibagi berdasarkan asal negara produsen yang ada pada kolom origin. Dengan pembagian :
Kode 1 : Amerika Kode 2 : Eropa Kode 3 : Jepang
Berikut adalah pembagian masing-masing subset data :
dataAmerika <- Auto %>% filter(origin ==1)
dataEropa <- Auto %>% filter(origin ==2)
dataJepang <- Auto %>% filter(origin ==3)Untuk masing-masing subset data akan dipilih model terbaik diantara Model regresi polinomial, Piecewise Constant, dan Natural Cubic Spline dengan parameter optimumnya.
Data Amerika
Regresi Polinomial
set.seed(123)
cross_val <- vfold_cv(dataAmerika,v=10,strata = "mpg")
order <- 2:4
best_poly_amerika <- map_dfr(order, function(i){
metric_poly_amerika <- map_dfr(cross_val$splits,
function(x){
mod <- lm(mpg ~ poly(horsepower,i),
data=dataAmerika[x$in_id,])
pred <- predict(mod,
newdata=dataAmerika[-x$in_id,])
truth <- dataAmerika[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth,
response = pred
)
mae <- mlr3measures::mae(truth = truth,
response = pred
)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
metric_poly_amerika
# menghitung rata-rata untuk 10 folds
mean_metric_poly_amerika <- colMeans(metric_poly_amerika)
mean_metric_poly_amerika
}
)
best_poly_amerika <- cbind(order,best_poly_amerika)
# menampilkan hasil all breaks
#berdasarkan rmse
datatable(best_poly_amerika %>% slice_min(rmse))Piecewise Constant
set.seed(123)
cross_val <- vfold_cv(dataAmerika,v=10,strata = "mpg")
breaks <- 2:20
best_tangga_amerika <- map_dfr(breaks, function(i){
metric_tangga_amerika <- map_dfr(cross_val$splits,
function(x){
training <- dataAmerika[x$in_id,]
training$horsepower <- cut(training$horsepower,i)
mod <- lm(mpg ~ horsepower,
data=training)
labs_x <- levels(mod$model[,2])
labs_x_breaks <- cbind(lower = as.numeric( sub("\\((.+),.*", "\\1", labs_x) ),
upper = as.numeric( sub("[^,]*,([^]]*)\\]", "\\1", labs_x) ))
testing <- dataAmerika[-x$in_id,]
horsepower_new <- cut(testing$horsepower,c(labs_x_breaks[1,1],labs_x_breaks[,2]))
pred <- predict(mod,newdata=list(horsepower=horsepower_new))
truth <- testing$mpg
data_eval <- na.omit(data.frame(truth,pred))
rmse <- mlr3measures::rmse(truth = data_eval$truth,
response = data_eval$pred
)
mae <- mlr3measures::mae(truth = data_eval$truth,
response = data_eval$pred
)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
metric_tangga_amerika
# menghitung rata-rata untuk 10 folds
mean_metric_tangga_amerika <- colMeans(metric_tangga_amerika)
mean_metric_tangga_amerika
})
best_tangga_amerika <- cbind(breaks=breaks,best_tangga_amerika)
# menampilkan hasil all breaks
#berdasarkan rmse
datatable(best_tangga_amerika %>% slice_min(rmse))Natural Cubic Spline
set.seed(123)
cross_val <- vfold_cv(dataAmerika,v=10,strata = "mpg")
df <- 2:10
best_spline_amerika <- map_dfr(df, function(i){
metric_spline_amerika <- map_dfr(cross_val$splits,
function(x){
mod <- lm(mpg ~ ns(horsepower,df=i),
data=dataAmerika[x$in_id,])
pred <- predict(mod,
newdata=dataAmerika[-x$in_id,])
truth <- dataAmerika[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth,
response = pred
)
mae <- mlr3measures::mae(truth = truth,
response = pred
)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
metric_spline_amerika
# menghitung rata-rata untuk 10 folds
mean_metric_spline_amerika <- colMeans(metric_spline_amerika)
mean_metric_spline_amerika
}
)
best_spline_amerika <- cbind(df=df,best_spline_amerika)
# menampilkan hasil all breaks
#berdasarkan rmse
datatable(best_spline_amerika %>% slice_min(rmse))Model Terbaik - Data Amerika
Berikut adalah model-model terbaik dari hasil CV untuk data Amerika
ggplot(dataAmerika,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="coral") +
stat_smooth(aes(colour="Polinomial d=2"),method = "lm",
formula = y~poly(x,2,raw=T),
lty = 1,se = F)+
stat_smooth(aes(colour="Fungsi Tangga k=11"),method = "lm",
formula = y~cut(x,11),
lty = 1,se = F)+
stat_smooth(aes(colour="Natural Cubic Spline df=7"),method = "lm",
formula = y~ns(x, df=7),
lty = 1,se = F) +
theme_bw() +
labs(title="Perbandingan Model Terbaik - Data Amerika") +
ylab("Miles per Gallon") +
xlab("Horsepower") nilai_metric_amerika <- rbind(best_poly_amerika %>% select(-1) %>% slice_min(rmse),
best_tangga_amerika %>% select(-1) %>% slice_min(rmse),
best_spline_amerika %>% select(-1)%>% slice_min(rmse)
)
nama_model <- c("Reg Polinomial(degree=2)",
"Piecewise Constant (knot=11)",
"Natural Cubic Splines(df=7)"
)
best_model_amerika <- data.frame(nama_model,nilai_metric_amerika)
datatable(best_model_amerika)#berdasarkan rmse
datatable(best_model_amerika %>% slice_min(rmse))Dari output diatas dapat diambil kesimpulan bahwa :
Jika menggunakan model polinomial, degree optimal ada pada degree=2
Jika menggunakan model piecewise constant, knot optimal ada pada knot=11
Jika menggunakan model Natural Cubic Splines, df optimal ada pada df=7
Dan model terbaik dari ketiganya untuk data Amerika adalah dengan Model Piecewise Constant dengan knot 11
Model Final - Data Amerika
ggplot(dataAmerika,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="coral") +
stat_smooth(method = "lm",
formula = y~cut(x,11),
lty = 1, col = "blue",se = F)+
theme_bw() +
ggtitle("Piecewise Constant (11 Knot) - Data Amerika") +
ylab("Miles per Gallon") +
xlab("Horsepower") +
theme(plot.title = element_text(hjust = 0.5))Data Eropa
Regresi Polinomial
set.seed(123)
cross_val <- vfold_cv(dataEropa,v=5,strata = "mpg")## Warning: The number of observations in each quantile is below the recommended
## threshold of 20. Stratification will be done with 3 breaks instead.
order <- 2:4
best_poly_eropa <- map_dfr(order, function(i){
metric_poly_eropa <- map_dfr(cross_val$splits,
function(x){
mod <- lm(mpg ~ poly(horsepower,i),
data=dataEropa[x$in_id,])
pred <- predict(mod,
newdata=dataEropa[-x$in_id,])
truth <- dataEropa[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth,
response = pred
)
mae <- mlr3measures::mae(truth = truth,
response = pred
)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
metric_poly_eropa
# menghitung rata-rata untuk 10 folds
mean_metric_poly_eropa <- colMeans(metric_poly_eropa)
mean_metric_poly_eropa
}
)
best_poly_eropa <- cbind(order,best_poly_eropa)
#berdasarkan rmse
datatable(best_poly_eropa %>% slice_min(rmse))Piecewise Constant
set.seed(123)
cross_val <- vfold_cv(dataEropa,v=5,strata = "mpg")## Warning: The number of observations in each quantile is below the recommended
## threshold of 20. Stratification will be done with 3 breaks instead.
breaks <- 2:9
best_tangga_eropa <- map_dfr(breaks, function(i){
metric_tangga_eropa <- map_dfr(cross_val$splits,
function(x){
training <- dataEropa[x$in_id,]
training$horsepower <- cut(training$horsepower,i)
mod <- lm(mpg ~ horsepower,
data=training)
labs_x <- levels(mod$model[,2])
labs_x_breaks <- cbind(lower = as.numeric( sub("\\((.+),.*", "\\1", labs_x) ),
upper = as.numeric( sub("[^,]*,([^]]*)\\]", "\\1", labs_x) ))
testing <- dataEropa[-x$in_id,]
horsepower_new <- cut(testing$horsepower,c(labs_x_breaks[1,1],labs_x_breaks[,2]))
pred <- predict(mod,newdata=list(horsepower=horsepower_new))
truth <- testing$mpg
data_eval <- na.omit(data.frame(truth,pred))
rmse <- mlr3measures::rmse(truth = data_eval$truth,
response = data_eval$pred
)
mae <- mlr3measures::mae(truth = data_eval$truth,
response = data_eval$pred
)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
metric_tangga_eropa
# menghitung rata-rata untuk 10 folds
mean_metric_tangga_eropa <- colMeans(metric_tangga_eropa)
mean_metric_tangga_eropa
})
best_tangga_eropa <- cbind(breaks=breaks,best_tangga_eropa)
#berdasarkan rmse
datatable(best_tangga_eropa %>% slice_min(rmse))Natural Cubic Spline
set.seed(123)
cross_val <- vfold_cv(dataEropa,v=5,strata = "mpg")## Warning: The number of observations in each quantile is below the recommended
## threshold of 20. Stratification will be done with 3 breaks instead.
df <- 2:10
best_spline_eropa <- map_dfr(df, function(i){
metric_spline_eropa <- map_dfr(cross_val$splits,
function(x){
mod <- lm(mpg ~ ns(horsepower,df=i),
data=dataEropa[x$in_id,])
pred <- predict(mod,
newdata=dataEropa[-x$in_id,])
truth <- dataEropa[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth,
response = pred
)
mae <- mlr3measures::mae(truth = truth,
response = pred
)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
metric_spline_eropa
# menghitung rata-rata untuk 10 folds
mean_metric_spline_eropa <- colMeans(metric_spline_eropa)
mean_metric_spline_eropa
}
)
best_spline_eropa <- cbind(df=df,best_spline_eropa)
#berdasarkan rmse
datatable(best_spline_eropa %>% slice_min(rmse))Model Terbaik - Data Eropa
Berikut adalah model-model terbaik dari hasil CV untuk data Eropa
ggplot(dataEropa,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="coral") +
stat_smooth(aes(colour="Polinomial d=2"),method = "lm",
formula = y~poly(x,2,raw=T),
lty = 1,se = F)+
stat_smooth(aes(colour="Fungsi Tangga k=7"),method = "lm",
formula = y~cut(x,7),
lty = 1,se = F)+
stat_smooth(aes(colour="Natural Cubic Spline df=2"),method = "lm",
formula = y~ns(x, df=2),
lty = 1,se = F) +
theme_bw() +
labs(title="Perbandingan Model Terbaik - Data Eropa") +
ylab("Miles per Gallon") +
xlab("Horsepower") nilai_metric_eropa <- rbind(best_poly_eropa %>% select(-1) %>% slice_min(rmse),
best_tangga_eropa %>% select(-1) %>% slice_min(rmse),
best_spline_eropa %>% select(-1)%>% slice_min(rmse)
)
nama_model <- c("Reg Polinomial (degree=2)",
"Pieceswise Constant (knot=7)",
"Natural Cubic Splines (df=2)"
)
best_model_eropa <- data.frame(nama_model,nilai_metric_eropa)
datatable(best_model_eropa)#berdasarkan rmse
datatable(best_model_eropa %>% slice_min(rmse))Dari output diatas dapat diambil kesimpulan bahwa :
Jika menggunakan model polinomial, degree optimal ada pada degree=2
Jika menggunakan model piecewise constant, knot optimal ada pada knot=7
Jika menggunakan model Natural Cubic Splines, df optimal ada pada df=1
Dan model terbaik dari ketiganya untuk data Eropa adalah dengan Model Natural Cubic Splines dengan df 1
Model Final - Data Eropa
attr(ns(dataEropa$horsepower,df=2), "knots")## 50%
## 76.5
ggplot(dataEropa,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="coral") +
stat_smooth(method = "lm",
formula = y~ns(x, df=2),
lty = 1,se = F) +
ggtitle("Natural Cubic Splines (df=2) - Data Eropa") +
ylab("Miles per Gallon") +
xlab("Horsepower") +
theme(plot.title = element_text(hjust = 0.5)) Data Jepang
Regresi Polinomial
set.seed(123)
cross_val <- vfold_cv(dataJepang,v=5,strata = "mpg")## Warning: The number of observations in each quantile is below the recommended
## threshold of 20. Stratification will be done with 3 breaks instead.
order <- 2:4
best_poly_jepang <- map_dfr(order, function(i){
metric_poly_jepang <- map_dfr(cross_val$splits,
function(x){
mod <- lm(mpg ~ poly(horsepower,i),
data=dataJepang[x$in_id,])
pred <- predict(mod,
newdata=dataJepang[-x$in_id,])
truth <- dataJepang[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth,
response = pred
)
mae <- mlr3measures::mae(truth = truth,
response = pred
)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
metric_poly_jepang
# menghitung rata-rata untuk 10 folds
mean_metric_poly_jepang <- colMeans(metric_poly_jepang)
mean_metric_poly_jepang
}
)
best_poly_jepang <- cbind(order,best_poly_jepang)
#berdasarkan rmse
datatable(best_poly_jepang %>% slice_min(rmse))Piecewise Constant
set.seed(123)
cross_val <- vfold_cv(dataJepang,v=5,strata = "mpg")## Warning: The number of observations in each quantile is below the recommended
## threshold of 20. Stratification will be done with 3 breaks instead.
breaks <- 2:5
best_tangga_jepang <- map_dfr(breaks, function(i){
metric_tangga_jepang <- map_dfr(cross_val$splits,
function(x){
training <- dataJepang[x$in_id,]
training$horsepower <- cut(training$horsepower,i)
mod <- lm(mpg ~ horsepower,
data=training)
labs_x <- levels(mod$model[,2])
labs_x_breaks <- cbind(lower = as.numeric( sub("\\((.+),.*", "\\1", labs_x) ),
upper = as.numeric( sub("[^,]*,([^]]*)\\]", "\\1", labs_x) ))
testing <- dataJepang[-x$in_id,]
horsepower_new <- cut(testing$horsepower,c(labs_x_breaks[1,1],labs_x_breaks[,2]))
pred <- predict(mod,newdata=list(horsepower=horsepower_new))
truth <- testing$mpg
data_eval <- na.omit(data.frame(truth,pred))
rmse <- mlr3measures::rmse(truth = data_eval$truth,
response = data_eval$pred
)
mae <- mlr3measures::mae(truth = data_eval$truth,
response = data_eval$pred
)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
metric_tangga_jepang
# menghitung rata-rata untuk 10 folds
mean_metric_tangga_jepang <- colMeans(metric_tangga_jepang)
mean_metric_tangga_jepang
})
best_tangga_jepang <- cbind(breaks=breaks,best_tangga_jepang)
#berdasarkan rmse
datatable(best_tangga_jepang %>% slice_min(rmse))Natural Cubic Spline
set.seed(123)
cross_val <- vfold_cv(dataJepang,v=5,strata = "mpg")## Warning: The number of observations in each quantile is below the recommended
## threshold of 20. Stratification will be done with 3 breaks instead.
df <- 2:10
best_spline_jepang <- map_dfr(df, function(i){
metric_spline_jepang <- map_dfr(cross_val$splits,
function(x){
mod <- lm(mpg ~ ns(horsepower,df=i),
data=dataJepang[x$in_id,])
pred <- predict(mod,
newdata=dataJepang[-x$in_id,])
truth <- dataJepang[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth,
response = pred
)
mae <- mlr3measures::mae(truth = truth,
response = pred
)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
metric_spline_jepang
# menghitung rata-rata untuk 10 folds
mean_metric_spline_jepang <- colMeans(metric_spline_jepang)
mean_metric_spline_jepang
}
)
best_spline_jepang <- cbind(df=df,best_spline_jepang)
#berdasarkan rmse
datatable(best_spline_jepang %>% slice_min(rmse))Model Terbaik - Data Jepang
Berikut adalah model-model terbaik dari hasil CV untuk data Amerika
ggplot(dataJepang,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="coral") +
stat_smooth(aes(colour="Polinomial d=3"),method = "lm",
formula = y~poly(x,3,raw=T),
lty = 1,se = F)+
stat_smooth(aes(colour="Fungsi Tangga k=4"),method = "lm",
formula = y~cut(x,4),
lty = 1,se = F)+
stat_smooth(aes(colour="Natural Cubic Spline df=3"),method = "lm",
formula = y~ns(x, df=3),
lty = 1,se = F) +
theme_bw() +
labs(title="Perbandingan Model Terbaik - Jepang") +
ylab("Miles per Gallon") +
xlab("Horsepower") nilai_metric_jepang <- rbind(best_poly_jepang %>% select(-1) %>% slice_min(rmse),
best_tangga_jepang %>% select(-1) %>% slice_min(rmse),
best_spline_jepang %>% select(-1)%>% slice_min(rmse)
)
nama_model <- c("Reg Polinomial (degree=3)",
"Piecewise Constant (knot=4)",
"Natural Cubic Splines (df=3)"
)
best_model_jepang <- data.frame(nama_model,nilai_metric_jepang)
datatable(best_model_jepang)#berdasarkan rmse
datatable(best_model_jepang %>% slice_min(rmse))Dari output diatas dapat diambil kesimpulan bahwa :
Jika menggunakan model polinomial, degree optimal ada pada degree=3
Jika menggunakan model piecewise constant, knot optimal ada pada knot=4
Jika menggunakan model Natural Cubic Splines, df optimal ada pada df=3
Dan model terbaik dari ketiganya untuk data Jepang adalah dengan Model Polynomial Regression dengan degree 3
Model Final - Data Jepang
ggplot(dataJepang,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="coral") +
stat_smooth(method = "lm",
formula = y~poly(x,3,raw=T),
lty = 1, col = "red",se = F)+
theme_bw() +
ggtitle("Regresi Polynomial (degree=3) - Data Jepang") +
ylab("Miles per Gallon") +
xlab("Horsepower") +
theme(plot.title = element_text(hjust = 0.5))Model Terbaik
Sehingga, jika di plot model terbaik untuk masing-masing data adalah :
#plot 1 : Auto
plot1 <- ggplot(Auto,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="coral") +
stat_smooth(method = "lm",
formula = y~ns(x, df=10),
lty = 1,se = F) +
ggtitle("Auto - Natural Cubic Splines (df=10)") +
ylab("Miles per Gallon") +
xlab("Horsepower")
#plot 2 : Amerika
plot2 <- ggplot(dataAmerika,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="coral") +
stat_smooth(method = "lm",
formula = y~cut(x,11),
lty = 1, col = "blue",se = F)+
theme_bw() +
ggtitle("Amerika - Piecewise Constant (K=11)") +
ylab("Miles per Gallon") +
xlab("Horsepower")
#plot 3 : Eropa
plot3 <- ggplot(dataEropa,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="coral") +
stat_smooth(method = "lm",
formula = y~ns(x, df=2),
lty = 1,se = F) +
ggtitle("Eropa - Natural Cubic Splines (df=2)") +
ylab("Miles per Gallon") +
xlab("Horsepower")
#plot 4 : Jepang
plot4 <- ggplot(dataJepang,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="coral") +
stat_smooth(method = "lm",
formula = y~poly(x,3,raw=T),
lty = 1, col = "red",se = F)+
theme_bw() +
ggtitle("Jepang - Polynomial Regression (d=3)") +
ylab("Miles per Gallon") +
xlab("Horsepower")
grid.arrange(plot1, plot2, plot3, plot4, nrow=2, ncol=2, top = textGrob("Model Terbaik",gp=gpar(fontsize=20,font=3)))