Digunakan model-model non-linier pada data Auto dengan membandingkan peubah respon mpg masing-masing terhadap tiga peubah prediktor yaitu horsepower, weight, dan acceleration. Pada proses di atas kemudian disajikan
A. Bukti hubungan non-linier pada masing-masing peubah yang didukung dengan visualisasi B. Model non-linear terbaik secara visual dan empiris
AutoBerikut ini sekilas gambaran dari dataset Auto dimana mpg sebagai peubah respon dibandingkan terhadap masing-masing peubah prediktor.
glimpse(Auto)## Rows: 392
## Columns: 9
## $ mpg <dbl> 18, 15, 18, 16, 17, 15, 14, 14, 14, 15, 15, 14, 15, 14, 2~
## $ cylinders <dbl> 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 4, 6, 6, 6, 4, ~
## $ displacement <dbl> 307, 350, 318, 304, 302, 429, 454, 440, 455, 390, 383, 34~
## $ horsepower <dbl> 130, 165, 150, 150, 140, 198, 220, 215, 225, 190, 170, 16~
## $ weight <dbl> 3504, 3693, 3436, 3433, 3449, 4341, 4354, 4312, 4425, 385~
## $ acceleration <dbl> 12.0, 11.5, 11.0, 12.0, 10.5, 10.0, 9.0, 8.5, 10.0, 8.5, ~
## $ year <dbl> 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 7~
## $ origin <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 1, 1, 1, 3, ~
## $ name <fct> chevrolet chevelle malibu, buick skylark 320, plymouth sa~
Dengan fungsi pairs() terbukti bahwa peubah mpg memiliki hubungan non-linier dengan masing-masing peubah.
pairs(Auto[ ,-9])Dalam studi kasus ini fokus utama analisis digunakan peubah prediktor horsepower (engine horsepower), weight (vehicle weight), dan acceleration (time to accelerate from 0 to 60 mph (sec.)) terhadap mpg (miles per galon) yang masing-masing hubungannya terlihat pada scatterplot berikut.
c1 <- ggplot(Auto, aes(x = horsepower, y = mpg)) +
geom_point(alpha = 0.5) +
labs(title = "mpg vs horsepower")
c2 <- ggplot(Auto, aes(x = weight, y = mpg)) +
geom_point(alpha = 0.5) +
labs(title = "mpg vs weight")
c3 <- ggplot(Auto, aes(x = acceleration, y = mpg)) +
geom_point(alpha = 0.5) +
labs(title = "mpg vs acceleration")
grid.arrange(c1, c2, c3, ncol = 2)plotCVRMSE <- function(df, title, x, xlab) {
df %>% mutate(min_CV_RMSE = as.numeric(min(CV_RMSE) == CV_RMSE)) %>%
ggplot(aes_string(x = x, y = 'CV_RMSE')) +
geom_line(col = "grey55") +
geom_point(size = 2, aes(col = factor(min_CV_RMSE))) +
theme(legend.position = "none") +
labs(title = title, x = xlab, y = "CV RMSE")
}
plotPred <- function(df, y, x, title) {
ggplot(df, aes_string(x = x, y = y, color = as.factor(1))) +
geom_point(alpha = 0.3, col = "black") +
geom_line(aes_string(x = x, y = "pred", color = as.factor(1)), size = 1, col = "blue") +
theme(legend.position = "none") +
labs(title = title)
}bestNonLinier <- function(data, respon, prediktor, model, tuning, cv.fold){
models <- list(poly=list(), step=list(), naturalsplines=list())
formulas <- list(poly=list(), step=list(), naturalsplines=list())
CV_RMSE <- list(poly=data.frame(degree=NULL, CV_RMSE=NULL), step=data.frame(step=NULL, CV_RMSE=NULL), naturalsplines=data.frame(fbasis=NULL, CV_RMSE=NULL))
bestModel <- list(poly=NULL, step=NULL, naturalsplines=NULL)
summaryTuning <- data.frame(model=NULL, parameter=NULL, RMSE=NULL)
summaryCV <- list()
summaryPred <- list()
for(m in seq(length(model))){
if(model[m] == "polynomial"){
for (i in seq(length(tuning[[1]]))) {
formulas$poly[[i]] <- paste0(respon, "~", "poly(", prediktor, ", degree=", tuning[[1]][[i]], ")")
models$poly[[i]] <- train(as.formula(formulas$poly[[i]]), data, method = "lm", trControl = dataLatih.5fold)
CV_RMSE$poly <- rbind(CV_RMSE$poly, data.frame(degree=tuning[[1]][[i]], CV_RMSE=models$poly[[i]]$results$RMSE))
}
# plot all RMSE
plotAllCV <- plotCVRMSE(CV_RMSE$poly, title="Polynomial Regression", x=colnames(CV_RMSE$poly)[1], xlab="Degree")
# save the best
bestPoly <- which.min(CV_RMSE$poly$CV_RMSE)
bestModel$poly$RMSE <- CV_RMSE$poly[bestPoly,]
bestModel$poly$model <- models$poly[[i]]
bestModel$poly$pred <- data.frame(data[c(respon, prediktor)], pred=predict(bestModel$poly$model, Auto, interval="prediction"))
bestModel$poly$plotAllCV <- plotAllCV
# plot best model prediction
pltPred <- plotPred(bestModel$poly$pred, y=respon, x=prediktor, title=paste0("Predicting '", respon, "' with ", bestModel$poly$RMSE$degree, " order"))
bestModel$poly$plotPred <- pltPred
summaryTuning <- rbind(summaryTuning, data.frame(model="polynomial", parameter=bestModel$poly$RMSE$degree, RMSE=bestModel$poly$RMSE$CV_RMSE))
summaryCV[[1]] <- bestModel$poly$plotAllCV
summaryPred[[1]] <- bestModel$poly$plotPred
}
if(model[m] == "step"){
for (i in seq(length(tuning[[2]]))) {
formulas$step[[i]] <- paste0(respon, "~", "cut(", prediktor, ", ", tuning[[2]][[i]], ")")
models$step[[i]] <- train(as.formula(formulas$step[[i]]), data, method = "lm", trControl = dataLatih.5fold)
CV_RMSE$step <- rbind(CV_RMSE$step, data.frame(cuts=tuning[[2]][[i]], CV_RMSE=models$step[[i]]$results$RMSE))
}
# plot all RMSE
plotAllCV <- plotCVRMSE(CV_RMSE$step, title="Piecewise constant", x=colnames(CV_RMSE$step)[1], xlab="Intervals")
# save the best
bestStep <- which.min(CV_RMSE$step$CV_RMSE)
bestModel$step$RMSE <- CV_RMSE$step[bestStep,]
bestModel$step$model <- models$step[[i]]
bestModel$step$pred <- data.frame(data[c(respon, prediktor)], pred=predict(bestModel$step$model, Auto, interval="prediction"))
bestModel$step$plotAllCV <- plotAllCV
# plot best model prediction
pltPred <- plotPred(bestModel$step$pred, y=respon, x=prediktor, title=paste0("Predicting '", respon, "' with a ", bestModel$step$RMSE$cuts, "-interval"))
bestModel$step$plotPred <- pltPred
summaryTuning <- rbind(summaryTuning, data.frame(model="piecewise", parameter=bestModel$step$RMSE$cuts, RMSE=bestModel$step$RMSE$CV_RMSE))
summaryCV[[2]] <- bestModel$step$plotAllCV
summaryPred[[2]] <- bestModel$step$plotPred
}
if(model[m] == "naturalsplines"){
for (i in seq(length(tuning[[3]]))) {
formulas$naturalsplines[[i]] <- paste0(respon, "~", "ns(", prediktor, ", df=", tuning[[3]][[i]], ")")
models$naturalsplines[[i]] <- train(as.formula(formulas$naturalsplines[[i]]), data, method = "lm", trControl = dataLatih.5fold)
CV_RMSE$naturalsplines <- rbind(CV_RMSE$naturalsplines, data.frame(df=tuning[[3]][[i]], CV_RMSE=models$naturalsplines[[i]]$results$RMSE))
}
# plot all RMSE
plotAllCV <- plotCVRMSE(CV_RMSE$naturalsplines, title="Natural Cubic Splines", x=colnames(CV_RMSE$naturalsplines)[1], xlab="Degrees of freedom")
# save the best
bestDF <- which.min(CV_RMSE$naturalsplines$CV_RMSE)
bestModel$naturalsplines$RMSE <- CV_RMSE$naturalsplines[bestDF,]
bestModel$naturalsplines$model <- models$naturalsplines[[i]]
bestModel$naturalsplines$pred <- data.frame(data[c(respon, prediktor)], pred=predict(bestModel$naturalsplines$model, Auto, interval="prediction"))
bestModel$naturalsplines$plotAllCV <- plotAllCV
# plot best model prediction
eval(parse(text=paste0("splineKnots <- attr(ns(data$", prediktor, ", df = bestDF), 'knots')")))
eval(parse(text=paste0("splineBoundary <- attr(ns(data$", prediktor, ", df = bestDF), 'Boundary.knots')")))
bestModel$naturalsplines$knots <- c(splineKnots, splineBoundary)
pltPred <- plotPred(bestModel$naturalsplines$pred, y=respon, x=prediktor, title=paste0("Predicting '", respon, "' with ", length(bestModel$naturalsplines$knots), " knots")) +
geom_vline(xintercept = bestModel$naturalsplines$knots, linetype = "dashed")
bestModel$naturalsplines$plotPred <- pltPred
summaryTuning <- rbind(summaryTuning, data.frame(model="naturalsplines", parameter=length(bestModel$naturalsplines$knots), RMSE=bestModel$naturalsplines$RMSE$CV_RMSE))
summaryCV[[3]] <- bestModel$naturalsplines$plotAllCV
summaryPred[[3]] <- bestModel$naturalsplines$plotPred
}
}
return(list(best=bestModel, tuning=summaryTuning, plotCV = summaryCV, plotPred = summaryPred))
}Pada simulasi kali ini, selain membanding respon terhadap prediktor disajikan pola perbedaan perlakuan terhadap validasi silang 5-fold dan 10-fold.
dataLatih.5fold <- trainControl(method='cv', number = 5)
dataLatih.10fold <- trainControl(method='cv', number = 10)Inisialisasi tuning parameter masing-masing diberikan untuk regresi polinomial, step function, dan natural cubic splines yaitu c(2:10), c(2:10), dan c(3:15).
data <- Auto
nonlinier_model <- c('polynomial', 'step', 'naturalsplines')
tuning <- list(d=c(2:10), c=c(2:10), k=c(3:15))mpg vs horsepowerset.seed(013)
mpgHorsepower5fold <- bestNonLinier(data=Auto, respon='mpg', prediktor='horsepower', model=nonlinier_model, tuning=tuning, cv.fold=dataLatih.5fold)
mpgHorsepower10fold <- bestNonLinier(data=Auto, respon='mpg', prediktor='horsepower', model=nonlinier_model, tuning=tuning, cv.fold=dataLatih.10fold)grid.arrange(mpgHorsepower5fold$plotCV[[1]], mpgHorsepower5fold$plotCV[[2]], mpgHorsepower5fold$plotCV[[3]], ncol = 2, top = textGrob("mpg vs horsepower (5-fold CV)",gp=gpar(fontsize=14,font=3)))mpgHorsepower5fold$tuningmodelComparison <- data.frame(Auto[c('mpg','horsepower')],
polynomial=mpgHorsepower5fold$best$poly$pred$pred,
step=mpgHorsepower5fold$best$step$pred$pred,
naturalsplines=mpgHorsepower5fold$best$naturalsplines$pred$pred)
mpgCompare <- ggplot(modelComparison) +
geom_point(aes_string(x = 'horsepower', y = 'mpg', fill = as.factor(1))) +
geom_line(aes_string(x = 'horsepower', y = "polynomial", color = as.factor(2)), size = 1) +
geom_line(aes_string(x = 'horsepower', y = "step", color = as.factor(3)), size = 1) +
geom_line(aes_string(x = 'horsepower', y = "naturalsplines", color = as.factor(4)), size = 1) +
scale_fill_manual(labels = c("Observed"), values = c("black")) +
scale_color_manual(labels = c("Polynomial", "Piecewise", "Natural Cubic Splines"), values = c("blue", "red", "green")) +
ggtitle("Perbandingan model terbaik") +
theme(legend.title = element_blank(), legend.position = "bottom", legend.box = "horizontal")
grid.arrange(mpgHorsepower5fold$plotPred[[1]], mpgHorsepower5fold$plotPred[[2]], mpgHorsepower5fold$plotPred[[3]], mpgCompare, ncol = 2, top = textGrob("5-fold CV",gp=gpar(fontsize=14,font=3)))grid.arrange(mpgHorsepower10fold$plotCV[[1]], mpgHorsepower10fold$plotCV[[2]], mpgHorsepower10fold$plotCV[[3]], ncol = 2, top = textGrob("mpg vs horsepower (10-fold CV)",gp=gpar(fontsize=14,font=3)))mpgHorsepower10fold$tuningmodelComparison <- data.frame(Auto[c('mpg','horsepower')],
polynomial=mpgHorsepower10fold$best$poly$pred$pred,
step=mpgHorsepower10fold$best$step$pred$pred,
naturalsplines=mpgHorsepower10fold$best$naturalsplines$pred$pred)
mpgCompare <- ggplot(modelComparison) +
geom_point(aes_string(x = 'horsepower', y = 'mpg', fill = as.factor(1))) +
geom_line(aes_string(x = 'horsepower', y = "polynomial", color = as.factor(2)), size = 1) +
geom_line(aes_string(x = 'horsepower', y = "step", color = as.factor(3)), size = 1) +
geom_line(aes_string(x = 'horsepower', y = "naturalsplines", color = as.factor(4)), size = 1) +
scale_fill_manual(labels = c("Observed"), values = c("black")) +
scale_color_manual(labels = c("Polynomial", "Piecewise", "Natural Cubic Splines"), values = c("blue", "red", "green")) +
ggtitle("Perbandingan model terbaik") +
theme(legend.title = element_blank(), legend.position = "bottom", legend.box = "horizontal")
grid.arrange(mpgHorsepower10fold$plotPred[[1]], mpgHorsepower10fold$plotPred[[2]], mpgHorsepower10fold$plotPred[[3]], mpgCompare, ncol = 2, top = textGrob("10-fold CV",gp=gpar(fontsize=14,font=3)))mpg vs weightset.seed(013)
mpgWeight5fold <- bestNonLinier(data=Auto, respon='mpg', prediktor='weight', model=nonlinier_model, tuning=tuning, cv.fold=dataLatih.5fold)
mpgWeight10fold <- bestNonLinier(data=Auto, respon='mpg', prediktor='weight', model=nonlinier_model, tuning=tuning, cv.fold=dataLatih.10fold)grid.arrange(mpgWeight5fold$plotCV[[1]], mpgWeight5fold$plotCV[[2]], mpgWeight5fold$plotCV[[3]], ncol = 2, top = textGrob("mpg vs weight (5-fold CV)", gp=gpar(fontsize=14,font=3)))mpgWeight5fold$tuningmodelComparison <- data.frame(Auto[c('mpg','weight')],
polynomial=mpgWeight5fold$best$poly$pred$pred,
step=mpgWeight5fold$best$step$pred$pred,
naturalsplines=mpgWeight5fold$best$naturalsplines$pred$pred)
weightCompare <- ggplot(modelComparison) +
geom_point(aes_string(x = 'weight', y = 'mpg', fill = as.factor(1))) +
geom_line(aes_string(x = 'weight', y = "polynomial", color = as.factor(2)), size = 1) +
geom_line(aes_string(x = 'weight', y = "step", color = as.factor(3)), size = 1) +
geom_line(aes_string(x = 'weight', y = "naturalsplines", color = as.factor(4)), size = 1) +
scale_fill_manual(labels = c("Observed"), values = c("black")) +
scale_color_manual(labels = c("Polynomial", "Piecewise", "Natural Cubic Splines"), values = c("blue", "red", "green")) +
ggtitle("Perbandingan model terbaik") +
theme(legend.title = element_blank(), legend.position = "bottom", legend.box = "horizontal")
grid.arrange(mpgWeight5fold$plotPred[[1]], mpgWeight5fold$plotPred[[2]], mpgWeight5fold$plotPred[[3]], weightCompare, ncol = 2, top = textGrob("5-fold CV",gp=gpar(fontsize=14,font=3)))grid.arrange(mpgWeight10fold$plotCV[[1]], mpgWeight10fold$plotCV[[2]], mpgWeight10fold$plotCV[[3]], ncol = 2, top = textGrob("mpg vs weight (10-fold CV)", gp=gpar(fontsize=14,font=3)))mpgWeight10fold$tuningmodelComparison <- data.frame(Auto[c('mpg','weight')],
polynomial=mpgWeight10fold$best$poly$pred$pred,
step=mpgWeight10fold$best$step$pred$pred,
naturalsplines=mpgWeight10fold$best$naturalsplines$pred$pred)
weightCompare <- ggplot(modelComparison) +
geom_point(aes_string(x = 'weight', y = 'mpg', fill = as.factor(1))) +
geom_line(aes_string(x = 'weight', y = "polynomial", color = as.factor(2)), size = 1) +
geom_line(aes_string(x = 'weight', y = "step", color = as.factor(3)), size = 1) +
geom_line(aes_string(x = 'weight', y = "naturalsplines", color = as.factor(4)), size = 1) +
scale_fill_manual(labels = c("Observed"), values = c("black")) +
scale_color_manual(labels = c("Polynomial", "Piecewise", "Natural Cubic Splines"), values = c("blue", "red", "green")) +
ggtitle("Perbandingan model terbaik") +
theme(legend.title = element_blank(), legend.position = "bottom", legend.box = "horizontal")
grid.arrange(mpgWeight10fold$plotPred[[1]], mpgWeight10fold$plotPred[[2]], mpgWeight10fold$plotPred[[3]], weightCompare, ncol = 2, top = textGrob("10-fold CV",gp=gpar(fontsize=14,font=3)))mpg vs accelerationset.seed(013)
mpgAcc5fold <- bestNonLinier(data=Auto, respon='mpg', prediktor='acceleration', model=nonlinier_model, tuning=tuning, cv.fold=dataLatih.5fold)
mpgAcc10fold <- bestNonLinier(data=Auto, respon='mpg', prediktor='acceleration', model=nonlinier_model, tuning=tuning, cv.fold=dataLatih.10fold)grid.arrange(mpgAcc5fold$plotCV[[1]], mpgAcc5fold$plotCV[[2]], mpgAcc5fold$plotCV[[3]], ncol = 2, top = textGrob("mpg vs acceleration (5-fold CV)",gp=gpar(fontsize=14,font=3)))mpgAcc5fold$tuningmodelComparison <- data.frame(Auto[c('mpg','acceleration')],
polynomial=mpgAcc5fold$best$poly$pred$pred,
step=mpgAcc5fold$best$step$pred$pred,
naturalsplines=mpgAcc5fold$best$naturalsplines$pred$pred)
accCompare <- ggplot(modelComparison) +
geom_point(aes_string(x = 'acceleration', y = 'mpg', fill = as.factor(1))) +
geom_line(aes_string(x = 'acceleration', y = "polynomial", color = as.factor(2)), size = 1) +
geom_line(aes_string(x = 'acceleration', y = "step", color = as.factor(3)), size = 1) +
geom_line(aes_string(x = 'acceleration', y = "naturalsplines", color = as.factor(4)), size = 1) +
# scale_fill_manual(labels = c("Observed"), values = c("black")) +
scale_color_manual(labels = c("Polynomial", "Piecewise", "Natural Cubic Splines"), values = c("blue", "red", "green")) +
ggtitle("Perbandingan model terbaik") +
theme(legend.title = element_blank(), legend.position = "bottom", legend.box = "horizontal")
grid.arrange(mpgAcc5fold$plotPred[[1]], mpgAcc5fold$plotPred[[2]], mpgAcc5fold$plotPred[[3]], accCompare, ncol = 2, top = textGrob("5-fold CV",gp=gpar(fontsize=14,font=3)))grid.arrange(mpgAcc10fold$plotCV[[1]], mpgAcc10fold$plotCV[[2]], mpgAcc10fold$plotCV[[3]], ncol = 2, top = textGrob("mpg vs acceleration (10-fold CV)",gp=gpar(fontsize=14,font=3)))mpgAcc10fold$tuningmodelComparison <- data.frame(Auto[c('mpg','acceleration')],
polynomial=mpgAcc10fold$best$poly$pred$pred,
step=mpgAcc10fold$best$step$pred$pred,
naturalsplines=mpgAcc10fold$best$naturalsplines$pred$pred)
accCompare <- ggplot(modelComparison) +
geom_point(aes_string(x = 'acceleration', y = 'mpg', fill = as.factor(1))) +
geom_line(aes_string(x = 'acceleration', y = "polynomial", color = as.factor(2)), size = 1) +
geom_line(aes_string(x = 'acceleration', y = "step", color = as.factor(3)), size = 1) +
geom_line(aes_string(x = 'acceleration', y = "naturalsplines", color = as.factor(4)), size = 1) +
scale_fill_manual(labels = c("Observed"), values = c("black")) +
scale_color_manual(labels = c("Polynomial", "Piecewise", "Natural Cubic Splines"), values = c("blue", "red", "green")) +
ggtitle("Perbandingan model terbaik") +
theme(legend.title = element_blank(), legend.position = "bottom", legend.box = "horizontal")
grid.arrange(mpgAcc10fold$plotPred[[1]], mpgAcc10fold$plotPred[[2]], mpgAcc10fold$plotPred[[3]], accCompare, ncol = 2, top = textGrob("10-fold CV",gp=gpar(fontsize=14,font=3)))Berdasarkan simulasi tiga model regresi non-linier di atas baik itu dengan 5-fold maupun 10-fold validasi silang, dapat ditunjukkan model terbaik baik untuk masing-masing peubah prediktor sebagai berikut
summary5fold <- cbind(prediktor=c('horsepower', 'weight', 'acceleration'), rbind(mpgHorsepower5fold$tuning[which.min(mpgHorsepower5fold$tuning$RMSE),],
mpgWeight5fold$tuning[which.min(mpgWeight5fold$tuning$RMSE),],
mpgAcc5fold$tuning[which.min(mpgAcc5fold$tuning$RMSE),]))
row.names(summary5fold) <- NULL
summary5foldsummary10fold <- cbind(prediktor=c('horsepower', 'weight', 'acceleration'), rbind(mpgHorsepower10fold$tuning[which.min(mpgHorsepower10fold$tuning$RMSE),],
mpgWeight10fold$tuning[which.min(mpgWeight10fold$tuning$RMSE),],
mpgAcc10fold$tuning[which.min(mpgAcc10fold$tuning$RMSE),]))
row.names(summary10fold) <- NULL
summary10foldHasil simulasi di atas masih perlu dilakukan pengujian berulang lebih lanjut baik itu dari jumlah parameter maupun n-fold validasi silang atau menggunakan metode lain yang lebih robust agar dapat diperoleh model yang lebih baik lagi.
Mahasiswa Pascasarjana Statistika dan Sains Data, IPB University, NIM: G1501211013, alfanugraha@apps.ipb.ac.id↩︎