library(tidyverse)## Warning: package 'tidyverse' was built under R version 4.0.5
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.4 v dplyr 1.0.7
## v tidyr 1.1.3 v stringr 1.4.0
## v readr 1.4.0 v forcats 0.5.1
## Warning: package 'ggplot2' was built under R version 4.0.5
## Warning: package 'tibble' was built under R version 4.0.5
## Warning: package 'tidyr' was built under R version 4.0.5
## Warning: package 'readr' was built under R version 4.0.3
## Warning: package 'dplyr' was built under R version 4.0.5
## Warning: package 'stringr' was built under R version 4.0.5
## Warning: package 'forcats' was built under R version 4.0.5
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(splines)
library(rsample)## Warning: package 'rsample' was built under R version 4.0.5
library(mlr3measures)## Warning: package 'mlr3measures' was built under R version 4.0.5
## In order to avoid name clashes, do not attach 'mlr3measures'. Instead, only load the namespace with `requireNamespace("mlrmeasures")` and access the measures directly via `::`, e.g. `mlr3measures::auc()`.
library(glmnet)## Warning: package 'glmnet' was built under R version 4.0.5
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
## Loaded glmnet 4.1-2
library(gridExtra)## Warning: package 'gridExtra' was built under R version 4.0.3
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
library(caret)## Warning: package 'caret' was built under R version 4.0.3
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following objects are masked from 'package:mlr3measures':
##
## precision, recall, sensitivity, specificity
## The following object is masked from 'package:purrr':
##
## lift
library(boot)## Warning: package 'boot' was built under R version 4.0.3
##
## Attaching package: 'boot'
## The following object is masked from 'package:lattice':
##
## melanoma
Dataset yang akan digunakan yaitu data Auto yang berasal dari package ISLR. Beberapa variabel data yang terdapat di data Auto yaitu:
mpg : miles per galloncylinders : Number of cylinders between 4 and 8displacement : Engine displacement (cu. inches)horsepower : Engine horsepowerweight : Vehicle weight (lbs.)acceleration : Time to accelerate from 0 to 60 mph (sec.)year : Model year (modulo 100)origin : Origin of car (1. American, 2. European, 3. Japanese)name : Vehicle nameData terdiri dari 392 observasi
library(ISLR)## Warning: package 'ISLR' was built under R version 4.0.5
attach(Auto)## The following object is masked from package:ggplot2:
##
## mpg
head(Auto)## mpg cylinders displacement horsepower weight acceleration year origin
## 1 18 8 307 130 3504 12.0 70 1
## 2 15 8 350 165 3693 11.5 70 1
## 3 18 8 318 150 3436 11.0 70 1
## 4 16 8 304 150 3433 12.0 70 1
## 5 17 8 302 140 3449 10.5 70 1
## 6 15 8 429 198 4341 10.0 70 1
## name
## 1 chevrolet chevelle malibu
## 2 buick skylark 320
## 3 plymouth satellite
## 4 amc rebel sst
## 5 ford torino
## 6 ford galaxie 500
Dari data Auto akan dimodelkan mpg vs horsepower, dimana dua variabel ini jika digambarkan dalam bentuk scatterplot diperoleh:
ggplot(Auto,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="coral") +
theme_bw()+
ggtitle("scatterplot mpg vs horsepower")Lakukan cross validation untuk menghasilkan pemodelan mpg vs horsepower optimal menggunakan 3 metode berikut, kemudian bandingkan hasilnya
Dari data Auto yang akan dimodelkan dimana variabel responnya yaitu mpg (bahan bakar) dan variabel prediktornya yaitu horsepower. Karena hasil dari pemodelan regresi polinomial bergantung dengan derajat polinomialnya. Jika \(d\) yang digunakan semakin besar dan kurvanya menjadi liar maka untuk menjawab bagian a ini akan dicari terlebih dahulu derajat terbaiknya.
set.seed(1)
deltas <- numeric(10)
model <- list()
for (i in 1:10) {
fit <- glm(mpg ~ poly(horsepower, i), data = Auto)
deltas[i] <- cv.glm(Auto, fit, K = 10)$delta[1]
model[[i]] <- fit
}
plot(1:10, deltas, xlab = "Degree", ylab = "Test MSE", type = "l")
d.min <- which.min(deltas)
d.min## [1] 7
model_best <- model[[d.min]]
summary(model_best)##
## Call:
## glm(formula = mpg ~ poly(horsepower, i), data = Auto)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -15.5171 -2.5682 -0.2082 2.1757 15.2986
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 23.446 0.217 108.058 < 2e-16 ***
## poly(horsepower, i)1 -120.138 4.296 -27.966 < 2e-16 ***
## poly(horsepower, i)2 44.090 4.296 10.263 < 2e-16 ***
## poly(horsepower, i)3 -3.949 4.296 -0.919 0.35856
## poly(horsepower, i)4 -5.188 4.296 -1.208 0.22794
## poly(horsepower, i)5 13.272 4.296 3.089 0.00215 **
## poly(horsepower, i)6 -8.546 4.296 -1.989 0.04737 *
## poly(horsepower, i)7 7.981 4.296 1.858 0.06397 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 18.4548)
##
## Null deviance: 23819.0 on 391 degrees of freedom
## Residual deviance: 7086.6 on 384 degrees of freedom
## AIC: 2265.2
##
## Number of Fisher Scoring iterations: 2
points(which.min(deltas), deltas[which.min(deltas)], col = "red", cex = 2, pch = 20)set.seed(1)
cross_val <- vfold_cv(Auto, v=10, strata = "mpg")
derajat <- 1:10
polynom <- map_dfr(derajat, function(i){
metric_polynom <- map_dfr(cross_val$splits, function(x){
mod <- lm(mpg ~ poly(horsepower, derajat=i, raw = T), data=Auto[x$in_id,])
pred <- predict(mod, newdata=Auto[-x$in_id,])
truth <- Auto[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth, response = pred)
mae <- mlr3measures::mae(truth = truth, response = pred)
metric <- c(rmse, mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
metric_polynom
}
)
polynom <- cbind(derajat = derajat, polynom)
tibble(polynom)## # A tibble: 100 x 3
## derajat rmse mae
## <int> <dbl> <dbl>
## 1 1 5.62 4.39
## 2 2 4.46 3.50
## 3 3 4.75 4.10
## 4 4 4.84 3.74
## 5 5 4.40 3.33
## 6 6 4.64 3.73
## 7 7 6.38 5.11
## 8 8 4.85 3.61
## 9 9 4.48 3.48
## 10 10 4.44 3.45
## # ... with 90 more rows
#berdasarkan rmse
polynom %>% slice_min(rmse)## derajat rmse mae
## 1 5 3.594957 2.684384
#berdasarkan mae
polynom %>% slice_min(mae)## derajat rmse mae
## 1 5 3.600731 2.657939
#menghitung rata-rata untuk 10 folds
mean_polynom <- colMeans(polynom)
mean_polynom## derajat rmse mae
## 5.500000 4.384655 3.314488
polinomial5 <- ggplot(Auto, aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="coral") +
stat_smooth(method = "lm", formula = y~poly(x, 5, raw=T), lty = 1, col = "blue",se = F) +
theme_bw()+
ggtitle("Regresi polinomial d=5")polinomial7 <- ggplot(Auto, aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="coral") +
stat_smooth(method = "lm", formula = y~poly(x, 7, raw=T), lty = 1, col = "blue",se = F) +
theme_bw()+
ggtitle("Regresi polinomial d=7")grid.arrange(polinomial5, polinomial7, ncol=2)mod_polinomial5 = lm(mpg ~ poly(horsepower,5),data=Auto)
summary(mod_polinomial5)##
## Call:
## lm(formula = mpg ~ poly(horsepower, 5), data = Auto)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.4326 -2.5285 -0.2925 2.1750 15.9730
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 23.4459 0.2185 107.308 < 2e-16 ***
## poly(horsepower, 5)1 -120.1377 4.3259 -27.772 < 2e-16 ***
## poly(horsepower, 5)2 44.0895 4.3259 10.192 < 2e-16 ***
## poly(horsepower, 5)3 -3.9488 4.3259 -0.913 0.36190
## poly(horsepower, 5)4 -5.1878 4.3259 -1.199 0.23117
## poly(horsepower, 5)5 13.2722 4.3259 3.068 0.00231 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.326 on 386 degrees of freedom
## Multiple R-squared: 0.6967, Adjusted R-squared: 0.6928
## F-statistic: 177.4 on 5 and 386 DF, p-value: < 2.2e-16
set.seed(1)
cross_val <- vfold_cv(Auto,v=10,strata = "mpg")
nbreak <- 2:8 #atau knotsnya 1-7
best_tangga <- map_dfr(nbreak, function(i){
metric_tangga <- map_dfr(cross_val$splits, function(x){
training <- Auto[x$in_id,]
training$horsepower <- cut(training$horsepower,i)
mod <- lm(mpg ~ horsepower, data=training)
labs_x <- levels(mod$model[,2])
labs_x_breaks <- cbind(
lower = as.numeric(sub("\\((.+),.*", "\\1", labs_x)),
upper = as.numeric(sub("[^,]*,([^]]*)\\]", "\\1", labs_x)))
testing <- Auto[-x$in_id,]
horsepower_new <- cut(testing$horsepower, c(labs_x_breaks[1,1], labs_x_breaks[,2]))
pred <- predict(mod, newdata=list(horsepower=horsepower_new))
truth <- testing$mpg
data_eval <- na.omit(data.frame(truth,pred))
rmse <- mlr3measures::rmse(truth = data_eval$truth, response = data_eval$pred)
mae <- mlr3measures::mae(truth = data_eval$truth, response = data_eval$pred)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
metric_tangga
# menghitung rata-rata untuk 10 folds
mean_metric_tangga <- colMeans(metric_tangga)
mean_metric_tangga
}
)best_tangga <- cbind(breaks=nbreak,best_tangga)
# menampilkan hasil all breaks
best_tangga## breaks rmse mae
## 1 2 6.142665 4.792166
## 2 3 5.757634 4.523240
## 3 4 4.975044 3.786801
## 4 5 4.672002 3.564542
## 5 6 4.610925 3.514587
## 6 7 4.441881 3.330109
## 7 8 4.412751 3.386265
#berdasarkan rmse
best_tangga %>% slice_min(rmse)## breaks rmse mae
## 1 8 4.412751 3.386265
#berdasarkan mae
best_tangga %>% slice_min(mae)## breaks rmse mae
## 1 7 4.441881 3.330109
karena dari nilai RMSE dan MAE nilai yang terendah terdapat dua breaks yang terbaik yaitu 7 dan 8 maka akan divisualisasikan untuk mengetahui breaks terbaik berdasarkan plot yang dihasilkan:
Piecewise7 <- ggplot(Auto,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="coral") +
stat_smooth(method = "lm",
formula = y~cut(x,7),
lty = 1, col = "blue",se = F)+
theme_bw()+
ggtitle("Piecewise constant (Knots=6)")Piecewise8 <- ggplot(Auto,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="coral") +
stat_smooth(method = "lm",
formula = y~cut(x,8),
lty = 1, col = "blue",se = F)+
theme_bw()+
ggtitle("Piecewise constant (Knots=7)")grid.arrange(Piecewise7, Piecewise8, ncol=2)Dari plot diperoleh bahwa dengan breks=8 lebih mendekati data sehingga dipilih breaks = 8.
Hasil pemodelan:
mod_tangga8 = lm(mpg ~ cut(horsepower,8),data=Auto)
summary(mod_tangga8) #knots = 7##
## Call:
## lm(formula = mpg ~ cut(horsepower, 8), data = Auto)
##
## Residuals:
## Min 1Q Median 3Q Max
## -11.9471 -2.6757 -0.1533 2.4015 14.5529
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 33.9085 0.5771 58.76 <2e-16 ***
## cut(horsepower, 8)(69,92] -6.9614 0.6910 -10.07 <2e-16 ***
## cut(horsepower, 8)(92,115] -12.7551 0.7425 -17.18 <2e-16 ***
## cut(horsepower, 8)(115,138] -15.6656 1.1264 -13.91 <2e-16 ***
## cut(horsepower, 8)(138,161] -18.7799 0.8568 -21.92 <2e-16 ***
## cut(horsepower, 8)(161,184] -19.9735 1.1470 -17.41 <2e-16 ***
## cut(horsepower, 8)(184,207] -21.1228 1.7720 -11.92 <2e-16 ***
## cut(horsepower, 8)(207,230] -21.0085 1.5159 -13.86 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.433 on 384 degrees of freedom
## Multiple R-squared: 0.6832, Adjusted R-squared: 0.6774
## F-statistic: 118.3 on 7 and 384 DF, p-value: < 2.2e-16
Fungsi spline kubik diketahui bergantung pada knot, tetapi kita tahu jika knots yang dipilih terlalu banyak akan menghasilkan model yang overfit. Sehingga, dipilih knot dari 1-7 untuk dicari knot terbaik yang sesuai dengan data yang akan dimodelkan.
set.seed(1)
cross_val <- vfold_cv(Auto,v=10,strata = "mpg")
df <- 2:8 #knots 1-7
best_spline <- map_dfr(df, function(i){
metric_spline <- map_dfr(cross_val$splits,
function(x){
mod <- lm(mpg ~ ns(horsepower,df=i), data=Auto[x$in_id,])
pred <- predict(mod, newdata=Auto[-x$in_id,])
truth <- Auto[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth, response = pred)
mae <- mlr3measures::mae(truth = truth, response = pred)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
metric_spline
}
)
best_spline <- cbind(df=df, best_spline)
tibble(best_spline)## # A tibble: 70 x 3
## df rmse mae
## <int> <dbl> <dbl>
## 1 2 4.73 3.44
## 2 3 4.45 3.35
## 3 4 4.12 3.38
## 4 5 4.17 3.03
## 5 6 3.76 2.83
## 6 7 4.13 3.18
## 7 8 5.80 4.32
## 8 2 3.94 2.78
## 9 3 3.79 2.81
## 10 4 4.34 3.42
## # ... with 60 more rows
Dari perhitungan berdasarkan nilai RMSE dan MAE terkecil diperoleh df=3
#berdasarkan rmse
best_spline %>% slice_min(rmse)## df rmse mae
## 1 3 3.522074 2.614708
#berdasarkan mae
best_spline %>% slice_min(mae)## df rmse mae
## 1 3 3.522074 2.614708
menghitung rata-rata untuk 10 folds
mean_best_spline <- colMeans(best_spline)
mean_best_spline## df rmse mae
## 5.000000 4.311911 3.242990
dim(ns(Auto$horsepower, knots = c(100, 200)))## [1] 392 3
atau bisa juga dengan fungsi df
dim(ns(Auto$horsepower, df=3))## [1] 392 3
#nilai knots yang ditentukan oleh komputer
attr(ns(Auto$horsepower, df=3),"knots")## 33.33333% 66.66667%
## 84 110
ggplot(Auto,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="coral")+
stat_smooth(method = "lm",
formula = y~ns(x, knots = c(60, 97, 134, 171, 208)),
lty = 1,se=F)+
theme_bw()+
ggtitle("Natural cubic spline df=3 (knots = 100, 200)")mod_spline3 = lm(mpg ~ ns(horsepower, knots = c(100, 200)), data=Auto)
summary(mod_spline3)##
## Call:
## lm(formula = mpg ~ ns(horsepower, knots = c(100, 200)), data = Auto)
##
## Residuals:
## Min 1Q Median 3Q Max
## -14.8663 -2.4810 -0.1824 2.2352 15.9235
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 38.2682 0.7899 48.446 < 2e-16 ***
## ns(horsepower, knots = c(100, 200))1 -21.6528 1.2771 -16.955 < 2e-16 ***
## ns(horsepower, knots = c(100, 200))2 -43.5820 2.5991 -16.768 < 2e-16 ***
## ns(horsepower, knots = c(100, 200))3 -10.6246 1.6747 -6.344 6.24e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.359 on 388 degrees of freedom
## Multiple R-squared: 0.6905, Adjusted R-squared: 0.6881
## F-statistic: 288.5 on 3 and 388 DF, p-value: < 2.2e-16
nilai_rmse <- rbind(3.594957,
4.412751,
3.522074
)
nilai_mae <- rbind(2.657939,
3.386265,
2.614708
)
nama_model <- c("Regresi polinomial (d=5)",
"Piecewise constant (breaks=8)",
"Natural cubic spline(knots=100, 200)"
)
tibble(hasil <- data.frame(nama_model, nilai_rmse, nilai_mae))## # A tibble: 3 x 3
## nama_model nilai_rmse nilai_mae
## <chr> <dbl> <dbl>
## 1 Regresi polinomial (d=5) 3.59 2.66
## 2 Piecewise constant (breaks=8) 4.41 3.39
## 3 Natural cubic spline(knots=100, 200) 3.52 2.61
polinomial5 <- ggplot(Auto, aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="coral") +
stat_smooth(method = "lm", formula = y~poly(x, 5, raw=T), lty = 1, col = "blue",se = F) +
theme_bw()+
ggtitle("Regpol (d=5)")Piecewise8 <- ggplot(Auto,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="coral") +
stat_smooth(method = "lm",
formula = y~cut(x,8),
lty = 1, col = "blue",se = F)+
theme_bw()+
ggtitle("FT (Knots=7)")ncs3 <- ggplot(Auto,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="coral")+
stat_smooth(method = "lm",
formula = y~ns(x, knots = c(60, 97, 134, 171, 208)),
lty = 1,se=F)+
theme_bw()+
ggtitle("NCS (df=3)")grid.arrange(polinomial5, Piecewise8, ncs3, ncol=3)Jika berdasarkan nilai RMSE dan MAE yang terkecil maka model terbaik yang diperoleh adalah Natural Cubic Spline dengan df=3 dimana kurvanya lebih mulus.
Lakukan hal di atas untuk masing-masing sub-set data berdasarkan asal negara produsennya (Amerika, Eropa, Jepang). Plot model terbaik dalam satu frame. Berikan ulasan terhadap hasil Anda.
Auto_US <- Auto[Auto$origin == 1, ] %>% select(mpg, horsepower, origin)
tibble(Auto_US)## # A tibble: 245 x 3
## mpg horsepower origin
## <dbl> <dbl> <dbl>
## 1 18 130 1
## 2 15 165 1
## 3 18 150 1
## 4 16 150 1
## 5 17 140 1
## 6 15 198 1
## 7 14 220 1
## 8 14 215 1
## 9 14 225 1
## 10 15 190 1
## # ... with 235 more rows
Dari data Auto akan dimodelkan mpg vs horsepower yang berasal dari Amerika (Auto_US) saja atau sebanyak 245 observasi, dimana dua variabel ini jika digambarkan dalam bentuk scatterplot yaitu:
ggplot(Auto_US,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="coral") +
theme_bw()+
ggtitle("mpg vs horsepower, Amerika")set.seed(1)
deltas <- numeric(10)
model <- list()
for (i in 1:10) {
fit <- glm(mpg ~ poly(horsepower, i), data = Auto_US)
deltas[i] <- cv.glm(Auto_US, fit, K = 10)$delta[1]
model[[i]] <- fit
}
plot(1:10, deltas, xlab = "Degree", ylab = "Test MSE", type = "l")
d.min <- which.min(deltas)
d.min## [1] 7
model_best <- model[[d.min]]
summary(model_best)##
## Call:
## glm(formula = mpg ~ poly(horsepower, i), data = Auto_US)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -14.2229 -2.1009 -0.3205 2.0260 13.3482
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 20.0335 0.2426 82.568 < 2e-16 ***
## poly(horsepower, i)1 -75.6095 3.7977 -19.909 < 2e-16 ***
## poly(horsepower, i)2 29.4684 3.7977 7.759 2.53e-13 ***
## poly(horsepower, i)3 -5.8086 3.7977 -1.529 0.127
## poly(horsepower, i)4 1.8002 3.7977 0.474 0.636
## poly(horsepower, i)5 6.0818 3.7977 1.601 0.111
## poly(horsepower, i)6 -3.0887 3.7977 -0.813 0.417
## poly(horsepower, i)7 5.8186 3.7977 1.532 0.127
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 14.42288)
##
## Null deviance: 10120.8 on 244 degrees of freedom
## Residual deviance: 3418.2 on 237 degrees of freedom
## AIC: 1359
##
## Number of Fisher Scoring iterations: 2
points(which.min(deltas), deltas[which.min(deltas)], col = "red", cex = 2, pch = 20)set.seed(1)
cross_val <- vfold_cv(Auto_US, v=10, strata = "mpg")
derajat <- 1:10
polynomUS <- map_dfr(derajat, function(i){
metric_polynomUS <- map_dfr(cross_val$splits, function(x){
mod <- lm(mpg ~ poly(horsepower, derajat=i), data=Auto_US[x$in_id,])
pred <- predict(mod, newdata=Auto_US[-x$in_id,])
truth <- Auto_US[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth, response = pred)
mae <- mlr3measures::mae(truth = truth, response = pred)
metric <- c(rmse, mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
metric_polynomUS
}
)
polynomUS <- cbind(derajat = derajat, polynomUS)
tibble(polynomUS)## # A tibble: 100 x 3
## derajat rmse mae
## <int> <dbl> <dbl>
## 1 1 4.39 3.49
## 2 2 3.25 2.90
## 3 3 3.76 3.14
## 4 4 3.95 3.32
## 5 5 4.25 2.97
## 6 6 3.92 2.90
## 7 7 5.17 3.76
## 8 8 4.16 2.99
## 9 9 4.43 3.70
## 10 10 5.21 4.36
## # ... with 90 more rows
#berdasarkan rmse
polynomUS %>% slice_min(rmse)## derajat rmse mae
## 1 3 2.667124 2.237141
#berdasarkan mae
polynomUS %>% slice_min(mae)## derajat rmse mae
## 1 6 2.67477 2.071569
#menghitung rata-rata untuk 10 folds
mean_polynomUS <- colMeans(polynomUS)
mean_polynomUS## derajat rmse mae
## 5.500000 4.268948 3.074561
polinomialUS3 <- ggplot(Auto_US, aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="coral") +
stat_smooth(method = "lm", formula = y~poly(x, 3), lty = 1, col = "blue",se = F) +
theme_bw()+
ggtitle("Regpol (d=3)")polinomialUS6 <- ggplot(Auto_US, aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="coral") +
stat_smooth(method = "lm", formula = y~poly(x, 6), lty = 1, col = "blue",se = F) +
theme_bw()+
ggtitle("Regpol (d=6)")polinomialUS7 <- ggplot(Auto_US, aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="coral") +
stat_smooth(method = "lm", formula = y~poly(x, 7), lty = 1, col = "blue",se = F) +
theme_bw()+
ggtitle("Regpol (d=7)")grid.arrange(polinomialUS3, polinomialUS6, polinomialUS7, ncol=3)mod_polinomialUS3 = lm(mpg ~ poly(horsepower,3),data=Auto_US)
summary(mod_polinomialUS3)##
## Call:
## lm(formula = mpg ~ poly(horsepower, 3), data = Auto_US)
##
## Residuals:
## Min 1Q Median 3Q Max
## -13.3092 -2.3199 -0.2081 2.0794 13.2928
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 20.0335 0.2435 82.262 < 2e-16 ***
## poly(horsepower, 3)1 -75.6095 3.8119 -19.835 < 2e-16 ***
## poly(horsepower, 3)2 29.4684 3.8119 7.731 2.89e-13 ***
## poly(horsepower, 3)3 -5.8086 3.8119 -1.524 0.129
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.812 on 241 degrees of freedom
## Multiple R-squared: 0.654, Adjusted R-squared: 0.6497
## F-statistic: 151.8 on 3 and 241 DF, p-value: < 2.2e-16
set.seed(1)
cross_val <- vfold_cv(Auto_US,v=10,strata = "mpg")
breaks <- 2:8 #atau knotsnya 1-7
best_tanggaUS <- map_dfr(breaks, function(i){
metric_tanggaUS <- map_dfr(cross_val$splits, function(x){
training <- Auto_US[x$in_id,]
training$horsepower <- cut(training$horsepower,i)
mod <- lm(mpg ~ horsepower, data=training)
labs_x <- levels(mod$model[,2])
labs_x_breaks <- cbind(
lower = as.numeric(sub("\\((.+),.*", "\\1", labs_x)),
upper = as.numeric(sub("[^,]*,([^]]*)\\]", "\\1", labs_x)))
testing <- Auto_US[-x$in_id,]
horsepower_new <- cut(testing$horsepower, c(labs_x_breaks[1,1], labs_x_breaks[,2]))
pred <- predict(mod, newdata=list(horsepower=horsepower_new))
truth <- testing$mpg
data_eval <- na.omit(data.frame(truth,pred))
rmse <- mlr3measures::rmse(truth = data_eval$truth, response = data_eval$pred)
mae <- mlr3measures::mae(truth = data_eval$truth, response = data_eval$pred)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
metric_tanggaUS
# menghitung rata-rata untuk 10 folds
mean_metric_tanggaUS <- colMeans(metric_tanggaUS)
mean_metric_tanggaUS
}
)best_tanggaUS <- cbind(breaks=breaks,best_tanggaUS)
# menampilkan hasil all breaks
tibble(best_tanggaUS)## # A tibble: 7 x 3
## breaks rmse mae
## <int> <dbl> <dbl>
## 1 2 5.05 3.87
## 2 3 4.78 3.69
## 3 4 4.14 3.11
## 4 5 4.05 3.16
## 5 6 4.27 3.27
## 6 7 4.17 3.18
## 7 8 3.95 2.98
#berdasarkan rmse
best_tanggaUS %>% slice_min(rmse)## breaks rmse mae
## 1 8 3.954823 2.97661
#berdasarkan mae
best_tanggaUS %>% slice_min(mae)## breaks rmse mae
## 1 8 3.954823 2.97661
karena dari nilai RMSE dan MAE nilai yang terkecil terdapat break yang terbaik yaitu 8 maka:
ggplot(Auto_US ,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="coral") +
stat_smooth(method = "lm",
formula = y~cut(x,8),
lty = 1, col = "blue",se = F)+
theme_bw()+
ggtitle("Piecewise constant Knots=7")mod_tangga8 = lm(mpg ~ cut(horsepower,8),data=Auto_US)
summary(mod_tangga8)##
## Call:
## lm(formula = mpg ~ cut(horsepower, 8), data = Auto_US)
##
## Residuals:
## Min 1Q Median 3Q Max
## -14.420 -1.900 0.065 2.159 13.159
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 29.4200 0.8729 33.706 < 2e-16 ***
## cut(horsepower, 8)(74.2,96.5] -4.5787 0.9824 -4.661 5.26e-06 ***
## cut(horsepower, 8)(96.5,119] -9.6780 1.0328 -9.371 < 2e-16 ***
## cut(horsepower, 8)(119,141] -12.7330 1.1935 -10.669 < 2e-16 ***
## cut(horsepower, 8)(141,163] -14.7050 1.0690 -13.756 < 2e-16 ***
## cut(horsepower, 8)(163,186] -15.4850 1.2344 -12.545 < 2e-16 ***
## cut(horsepower, 8)(186,208] -16.6343 1.7143 -9.704 < 2e-16 ***
## cut(horsepower, 8)(208,230] -16.5200 1.5118 -10.927 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.904 on 237 degrees of freedom
## Multiple R-squared: 0.6432, Adjusted R-squared: 0.6326
## F-statistic: 61.03 on 7 and 237 DF, p-value: < 2.2e-16
set.seed(1)
cross_val <- vfold_cv(Auto_US,v=10,strata = "mpg")
df <- 2:8 #knots 1-7
best_splineUS <- map_dfr(df, function(i){
metric_splineUS <- map_dfr(cross_val$splits,
function(x){
mod <- lm(mpg ~ ns(horsepower,df=i), data=Auto_US[x$in_id,])
pred <- predict(mod, newdata=Auto_US[-x$in_id,])
truth <- Auto_US[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth, response = pred)
mae <- mlr3measures::mae(truth = truth, response = pred)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
metric_splineUS
}
)
best_splineUS <- cbind(df=df, best_splineUS)
tibble(best_splineUS)## # A tibble: 70 x 3
## df rmse mae
## <int> <dbl> <dbl>
## 1 2 4.45 3.70
## 2 3 2.92 2.53
## 3 4 2.82 2.37
## 4 5 3.49 2.83
## 5 6 4.00 2.53
## 6 7 2.90 2.21
## 7 8 4.72 3.34
## 8 2 3.83 2.63
## 9 3 3.57 2.66
## 10 4 5.07 4.21
## # ... with 60 more rows
#berdasarkan rmse
best_splineUS %>% slice_min(rmse)## df rmse mae
## 1 8 2.373059 1.974183
#berdasarkan mae
best_splineUS %>% slice_min(mae)## df rmse mae
## 1 5 2.373138 1.937187
menghitung rata-rata untuk 10 folds
mean_best_splineUS <- colMeans(best_splineUS)
mean_best_splineUS## df rmse mae
## 5.000000 3.800996 2.894464
dim(ns(Auto_US$horsepower, knots = c(60, 105, 150, 195)))## [1] 245 5
dim(ns(Auto_US$horsepower, knots = c(60, 85, 110, 135, 160, 185, 210)))## [1] 245 8
NCSUS5 <- ggplot(Auto_US,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="coral")+
stat_smooth(method = "lm",
formula = y~ns(x, knots = c(60, 105, 150, 195)),
lty = 1,se=F)+
theme_bw()+
ggtitle("NCS (df=5)")NCSUS8 <- ggplot(Auto_US,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="coral")+
stat_smooth(method = "lm",
formula = y~ns(x, knots = c(60, 85, 110, 135, 160, 185, 210)),
lty = 1,se=F)+
theme_bw()+
ggtitle("NCS (df=8)")grid.arrange(NCSUS5, NCSUS8, ncol=2)mod_spline5 = lm(mpg ~ ns(horsepower, knots = c(60, 105, 150, 195)),data=Auto_US)
summary(mod_spline5) #knots = 5##
## Call:
## lm(formula = mpg ~ ns(horsepower, knots = c(60, 105, 150, 195)),
## data = Auto_US)
##
## Residuals:
## Min 1Q Median 3Q Max
## -13.6161 -2.2149 -0.3081 2.0518 13.0756
##
## Coefficients:
## Estimate Std. Error t value
## (Intercept) 33.438 2.467 13.556
## ns(horsepower, knots = c(60, 105, 150, 195))1 -15.866 2.232 -7.108
## ns(horsepower, knots = c(60, 105, 150, 195))2 -17.755 3.009 -5.901
## ns(horsepower, knots = c(60, 105, 150, 195))3 -21.007 2.296 -9.151
## ns(horsepower, knots = c(60, 105, 150, 195))4 -23.779 5.358 -4.438
## ns(horsepower, knots = c(60, 105, 150, 195))5 -17.260 1.955 -8.828
## Pr(>|t|)
## (Intercept) < 2e-16 ***
## ns(horsepower, knots = c(60, 105, 150, 195))1 1.35e-11 ***
## ns(horsepower, knots = c(60, 105, 150, 195))2 1.23e-08 ***
## ns(horsepower, knots = c(60, 105, 150, 195))3 < 2e-16 ***
## ns(horsepower, knots = c(60, 105, 150, 195))4 1.39e-05 ***
## ns(horsepower, knots = c(60, 105, 150, 195))5 2.32e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.812 on 239 degrees of freedom
## Multiple R-squared: 0.6569, Adjusted R-squared: 0.6497
## F-statistic: 91.53 on 5 and 239 DF, p-value: < 2.2e-16
nilai_rmse <- rbind(2.667124,
3.954823,
2.373138
)
nilai_mae <- rbind(2.237141,
2.97661,
1.937187
)
nama_model <- c("Regresi polinomial 'US' (d=5)",
"Piecewise constant 'US' (breaks=8)",
"Natural cubic spline 'US' (knots=60, 105, 150, 195)"
)
tibble(hasil <- data.frame(nama_model, nilai_rmse, nilai_mae))## # A tibble: 3 x 3
## nama_model nilai_rmse nilai_mae
## <chr> <dbl> <dbl>
## 1 Regresi polinomial 'US' (d=5) 2.67 2.24
## 2 Piecewise constant 'US' (breaks=8) 3.95 2.98
## 3 Natural cubic spline 'US' (knots=60, 105, 150, 195) 2.37 1.94
polinomialUS3 <- ggplot(Auto_US, aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="coral") +
stat_smooth(method = "lm", formula = y~poly(x, 3), lty = 1, col = "blue",se = F) +
theme_bw()+
ggtitle("Regpol 'US' (d=3)")PiecewiseUS8 <- ggplot(Auto_US ,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="coral") +
stat_smooth(method = "lm",
formula = y~cut(x,8),
lty = 1, col = "blue",se = F)+
theme_bw()+
ggtitle("FT 'US' Knots=7")NCSUS5 <- ggplot(Auto_US,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="coral")+
stat_smooth(method = "lm",
formula = y~ns(x, knots = c(60, 105, 150, 195)),
lty = 1,se=F)+
theme_bw()+
ggtitle("NCS 'US' (df=5)")grid.arrange(polinomialUS3, PiecewiseUS8, NCSUS5, ncol=3)Berdasarkan nilai RMSE dan MAE terkecil diperoleh bahwa model yang terbaik yaitu Natural Cubic Spline df=5.
Auto_EU <- Auto[Auto$origin == 2, ] %>% select(mpg, horsepower, origin)
tibble(Auto_EU)## # A tibble: 68 x 3
## mpg horsepower origin
## <dbl> <dbl> <dbl>
## 1 26 46 2
## 2 25 87 2
## 3 24 90 2
## 4 25 95 2
## 5 26 113 2
## 6 28 90 2
## 7 30 70 2
## 8 30 76 2
## 9 27 60 2
## 10 23 54 2
## # ... with 58 more rows
Dari data Auto akan dimodelkan mpg vs horsepower yang berasal dari Eropa (Auto_EU) saja atau sebanyak 68 observasi, dimana dua variabel ini jika digambarkan dalam bentuk scatterplot yaitu:
ggplot(Auto_EU,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="coral") +
theme_bw()+
ggtitle("mpg vs horsepower, Eropa")set.seed(1)
deltas <- numeric(10)
model <- list()
# metode polinomial
for (i in 1:10) {
fit <- glm(mpg ~ poly(horsepower, i), data = Auto_EU)
deltas[i] <- cv.glm(Auto_EU, fit, K = 10)$delta[1]
model[[i]] <- fit
}
plot(1:10, deltas, xlab = "Degree", ylab = "Test MSE", type = "l")
d.min <- which.min(deltas)
d.min## [1] 1
model_best <- model[[d.min]]
summary(model_best)##
## Call:
## glm(formula = mpg ~ poly(horsepower, i), data = Auto_EU)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -10.4946 -2.8387 -0.4171 2.2148 12.8858
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 27.6029 0.5898 46.800 < 2e-16 ***
## poly(horsepower, i) -36.6027 4.8637 -7.526 1.87e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 23.65553)
##
## Null deviance: 2901.0 on 67 degrees of freedom
## Residual deviance: 1561.3 on 66 degrees of freedom
## AIC: 412.07
##
## Number of Fisher Scoring iterations: 2
points(which.min(deltas), deltas[which.min(deltas)], col = "red", cex = 2, pch = 20)set.seed(1)
cross_val <- vfold_cv(Auto_EU, v=10, strata = "mpg")## Warning: The number of observations in each quantile is below the recommended
## threshold of 20. Stratification will be done with 3 breaks instead.
derajat <- 1:10
polynomEU <- map_dfr(derajat, function(i){
metric_polynomEU <- map_dfr(cross_val$splits, function(x){
mod <- lm(mpg ~ poly(horsepower, derajat=i), data=Auto_EU[x$in_id,])
pred <- predict(mod, newdata=Auto_EU[-x$in_id,])
truth <- Auto_EU[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth, response = pred)
mae <- mlr3measures::mae(truth = truth, response = pred)
metric <- c(rmse, mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
metric_polynomEU
}
)
polynomEU <- cbind(derajat = derajat, polynomEU)
tibble(polynomEU)## # A tibble: 100 x 3
## derajat rmse mae
## <int> <dbl> <dbl>
## 1 1 7.54 6.35
## 2 2 4.21 3.38
## 3 3 4.93 4.09
## 4 4 2.85 2.56
## 5 5 5.45 4.18
## 6 6 4.55 3.85
## 7 7 2.49 1.88
## 8 8 5.21 3.79
## 9 9 2.33 2.11
## 10 10 6.26 5.08
## # ... with 90 more rows
#berdasarkan rmse
polynomEU %>% slice_min(rmse)## derajat rmse mae
## 1 9 1.998743 1.641463
#berdasarkan mae
polynomEU %>% slice_min(mae)## derajat rmse mae
## 1 9 1.998743 1.641463
#menghitung rata-rata untuk 10 folds
mean_polynomEU <- colMeans(polynomEU)
mean_polynomEU## derajat rmse mae
## 5.500000 6.507944 4.678497
polinomialEU1 <- ggplot(Auto_EU, aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="coral") +
stat_smooth(method = "lm", formula = y~poly(x, 1, raw=T), lty = 1, col = "blue",se = F) +
theme_bw()+
ggtitle("Regresi polinomial d=1")polinomialEU9 <- ggplot(Auto_EU, aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="coral") +
stat_smooth(method = "lm", formula = y~poly(x, 9, raw=T), lty = 1, col = "blue",se = F) +
theme_bw()+
ggtitle("Regresi polinomial d=9")grid.arrange(polinomialEU1, polinomialEU9, ncol=2)mod_polinomialEU1 = lm(mpg ~ poly(horsepower,1),data=Auto_EU)
summary(mod_polinomialEU1)##
## Call:
## lm(formula = mpg ~ poly(horsepower, 1), data = Auto_EU)
##
## Residuals:
## Min 1Q Median 3Q Max
## -10.4946 -2.8387 -0.4171 2.2148 12.8858
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 27.6029 0.5898 46.800 < 2e-16 ***
## poly(horsepower, 1) -36.6027 4.8637 -7.526 1.87e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.864 on 66 degrees of freedom
## Multiple R-squared: 0.4618, Adjusted R-squared: 0.4537
## F-statistic: 56.64 on 1 and 66 DF, p-value: 1.869e-10
set.seed(1)
cross_val <- vfold_cv(Auto_EU ,v=10,strata = "mpg")## Warning: The number of observations in each quantile is below the recommended
## threshold of 20. Stratification will be done with 3 breaks instead.
breaks <- 2:8 #atau knotsnya 1-7
best_tanggaEU <- map_dfr(breaks, function(i){
metric_tanggaEU <- map_dfr(cross_val$splits, function(x){
training <- Auto_EU[x$in_id,]
training$horsepower <- cut(training$horsepower,i)
mod <- lm(mpg ~ horsepower, data=training)
labs_x <- levels(mod$model[,2])
labs_x_breaks <- cbind(
lower = as.numeric(sub("\\((.+),.*", "\\1", labs_x)),
upper = as.numeric(sub("[^,]*,([^]]*)\\]", "\\1", labs_x)))
testing <- Auto_EU[-x$in_id,]
horsepower_new <- cut(testing$horsepower, c(labs_x_breaks[1,1], labs_x_breaks[,2]))
pred <- predict(mod, newdata=list(horsepower=horsepower_new))
truth <- testing$mpg
data_eval <- na.omit(data.frame(truth,pred))
rmse <- mlr3measures::rmse(truth = data_eval$truth, response = data_eval$pred)
mae <- mlr3measures::mae(truth = data_eval$truth, response = data_eval$pred)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
metric_tanggaEU
# menghitung rata-rata untuk 10 folds
mean_metric_tanggaEU <- colMeans(metric_tanggaEU)
mean_metric_tanggaEU
}
)best_tanggaEU <- cbind(breaks=breaks,best_tanggaEU)
# menampilkan hasil all breaks
best_tanggaEU## breaks rmse mae
## 1 2 5.216476 4.164261
## 2 3 5.247791 4.313735
## 3 4 4.980629 4.177602
## 4 5 4.733983 3.847658
## 5 6 4.890634 3.975050
## 6 7 4.676543 3.812823
## 7 8 4.896469 4.024421
#berdasarkan rmse
best_tanggaEU %>% slice_min(rmse)## breaks rmse mae
## 1 7 4.676543 3.812823
#berdasarkan mae
best_tanggaEU %>% slice_min(mae)## breaks rmse mae
## 1 7 4.676543 3.812823
karena dari nilai RMSE dan MAE nilai yang terendah terdapat break yang terbaik yaitu 7 maka:
ggplot(Auto_EU ,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="coral") +
stat_smooth(method = "lm",
formula = y~cut(x,7),
lty = 1, col = "blue",se = F)+
theme_bw()+
ggtitle("Piecewise constant Knots=6")mod_tangga7 = lm(mpg ~ cut(horsepower,7),data=Auto_EU)
summary(mod_tangga7)##
## Call:
## lm(formula = mpg ~ cut(horsepower, 7), data = Auto_EU)
##
## Residuals:
## Min 1Q Median 3Q Max
## -11.9778 -3.2148 -0.1364 2.5255 12.5174
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 34.978 1.661 21.061 < 2e-16 ***
## cut(horsepower, 7)(58.4,70.9] -4.841 2.239 -2.162 0.03456 *
## cut(horsepower, 7)(70.9,83.3] -5.995 1.959 -3.060 0.00328 **
## cut(horsepower, 7)(83.3,95.7] -10.228 2.197 -4.655 1.79e-05 ***
## cut(horsepower, 7)(95.7,108] -14.211 3.322 -4.278 6.75e-05 ***
## cut(horsepower, 7)(108,121] -13.528 2.421 -5.588 5.69e-07 ***
## cut(horsepower, 7)(121,133] -18.378 3.895 -4.718 1.43e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.982 on 61 degrees of freedom
## Multiple R-squared: 0.478, Adjusted R-squared: 0.4267
## F-statistic: 9.31 on 6 and 61 DF, p-value: 3.066e-07
set.seed(1)
cross_val <- vfold_cv(Auto_EU,v=10,strata = "mpg")## Warning: The number of observations in each quantile is below the recommended
## threshold of 20. Stratification will be done with 3 breaks instead.
df <- 2:8 #knots 1-7
best_splineEU <- map_dfr(df, function(i){
metric_splineEU <- map_dfr(cross_val$splits,
function(x){
mod <- lm(mpg ~ ns(horsepower,df=i), data=Auto_EU[x$in_id,])
pred <- predict(mod, newdata=Auto_EU[-x$in_id,])
truth <- Auto_EU[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth, response = pred)
mae <- mlr3measures::mae(truth = truth, response = pred)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
metric_splineEU
}
)
best_splineEU <- cbind(df=df, best_splineEU)
tibble(best_splineEU)## # A tibble: 70 x 3
## df rmse mae
## <int> <dbl> <dbl>
## 1 2 7.54 6.34
## 2 3 4.58 3.55
## 3 4 4.96 4.11
## 4 5 2.85 2.56
## 5 6 5.46 4.18
## 6 7 4.58 3.91
## 7 8 2.47 1.87
## 8 2 5.29 3.79
## 9 3 2.31 2.12
## 10 4 6.67 5.36
## # ... with 60 more rows
#berdasarkan rmse
best_splineEU %>% slice_min(rmse)## df rmse mae
## 1 7 2.160783 1.958299
#berdasarkan mae
best_splineEU %>% slice_min(mae)## df rmse mae
## 1 8 2.470629 1.865665
menghitung rata-rata untuk 10 folds
mean_best_splineEU <- colMeans(best_splineEU)
mean_best_splineEU## df rmse mae
## 5.000000 4.831375 3.899562
dim(ns(Auto_EU$horsepower, knots = c(60, 65, 70, 75, 80, 85)))## [1] 68 7
dim(ns(Auto_EU$horsepower, knots = c(60, 65, 70, 75, 80, 85, 90)))## [1] 68 8
splineEU7 <- ggplot(Auto_EU,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="coral")+
stat_smooth(method = "lm",
formula = y~ns(x, knots = c(60, 65, 70, 75, 80, 85)),
lty = 1,se=F)+
theme_bw()+
ggtitle("NCS (df=7)")splineEU8 <- ggplot(Auto_EU,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="coral")+
stat_smooth(method = "lm",
formula = y~ns(x, knots = c(60, 65, 70, 75, 80, 85, 90)),
lty = 1,se=F)+
theme_bw()+
ggtitle("NCS (df=8)")grid.arrange(splineEU7, splineEU8, ncol=2)mod_spline7 = lm(mpg ~ ns(horsepower, knots = c(60, 65, 70, 75, 80, 85)),data=Auto_EU)
summary(mod_spline7)##
## Call:
## lm(formula = mpg ~ ns(horsepower, knots = c(60, 65, 70, 75, 80,
## 85)), data = Auto_EU)
##
## Residuals:
## Min 1Q Median 3Q Max
## -11.1859 -2.8266 -0.6488 2.0932 12.3173
##
## Coefficients:
## Estimate Std. Error t value
## (Intercept) 35.019 2.582 13.562
## ns(horsepower, knots = c(60, 65, 70, 75, 80, 85))1 -6.072 5.219 -1.164
## ns(horsepower, knots = c(60, 65, 70, 75, 80, 85))2 -4.586 4.540 -1.010
## ns(horsepower, knots = c(60, 65, 70, 75, 80, 85))3 -5.289 3.828 -1.382
## ns(horsepower, knots = c(60, 65, 70, 75, 80, 85))4 -7.422 3.640 -2.039
## ns(horsepower, knots = c(60, 65, 70, 75, 80, 85))5 -12.595 4.183 -3.011
## ns(horsepower, knots = c(60, 65, 70, 75, 80, 85))6 -15.858 8.277 -1.916
## ns(horsepower, knots = c(60, 65, 70, 75, 80, 85))7 -17.319 4.220 -4.104
## Pr(>|t|)
## (Intercept) < 2e-16 ***
## ns(horsepower, knots = c(60, 65, 70, 75, 80, 85))1 0.249230
## ns(horsepower, knots = c(60, 65, 70, 75, 80, 85))2 0.316514
## ns(horsepower, knots = c(60, 65, 70, 75, 80, 85))3 0.172190
## ns(horsepower, knots = c(60, 65, 70, 75, 80, 85))4 0.045877 *
## ns(horsepower, knots = c(60, 65, 70, 75, 80, 85))5 0.003807 **
## ns(horsepower, knots = c(60, 65, 70, 75, 80, 85))6 0.060164 .
## ns(horsepower, knots = c(60, 65, 70, 75, 80, 85))7 0.000125 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.072 on 60 degrees of freedom
## Multiple R-squared: 0.4679, Adjusted R-squared: 0.4058
## F-statistic: 7.537 on 7 and 60 DF, p-value: 1.691e-06
nilai_rmse <- rbind(7.536994,
4.676543,
2.160783
)
nilai_mae <- rbind(6.339356,
3.812823,
1.958299
)
nama_model <- c("Regresi polinomial 'EU' (d=5)",
"Piecewise constant 'EU' (breaks=8)",
"Natural cubic spline 'EU' (knots=60, 105, 150, 195)"
)
tibble(hasil <- data.frame(nama_model, nilai_rmse, nilai_mae))## # A tibble: 3 x 3
## nama_model nilai_rmse nilai_mae
## <chr> <dbl> <dbl>
## 1 Regresi polinomial 'EU' (d=5) 7.54 6.34
## 2 Piecewise constant 'EU' (breaks=8) 4.68 3.81
## 3 Natural cubic spline 'EU' (knots=60, 105, 150, 195) 2.16 1.96
polinomialEU1 <- ggplot(Auto_EU, aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="coral") +
stat_smooth(method = "lm", formula = y~poly(x, 1, raw=T), lty = 1, col = "blue",se = F) +
theme_bw()+
ggtitle("Regpol 'EU' (d=1)")PiecewiseEU7 <- ggplot(Auto_EU ,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="coral") +
stat_smooth(method = "lm",
formula = y~cut(x,7),
lty = 1, col = "blue",se = F)+
theme_bw()+
ggtitle("FT 'EU' Knots=6")splineEU7 <- ggplot(Auto_EU,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="coral")+
stat_smooth(method = "lm",
formula = y~ns(x, knots = c(60, 65, 70, 75, 80, 85)),
lty = 1,se=F)+
theme_bw()+
ggtitle("NCS'EU' (df=7)")grid.arrange(polinomialEU1, PiecewiseEU7, splineEU7, ncol=3)Berdasarkan nilai RMSE dan MAE terkecil diperoleh bahwa model yang terbaik yaitu Piecewise Constant breaks 7.
Auto_J <- Auto[Auto$origin == 3, ] %>% select(mpg, horsepower, origin)
tibble(Auto_J)## # A tibble: 79 x 3
## mpg horsepower origin
## <dbl> <dbl> <dbl>
## 1 24 95 3
## 2 27 88 3
## 3 27 88 3
## 4 25 95 3
## 5 31 65 3
## 6 35 69 3
## 7 24 95 3
## 8 19 97 3
## 9 28 92 3
## 10 23 97 3
## # ... with 69 more rows
Dari data Auto akan dimodelkan mpg vs horsepower yang berasal dari Jepang (Auto_J) saja atau sebanyak 79 observasi, dimana dua variabel ini jika digambarkan dalam bentuk scatterplot yaitu:
ggplot(Auto_J,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="coral") +
theme_bw()+
ggtitle("mpg vs horsepower, Jepang")set.seed(1)
deltas <- numeric(10)
model <- list()
# metode polinomial
for (i in 1:10) {
fit <- glm(mpg ~ poly(horsepower, i), data = Auto_J)
deltas[i] <- cv.glm(Auto_J, fit, K = 10)$delta[1]
model[[i]] <- fit
}
plot(1:10, deltas, xlab = "Degree", ylab = "Test MSE", type = "l")
d.min <- which.min(deltas)
d.min## [1] 5
model_best <- model[[d.min]]
summary(model_best)##
## Call:
## glm(formula = mpg ~ poly(horsepower, i), data = Auto_J)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -8.3880 -2.6375 -0.3287 1.8352 11.0450
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 30.451 0.450 67.667 < 2e-16 ***
## poly(horsepower, i)1 -36.203 4.000 -9.051 1.49e-13 ***
## poly(horsepower, i)2 9.197 4.000 2.299 0.0244 *
## poly(horsepower, i)3 16.699 4.000 4.175 8.14e-05 ***
## poly(horsepower, i)4 -1.574 4.000 -0.394 0.6951
## poly(horsepower, i)5 6.963 4.000 1.741 0.0859 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 15.99807)
##
## Null deviance: 2892.9 on 78 degrees of freedom
## Residual deviance: 1167.9 on 73 degrees of freedom
## AIC: 450.98
##
## Number of Fisher Scoring iterations: 2
points(which.min(deltas), deltas[which.min(deltas)], col = "red", cex = 2, pch = 20)set.seed(1)
cross_val <- vfold_cv(Auto_J, v=10, strata = "mpg")## Warning: The number of observations in each quantile is below the recommended
## threshold of 20. Stratification will be done with 3 breaks instead.
derajat <- 1:10
polynomJ <- map_dfr(derajat, function(i){
metric_polynomJ <- map_dfr(cross_val$splits, function(x){
mod <- lm(mpg ~ poly(horsepower, derajat=i, raw = T), data=Auto_J[x$in_id,])
pred <- predict(mod, newdata=Auto_J[-x$in_id,])
truth <- Auto_J[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth, response = pred)
mae <- mlr3measures::mae(truth = truth, response = pred)
metric <- c(rmse, mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
metric_polynomJ
}
)## Warning in predict.lm(mod, newdata = Auto_J[-x$in_id, ]): prediction from a
## rank-deficient fit may be misleading
## Warning in predict.lm(mod, newdata = Auto_J[-x$in_id, ]): prediction from a
## rank-deficient fit may be misleading
## Warning in predict.lm(mod, newdata = Auto_J[-x$in_id, ]): prediction from a
## rank-deficient fit may be misleading
## Warning in predict.lm(mod, newdata = Auto_J[-x$in_id, ]): prediction from a
## rank-deficient fit may be misleading
## Warning in predict.lm(mod, newdata = Auto_J[-x$in_id, ]): prediction from a
## rank-deficient fit may be misleading
## Warning in predict.lm(mod, newdata = Auto_J[-x$in_id, ]): prediction from a
## rank-deficient fit may be misleading
## Warning in predict.lm(mod, newdata = Auto_J[-x$in_id, ]): prediction from a
## rank-deficient fit may be misleading
## Warning in predict.lm(mod, newdata = Auto_J[-x$in_id, ]): prediction from a
## rank-deficient fit may be misleading
## Warning in predict.lm(mod, newdata = Auto_J[-x$in_id, ]): prediction from a
## rank-deficient fit may be misleading
## Warning in predict.lm(mod, newdata = Auto_J[-x$in_id, ]): prediction from a
## rank-deficient fit may be misleading
polynomJ <- cbind(derajat = derajat, polynomJ)
tibble(polynomJ)## # A tibble: 100 x 3
## derajat rmse mae
## <int> <dbl> <dbl>
## 1 1 4.26 3.93
## 2 2 5.48 4.61
## 3 3 3.60 3.03
## 4 4 3.11 2.71
## 5 5 3.26 2.38
## 6 6 2.73 2.11
## 7 7 5.15 3.14
## 8 8 7.72 5.56
## 9 9 4.99 4.01
## 10 10 5.90 4.70
## # ... with 90 more rows
#berdasarkan rmse
polynomJ %>% slice_min(rmse)## derajat rmse mae
## 1 4 2.427555 2.023453
#berdasarkan mae
polynomJ %>% slice_min(mae)## derajat rmse mae
## 1 4 2.435324 1.828376
#menghitung rata-rata untuk 10 folds
mean_polynomJ <- colMeans(polynomJ)
mean_polynomJ## derajat rmse mae
## 5.500000 9.652879 5.487174
polinomialJ4 <- ggplot(Auto_J, aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="coral") +
stat_smooth(method = "lm", formula = y~poly(x, 4), lty = 1, col = "blue",se = F) +
theme_bw()+
ggtitle("Regresi polinomial d=4")polinomialJ5 <- ggplot(Auto_J, aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="coral") +
stat_smooth(method = "lm", formula = y~poly(x, 5), lty = 1, col = "blue",se = F) +
theme_bw()+
ggtitle("Regresi polinomial d=5")grid.arrange(polinomialJ4, polinomialJ5, ncol=2)mod_polinomialJ4 = lm(mpg ~ poly(horsepower,4),data=Auto_J)
summary(mod_polinomialJ4)##
## Call:
## lm(formula = mpg ~ poly(horsepower, 4), data = Auto_J)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.6872 -2.5927 -0.4482 2.1121 11.5842
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 30.4506 0.4561 66.757 < 2e-16 ***
## poly(horsepower, 4)1 -36.2030 4.0543 -8.930 2.26e-13 ***
## poly(horsepower, 4)2 9.1966 4.0543 2.268 0.0262 *
## poly(horsepower, 4)3 16.6991 4.0543 4.119 9.81e-05 ***
## poly(horsepower, 4)4 -1.5741 4.0543 -0.388 0.6989
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.054 on 74 degrees of freedom
## Multiple R-squared: 0.5795, Adjusted R-squared: 0.5568
## F-statistic: 25.5 on 4 and 74 DF, p-value: 2.684e-13
set.seed(1)
cross_val <- vfold_cv(Auto_J ,v=10,strata = "mpg")## Warning: The number of observations in each quantile is below the recommended
## threshold of 20. Stratification will be done with 3 breaks instead.
breaks <- 2:5 #atau knotsnya 1-4
best_tanggaJ <- map_dfr(breaks, function(i){
metric_tanggaJ <- map_dfr(cross_val$splits, function(x){
training <- Auto_J[x$in_id,]
training$horsepower <- cut(training$horsepower,i)
mod <- lm(mpg ~ horsepower, data=training)
labs_x <- levels(mod$model[,2])
labs_x_breaks <- cbind(
lower = as.numeric(sub("\\((.+),.*", "\\1", labs_x)),
upper = as.numeric(sub("[^,]*,([^]]*)\\]", "\\1", labs_x)))
testing <- Auto_J[-x$in_id,]
horsepower_new <- cut(testing$horsepower, c(labs_x_breaks[1,1], labs_x_breaks[,2]))
pred <- predict(mod, newdata=list(horsepower=horsepower_new))
truth <- testing$mpg
data_eval <- na.omit(data.frame(truth,pred))
rmse <- mlr3measures::rmse(truth = data_eval$truth, response = data_eval$pred)
mae <- mlr3measures::mae(truth = data_eval$truth, response = data_eval$pred)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
metric_tanggaJ
# menghitung rata-rata untuk 10 folds
mean_metric_tanggaJ <- colMeans(metric_tanggaJ)
mean_metric_tanggaJ
}
)best_tanggaJ <- cbind(breaks=breaks,best_tanggaJ)
# menampilkan hasil all breaks
best_tanggaJ## breaks rmse mae
## 1 2 4.367104 3.284685
## 2 3 4.119029 3.335941
## 3 4 4.197867 3.273378
## 4 5 4.203681 3.339308
#berdasarkan rmse
best_tanggaJ %>% slice_min(rmse)## breaks rmse mae
## 1 3 4.119029 3.335941
#berdasarkan mae
best_tanggaJ %>% slice_min(mae)## breaks rmse mae
## 1 4 4.197867 3.273378
karena dari nilai RMSE dan MAE nilai yang terecil terdapat break yang terbaik yaitu 3 dan 4 maka:
PiecewiseJ3 <- ggplot(Auto_J ,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="coral") +
stat_smooth(method = "lm",
formula = y~cut(x,3),
lty = 1, col = "blue",se = F)+
theme_bw()+
ggtitle("Piecewise constant Knots=2")PiecewiseJ4 <- ggplot(Auto_J ,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="coral") +
stat_smooth(method = "lm",
formula = y~cut(x,4),
lty = 1, col = "blue",se = F)+
theme_bw()+
ggtitle("Piecewise constant Knots=3")grid.arrange(PiecewiseJ3, PiecewiseJ4, ncol=2)mod_tangga4 = lm(mpg ~ cut(horsepower,4),data=Auto_J)
summary(mod_tangga4)##
## Call:
## lm(formula = mpg ~ cut(horsepower, 4), data = Auto_J)
##
## Residuals:
## Min 1Q Median 3Q Max
## -11.444 -2.644 -0.590 2.579 11.803
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 34.7973 0.6901 50.425 < 2e-16 ***
## cut(horsepower, 4)(72,92] -5.3529 1.2063 -4.438 3.07e-05 ***
## cut(horsepower, 4)(92,112] -10.5073 1.1650 -9.019 1.37e-13 ***
## cut(horsepower, 4)(112,132] -9.2223 2.2093 -4.174 7.97e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.198 on 75 degrees of freedom
## Multiple R-squared: 0.5432, Adjusted R-squared: 0.5249
## F-statistic: 29.73 on 3 and 75 DF, p-value: 9.028e-13
set.seed(1)
cross_val <- vfold_cv(Auto_J,v=10,strata = "mpg")## Warning: The number of observations in each quantile is below the recommended
## threshold of 20. Stratification will be done with 3 breaks instead.
df <- 2:8 # knots 1-7
best_splineJ <- map_dfr(df, function(i){
metric_splineJ <- map_dfr(cross_val$splits,
function(x){
mod <- lm(mpg ~ ns(horsepower,df=i), data=Auto_J[x$in_id,])
pred <- predict(mod, newdata=Auto_J[-x$in_id,])
truth <- Auto_J[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth, response = pred)
mae <- mlr3measures::mae(truth = truth, response = pred)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
metric_splineJ
}
)
best_splineJ <- cbind(df=df, best_splineJ)
tibble(best_splineJ)## # A tibble: 70 x 3
## df rmse mae
## <int> <dbl> <dbl>
## 1 2 4.08 3.67
## 2 3 5.49 4.58
## 3 4 3.96 3.07
## 4 5 3.17 2.69
## 5 6 3.25 2.39
## 6 7 3.77 3.19
## 7 8 5.08 3.03
## 8 2 8.25 5.88
## 9 3 4.94 3.95
## 10 4 5.95 4.85
## # ... with 60 more rows
#berdasarkan rmse
best_splineJ %>% slice_min(rmse)## df rmse mae
## 1 8 2.440677 2.025495
#berdasarkan mae
best_splineJ %>% slice_min(mae)## df rmse mae
## 1 6 2.534937 1.84849
menghitung rata-rata untuk 10 folds
mean_best_splineJ <- colMeans(best_splineJ)
mean_best_splineJ## df rmse mae
## 5.000000 4.350359 3.311150
dim(ns(Auto_J$horsepower, knots = c(60, 70, 80, 90, 100)))## [1] 79 6
dim(ns(Auto_J$horsepower, knots = c(60, 65, 70, 75, 80, 85, 90)))## [1] 79 8
splineJ6 <- ggplot(Auto_J,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="coral")+
stat_smooth(method = "lm",
formula = y~ns(x, knots = c(60, 70, 80, 90, 100)),
lty = 1,se=F)+
theme_bw()+
ggtitle("NCS (df=6)")splineJ8 <- ggplot(Auto_J,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="coral")+
stat_smooth(method = "lm",
formula = y~ns(x, knots = c(60, 65, 70, 75, 80, 85, 90)),
lty = 1,se=F)+
theme_bw()+
ggtitle("NCS (df=8)")grid.arrange(splineJ6, splineJ8, ncol=2)mod_spline6 = lm(mpg ~ ns(horsepower, knots = c(60, 70, 80, 90, 100)),data=Auto_J)
summary(mod_spline6)##
## Call:
## lm(formula = mpg ~ ns(horsepower, knots = c(60, 70, 80, 90, 100)),
## data = Auto_J)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.3101 -2.5469 -0.3148 1.5988 10.7935
##
## Coefficients:
## Estimate Std. Error t value
## (Intercept) 32.295 2.170 14.885
## ns(horsepower, knots = c(60, 70, 80, 90, 100))1 1.311 3.063 0.428
## ns(horsepower, knots = c(60, 70, 80, 90, 100))2 -4.295 4.826 -0.890
## ns(horsepower, knots = c(60, 70, 80, 90, 100))3 -3.902 2.849 -1.369
## ns(horsepower, knots = c(60, 70, 80, 90, 100))4 -16.676 3.668 -4.546
## ns(horsepower, knots = c(60, 70, 80, 90, 100))5 -2.550 5.587 -0.456
## ns(horsepower, knots = c(60, 70, 80, 90, 100))6 -5.274 3.772 -1.398
## Pr(>|t|)
## (Intercept) < 2e-16 ***
## ns(horsepower, knots = c(60, 70, 80, 90, 100))1 0.670
## ns(horsepower, knots = c(60, 70, 80, 90, 100))2 0.376
## ns(horsepower, knots = c(60, 70, 80, 90, 100))3 0.175
## ns(horsepower, knots = c(60, 70, 80, 90, 100))4 2.16e-05 ***
## ns(horsepower, knots = c(60, 70, 80, 90, 100))5 0.650
## ns(horsepower, knots = c(60, 70, 80, 90, 100))6 0.166
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.064 on 72 degrees of freedom
## Multiple R-squared: 0.589, Adjusted R-squared: 0.5548
## F-statistic: 17.2 on 6 and 72 DF, p-value: 3.165e-12
nilai_rmse <- rbind(2.427555,
4.197867,
2.534937
)
nilai_mae <- rbind(2.023453,
3.273378,
1.84849
)
nama_model <- c("Regresi polinomial 'J' (d=5)",
"Piecewise constant 'J' (breaks=4)",
"Natural cubic spline 'J' (knots=60, 70, 80, 90, 100)"
)
tibble(hasil <- data.frame(nama_model, nilai_rmse, nilai_mae))## # A tibble: 3 x 3
## nama_model nilai_rmse nilai_mae
## <chr> <dbl> <dbl>
## 1 Regresi polinomial 'J' (d=5) 2.43 2.02
## 2 Piecewise constant 'J' (breaks=4) 4.20 3.27
## 3 Natural cubic spline 'J' (knots=60, 70, 80, 90, 100) 2.53 1.85
polinomialJ4 <- ggplot(Auto_J, aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="coral") +
stat_smooth(method = "lm", formula = y~poly(x, 4), lty = 1, col = "blue",se = F) +
theme_bw()+
ggtitle("Regpol 'J' (d=4)")PiecewiseJ4 <- ggplot(Auto_J ,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="coral") +
stat_smooth(method = "lm",
formula = y~cut(x,4),
lty = 1, col = "blue",se = F)+
theme_bw()+
ggtitle("FT 'J' Knots=3")splineJ7 <- ggplot(Auto_J,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="coral")+
stat_smooth(method = "lm",
formula = y~ns(x, knots = c(60, 70, 80, 90, 100)),
lty = 1,se=F)+
theme_bw()+
ggtitle("NCS 'J' (df=7)")grid.arrange(polinomialJ4, PiecewiseJ4, splineJ7, ncol=3)Berdasarkan nilai RMSE dan MAE terkecil model terbaik yang diperoleh adalah Naturral Cubic Spline tetapi jika dilihat dari kurva yang tidak mulus maka diambil nilai RMSE dan MAE terkecil kedua yaitu Piecewise Constant dengan knots= 3.
Sartono, B. (November 1, 2020). Moving Beyond Linearity. Retrieved from https://rpubs.com/bagusco/taklinear
Dito, G.A. (April 14, 2021). Regresi Polinomial, Regresi Fungsi Tangga dan Regresi Spline di R. Retrieved from https://gerrydito.github.io/Regresi-Polinomial,-Regresi-Fungsi-Tangga-dan-Regresi-Spline/
TERIMAKASIH