Soal
Data bisa diperoleh untuk tugas dapat diperoleh di package ISLR. Silahkan Install dulu packagenya.
Gunakan model-model non-linier yang sudah dipelajari pada data Auto dengan peubah respon mpg dan pilih tiga kolom untuk dijadikan peubah prediktor.
Apakah ada bukti untuk hubungan non-linier untuk peubah-peubah yang anda pilih? Buat visualisasi untuk Mendukung jawaban Anda.
Tentukan model non-linear terbaik untuk masing pasangan peubah (Secara visual dan/atau secara empiris)
library(tidyverse)
library(splines)
library(ISLR)
library(ggplot2)
library(rsample)
library(caret)
head(Auto) mpg cylinders displacement horsepower weight acceleration year origin
1 18 8 307 130 3504 12.0 70 1
2 15 8 350 165 3693 11.5 70 1
3 18 8 318 150 3436 11.0 70 1
4 16 8 304 150 3433 12.0 70 1
5 17 8 302 140 3449 10.5 70 1
6 15 8 429 198 4341 10.0 70 1
name
1 chevrolet chevelle malibu
2 buick skylark 320
3 plymouth satellite
4 amc rebel sst
5 ford torino
6 ford galaxie 500
mpg Vs horsepower
Berdasarkan scatter Plot, sebaran data tidak linier, secara visual terdapat bukti untuk hubungan non-linier. Lebih lanjut, hubungan ini ada diuji secara empiris melalui model regresi non-linier, yaitu polinomial, fungsi tangga dan natural cubic spline.
Scatter Plot mpg Vs horsepower
Regresi Linier
set.seed(1501211036)
cross_val <- vfold_cv(Auto,v=10,strata = "mpg")
metric_linear <- map_dfr(cross_val$splits,
function(x){
mod <- lm(mpg ~ horsepower,
data=Auto[x$in_id,])
pred <- predict(mod,
newdata=Auto[-x$in_id,])
truth <- Auto[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth,
response = pred
)
mae <- mlr3measures::mae(truth = truth,
response = pred
)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
metric_linear# A tibble: 10 x 2
rmse mae
<dbl> <dbl>
1 4.92 3.98
2 5.31 3.77
3 4.44 3.44
4 4.55 3.71
5 5.43 4.47
6 4.75 3.75
7 5.25 4.37
8 4.83 3.92
9 5.16 3.75
10 4.34 3.18
rmse mae
4.898189 3.834290
Regresi Polinomial
polinomial = function(dataset){
modelterbaik = c()
derajat = 1:10
for (i in derajat){
set.seed(1501211036)
cross_val = vfold_cv(dataset,v=10,strata = "mpg") #rsample
metric_poly = map_dfr(cross_val$splits, #tidiverse Iterasi pada suatu fungsi (Output data.frame)
function(x){
mod = lm(mpg ~ poly(horsepower,i,raw = T),
data=dataset[x$in_id,]) #in_id a/ data training
pred = predict(mod,
newdata=dataset[-x$in_id,])
# nilai prediksi untuk data testing
truth = dataset[-x$in_id,]$mpg
rmse = mlr3measures::rmse(truth = truth,
response = pred)
mae = mlr3measures::mae(truth = truth,
response = pred)
metric = c(rmse,mae)
names(metric) = c("rmse","mae")
return(metric)
}
)
rata_rata = colMeans(metric_poly)
modelterbaik = c(modelterbaik,rata_rata)
}
M = matrix(modelterbaik, byrow = T,ncol = 2)
colnames(M) = c("rmse","mae")
bestmpoly = as.data.frame(cbind(derajat,M))
bestmpoly = arrange(bestmpoly, bestmpoly$rmse)
return(bestmpoly)
}
bestmpoly = polinomial(Auto)
bestmpoly derajat rmse mae
1 7 4.328586 3.239488
2 5 4.343273 3.248301
3 6 4.345171 3.262537
4 2 4.347103 3.263367
5 8 4.351402 3.260481
6 3 4.354304 3.271791
7 4 4.374685 3.290646
8 9 4.380581 3.269677
9 10 4.415139 3.287987
10 1 4.898189 3.834290
Secara empiris regresi polinomial dengan derajat 7 memiliki rmse maupun mae terkecil. Namun, di sini dipilih regresi polinomial dengan derajat 2 karena jika lebih dari dejarat 4, akan ada indikasi maalasah overfitting.
Hal ini karena meskipun memiliki rmse atau mae yang kecil, polinomial dengan derajat lebih besar dari 4 cenderung memiliki pola yang tidak teratur, khususnya pada ujung-ujung kurvanya.
Fungsi Tangga (Piecewise Constant)
ftangga = function(dataset){
set.seed(1501211036)
cross_val = vfold_cv(dataset,v=10,strata = "mpg")
breaks = 2:15
best_tangga = map_dfr(breaks, function(i){
metric_tangga = map_dfr(cross_val$splits,
function(x){
training = dataset[x$in_id,]
training$horsepower = cut(training$horsepower,i) #Transformasi jadi factor
mod = lm(mpg ~ horsepower,data=training)
#Convert ke numeric lagi
labs_x = levels(mod$model[,2])
labs_x_breaks = cbind(lower = as.numeric( sub("\\((.+),.*",
"\\1", labs_x) ),
upper = as.numeric( sub("[^,]*,([^]]*)\\]",
"\\1", labs_x) ))
#Testing
testing = dataset[-x$in_id,]
horsepower_new = cut(testing$horsepower,c(labs_x_breaks[1,1],
labs_x_breaks[,2]))
pred = predict(mod,
newdata=list(horsepower=horsepower_new))
truth = testing$mpg
data_eval = na.omit(data.frame(truth,pred))
rmse = mlr3measures::rmse(truth = data_eval$truth,
response = data_eval$pred)
mae = mlr3measures::mae(truth = data_eval$truth,
response = data_eval$pred)
metric = c(rmse,mae)
names(metric) = c("rmse","mae")
return(metric)
}
)
colMeans(metric_tangga) # menghitung rata-rata untuk 10 folds
}
)
best_tangga = cbind(breaks=breaks,best_tangga) # menampilkan hasil all breaks
best_tangga = arrange(best_tangga, best_tangga$rmse)
return(best_tangga)
}
tangga = ftangga(Auto)
tangga breaks rmse mae
1 15 4.343245 3.291338
2 14 4.401883 3.257563
3 8 4.436932 3.405170
4 12 4.451700 3.314559
5 7 4.499764 3.359511
6 13 4.511050 3.357282
7 9 4.526533 3.447286
8 11 4.538542 3.350069
9 10 4.578024 3.435559
10 6 4.655029 3.532977
11 5 4.705711 3.568395
12 4 4.990599 3.790641
13 3 5.771438 4.520390
14 2 6.151124 4.792086
Berdasarkan rmse terkecil banyaknya interval yang terbaik adalah 15 sedangkan berdasarkan mae terkecil banyaknya interval terbaik adalah 14.
Natural Cubic Spline
ncspline = function(dataset){
set.seed(1501211036)
cross.val = vfold_cv(dataset,v=10,strata = "mpg")
df = 3:20
best.spline3 = map_dfr(df, function(i){
metric.spline3 = map_dfr(cross.val$splits,
function(x){
mod = lm(mpg ~ splines::ns(horsepower,df=i),data=dataset[x$in_id,])
pred = predict(mod,newdata=dataset[-x$in_id,])
truth = dataset[-x$in_id,]$mpg
rmse = mlr3measures::rmse(truth = truth,
response = pred)
mae = mlr3measures::mae(truth = truth,
response = pred)
metric = c(rmse,mae)
names(metric) = c("rmse","mae")
return(metric)
}
)
metric.spline3
mean.metric.spline3 = colMeans(metric.spline3) #rata-rata untuk 10 folds
mean.metric.spline3
}
)
best.spline3 = cbind(df=df,best.spline3)
basis=data.frame(rbind(best.spline3 %>% slice_min(rmse), #berdasarkan rmse
best.spline3 %>% slice_min(mae))) #berdasarkan mae
df=basis[,"df"]
knot1 = attr(ns(dataset$horsepower, df=df[1]),"knots")
knot2 = attr(ns(dataset$horsepower, df=df[2]),"knots")
return(list(basis=basis,knot1=knot1,knot2=knot2))
}
basisAuto = ncspline(Auto)
basisAuto$basis
df rmse mae
1 7 4.320925 3.230518
2 10 4.323890 3.217920
$knot1
14.28571% 28.57143% 42.85714% 57.14286% 71.42857% 85.71429%
68.85714 79.71429 89.57143 98.00000 113.57143 150.00000
$knot2
10% 20% 30% 40% 50% 60% 70% 80% 90%
67.0 72.0 80.0 88.0 93.5 100.0 110.0 140.0 157.7
Berdasarkan hasil di atas model natural spline terbaik memiliki 7 fungsi basis atau 6 knots
Perbandingan Model
a = as.matrix(bestmpoly[4,])
b = as.matrix(tangga[1,])
c = as.matrix(basisAuto$basis[1,])
d = t(as.matrix(mean_metric_linear))
perbandingan.model = rbind(a[,2:3],b[,2:3],c[,2:3],d)
rownames(perbandingan.model) = c("Polinomial Derajat 2","Fungsi Tangga 15 Interval",
"Natural Cubic Spline 6 Knot", "Model Linier")
as.data.frame(perbandingan.model) rmse mae
Polinomial Derajat 2 4.347103 3.263367
Fungsi Tangga 15 Interval 4.343245 3.291338
Natural Cubic Spline 6 Knot 4.320925 3.230518
Model Linier 4.898189 3.834290
Berdasarkan data empiris perbandingan di atas, model terbaik yang dapat digunakan untuk memodelkan mpg Vs horsepower adalah Natural Cubic Spline 6 Knot
mpg Vs displacement
Berdasarkan scatter Plot, sebaran data tidak linier, secara visual terdapat bukti untuk hubungan non-linier. Lebih lanjut, hubungan ini ada diuji secara empiris melalui model regresi non-linier, yaitu polinomial, fungsi tangga dan natural cubic spline.
Scatter Plot mpg Vs displacement
Regresi Linier
set.seed(1501211036)
cross_val <- vfold_cv(Auto,v=10,strata = "mpg")
metric_linear <- map_dfr(cross_val$splits,
function(x){
mod <- lm(mpg ~ displacement,
data=Auto[x$in_id,])
pred <- predict(mod,
newdata=Auto[-x$in_id,])
truth <- Auto[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth,
response = pred
)
mae <- mlr3measures::mae(truth = truth,
response = pred
)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
metric_linear# A tibble: 10 x 2
rmse mae
<dbl> <dbl>
1 4.79 3.72
2 4.80 3.39
3 4.22 3.01
4 4.18 3.53
5 4.90 3.51
6 5.01 3.96
7 4.97 4.04
8 4.05 3.29
9 4.68 3.48
10 4.64 3.20
rmse mae
4.624689 3.512835
Regresi Polinomial
polinomial = function(dataset){
modelterbaik = c()
derajat = 1:10
for (i in derajat){
set.seed(1501211036)
cross_val = vfold_cv(dataset,v=10,strata = "mpg") #rsample
metric_poly = map_dfr(cross_val$splits, #tidiverse Iterasi pada suatu fungsi (Output data.frame)
function(x){
mod = lm(mpg ~ poly(displacement,i,raw = T),
data=dataset[x$in_id,]) #in_id a/ data training
pred = predict(mod,
newdata=dataset[-x$in_id,])
# nilai prediksi untuk data testing
truth = dataset[-x$in_id,]$mpg
rmse = mlr3measures::rmse(truth = truth,
response = pred)
mae = mlr3measures::mae(truth = truth,
response = pred)
metric = c(rmse,mae)
names(metric) = c("rmse","mae")
return(metric)
}
)
rata_rata = colMeans(metric_poly)
modelterbaik = c(modelterbaik,rata_rata)
}
M = matrix(modelterbaik, byrow = T,ncol = 2)
colnames(M) = c("rmse","mae")
bestmpoly = as.data.frame(cbind(derajat,M))
bestmpoly = arrange(bestmpoly, bestmpoly$rmse)
return(bestmpoly)
}
bestmpoly = polinomial(Auto)
bestmpoly derajat rmse mae
1 10 4.162502 3.119952
2 9 4.203314 3.147555
3 8 4.253503 3.209042
4 7 4.299290 3.202670
5 6 4.353357 3.210716
6 2 4.356654 3.167928
7 3 4.358329 3.175692
8 4 4.368521 3.180040
9 5 4.371482 3.185649
10 1 4.624689 3.512835
Secara empiris regresi polinomial dengan derajat 7 memiliki rmse maupun mae terkecil. Namun, di sini dipilih regresi polinomial dengan derajat 2 karena jika lebih dari dejarat 4, akan ada indikasi maalasah overfitting.
Hal ini karena meskipun memiliki rmse atau mae yang kecil, polinomial dengan derajat lebih besar dari 4 cenderung memiliki pola yang tidak teratur, khususnya pada ujung-ujung kurvanya.
Fungsi Tangga (Piecewise Constant)
ftangga = function(dataset){
set.seed(1501211036)
cross_val = vfold_cv(dataset,v=10,strata = "mpg")
breaks = 2:10
best_tangga = map_dfr(breaks, function(i){
metric_tangga = map_dfr(cross_val$splits,
function(x){
training = dataset[x$in_id,]
training$displacement = cut(training$displacement,i) #Transformasi jadi factor
mod = lm(mpg ~ displacement,data=training)
#Convert ke numeric lagi
labs_x = levels(mod$model[,2])
labs_x_breaks = cbind(lower = as.numeric( sub("\\((.+),.*",
"\\1", labs_x) ),
upper = as.numeric( sub("[^,]*,([^]]*)\\]",
"\\1", labs_x) ))
#Testing
testing = dataset[-x$in_id,]
displacement_new = cut(testing$displacement,c(labs_x_breaks[1,1],
labs_x_breaks[,2]))
pred = predict(mod,
newdata=list(displacement=displacement_new))
truth = testing$mpg
data_eval = na.omit(data.frame(truth,pred))
rmse = mlr3measures::rmse(truth = data_eval$truth,
response = data_eval$pred)
mae = mlr3measures::mae(truth = data_eval$truth,
response = data_eval$pred)
metric = c(rmse,mae)
names(metric) = c("rmse","mae")
return(metric)
}
)
colMeans(metric_tangga) # menghitung rata-rata untuk 10 folds
}
)
best_tangga = cbind(breaks=breaks,best_tangga) # menampilkan hasil all breaks
best_tangga = arrange(best_tangga, best_tangga$rmse)
return(best_tangga)
}
tangga = ftangga(Auto)
tangga breaks rmse mae
1 9 4.237568 3.093370
2 10 4.277541 3.132205
3 8 4.393760 3.236799
4 7 4.614666 3.384960
5 6 4.641657 3.409467
6 5 4.715635 3.508383
7 4 4.854343 3.630170
8 3 4.941093 3.679058
9 2 5.932837 4.636662
Berdasarkan rmse dan mae terkecil banyaknya interval yang terbaik adalah 9.
Natural Cubic Spline
ncspline = function(dataset){
set.seed(1501211036)
cross.val = vfold_cv(dataset,v=10,strata = "mpg")
df = 3:20
best.spline3 = map_dfr(df, function(i){
metric.spline3 = map_dfr(cross.val$splits,
function(x){
mod = lm(mpg ~ splines::ns(displacement,df=i),data=dataset[x$in_id,])
pred = predict(mod,newdata=dataset[-x$in_id,])
truth = dataset[-x$in_id,]$mpg
rmse = mlr3measures::rmse(truth = truth,
response = pred)
mae = mlr3measures::mae(truth = truth,
response = pred)
metric = c(rmse,mae)
names(metric) = c("rmse","mae")
return(metric)
}
)
metric.spline3
mean.metric.spline3 = colMeans(metric.spline3) #rata-rata untuk 10 folds
mean.metric.spline3
}
)
best.spline3 = cbind(df=df,best.spline3)
basis=data.frame(rbind(best.spline3 %>% slice_min(rmse), #berdasarkan rmse
best.spline3 %>% slice_min(mae))) #berdasarkan mae
df=basis[,"df"]
knot1 = attr(ns(dataset$displacement, df=df[1]),"knots")
knot2 = attr(ns(dataset$displacement, df=df[2]),"knots")
return(list(basis=basis,knot1=knot1,knot2=knot2))
}
basisAuto = ncspline(Auto)
basisAuto$basis
df rmse mae
1 12 4.12221 3.064286
2 12 4.12221 3.064286
$knot1
8.333333% 16.66667% 25% 33.33333% 41.66667% 50% 58.33333% 66.66667%
89.0000 97.0000 105.0000 119.0000 130.9167 151.0000 200.0000 232.0000
75% 83.33333% 91.66667%
275.7500 318.0000 351.0000
$knot2
8.333333% 16.66667% 25% 33.33333% 41.66667% 50% 58.33333% 66.66667%
89.0000 97.0000 105.0000 119.0000 130.9167 151.0000 200.0000 232.0000
75% 83.33333% 91.66667%
275.7500 318.0000 351.0000
Berdasarkan hasil di atas model natural spline terbaik memiliki 12 fungsi basis atau 11 knots
Perbandingan Model
a = as.matrix(bestmpoly[6,])
b = as.matrix(tangga[1,])
c = as.matrix(basisAuto$basis[1,])
d = t(as.matrix(mean_metric_linear))
perbandingan.model = rbind(a[,2:3],b[,2:3],c[,2:3],d)
rownames(perbandingan.model) = c("Polinomial Derajat 2","Fungsi Tangga 9 Interval",
"Natural Cubic Spline 11 Knot", "Model Linier")
as.data.frame(perbandingan.model) rmse mae
Polinomial Derajat 2 4.356654 3.167928
Fungsi Tangga 9 Interval 4.237568 3.093370
Natural Cubic Spline 11 Knot 4.122210 3.064286
Model Linier 4.624689 3.512835
Berdasarkan data empiris perbandingan di atas, model terbaik yang dapat digunakan untuk memodelkan mpg Vs displacement adalah Natural Cubic Spline 11 Knot
mpg Vs weight
Berdasarkan scatter Plot, sebaran data tidak linier, secara visual terdapat bukti untuk hubungan non-linier. Lebih lanjut, hubungan ini ada diuji secara empiris melalui model regresi non-linier, yaitu polinomial, fungsi tangga dan natural cubic spline.
Scatter Plot mpg Vs weight
mpg cylinders displacement horsepower weight acceleration year origin
1 18 8 307 130 3504 12.0 70 1
2 15 8 350 165 3693 11.5 70 1
3 18 8 318 150 3436 11.0 70 1
4 16 8 304 150 3433 12.0 70 1
5 17 8 302 140 3449 10.5 70 1
6 15 8 429 198 4341 10.0 70 1
name
1 chevrolet chevelle malibu
2 buick skylark 320
3 plymouth satellite
4 amc rebel sst
5 ford torino
6 ford galaxie 500
Regresi Linier
set.seed(1501211036)
cross_val <- vfold_cv(Auto,v=10,strata = "mpg")
metric_linear <- map_dfr(cross_val$splits,
function(x){
mod <- lm(mpg ~ weight,
data=Auto[x$in_id,])
pred <- predict(mod,
newdata=Auto[-x$in_id,])
truth <- Auto[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth,
response = pred
)
mae <- mlr3measures::mae(truth = truth,
response = pred
)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
metric_linear# A tibble: 10 x 2
rmse mae
<dbl> <dbl>
1 4.48 3.56
2 4.46 3.21
3 4.25 3.29
4 3.49 2.69
5 4.60 3.34
6 4.74 3.65
7 4.27 3.48
8 4.14 3.33
9 4.46 3.39
10 4.37 2.98
rmse mae
4.326813 3.293198
Regresi Polinomial
polinomial = function(dataset){
modelterbaik = c()
derajat = 1:10
for (i in derajat){
set.seed(1501211036)
cross_val = vfold_cv(dataset,v=10,strata = "mpg") #rsample
metric_poly = map_dfr(cross_val$splits, #tidiverse Iterasi pada suatu fungsi (Output data.frame)
function(x){
mod = lm(mpg ~ poly(weight,i,raw = T),
data=dataset[x$in_id,]) #in_id a/ data training
pred = predict(mod,
newdata=dataset[-x$in_id,])
# nilai prediksi untuk data testing
truth = dataset[-x$in_id,]$mpg
rmse = mlr3measures::rmse(truth = truth,
response = pred)
mae = mlr3measures::mae(truth = truth,
response = pred)
metric = c(rmse,mae)
names(metric) = c("rmse","mae")
return(metric)
}
)
rata_rata = colMeans(metric_poly)
modelterbaik = c(modelterbaik,rata_rata)
}
M = matrix(modelterbaik, byrow = T,ncol = 2)
colnames(M) = c("rmse","mae")
bestmpoly = as.data.frame(cbind(derajat,M))
bestmpoly = arrange(bestmpoly, bestmpoly$rmse)
return(bestmpoly)
}
bestmpoly = polinomial(Auto)
bestmpoly derajat rmse mae
1 2 4.171196 3.073073
2 3 4.178035 3.081339
3 4 4.185834 3.078166
4 5 4.186407 3.085107
5 6 4.191242 3.106216
6 7 4.203473 3.118155
7 8 4.217516 3.122607
8 9 4.258438 3.160485
9 10 4.292933 3.159856
10 1 4.326813 3.293198
Secara empiris regresi polinomial dengan derajat 72 memiliki rmse maupun mae terkecil.
Fungsi Tangga (Piecewise Constant)
ftangga = function(dataset){
set.seed(1501211036)
cross_val = vfold_cv(dataset,v=10,strata = "mpg")
breaks = 2:10
best_tangga = map_dfr(breaks, function(i){
metric_tangga = map_dfr(cross_val$splits,
function(x){
training = dataset[x$in_id,]
training$weight = cut(training$weight,i) #Transformasi jadi factor
mod = lm(mpg ~ weight,data=training)
#Convert ke numeric lagi
labs_x = levels(mod$model[,2])
labs_x_breaks = cbind(lower = as.numeric( sub("\\((.+),.*",
"\\1", labs_x) ),
upper = as.numeric( sub("[^,]*,([^]]*)\\]",
"\\1", labs_x) ))
#Testing
testing = dataset[-x$in_id,]
weight_new = cut(testing$weight,c(labs_x_breaks[1,1],
labs_x_breaks[,2]))
pred = predict(mod,
newdata=list(weight=weight_new))
truth = testing$mpg
data_eval = na.omit(data.frame(truth,pred))
rmse = mlr3measures::rmse(truth = data_eval$truth,
response = data_eval$pred)
mae = mlr3measures::mae(truth = data_eval$truth,
response = data_eval$pred)
metric = c(rmse,mae)
names(metric) = c("rmse","mae")
return(metric)
}
)
colMeans(metric_tangga) # menghitung rata-rata untuk 10 folds
}
)
best_tangga = cbind(breaks=breaks,best_tangga) # menampilkan hasil all breaks
best_tangga = arrange(best_tangga, best_tangga$rmse)
return(best_tangga)
}
tangga = ftangga(Auto)
tangga breaks rmse mae
1 6 4.225604 3.126638
2 7 4.312179 3.174580
3 9 4.317814 3.153454
4 10 4.330555 3.176004
5 8 4.402451 3.226718
6 5 4.415531 3.261200
7 4 4.648455 3.561651
8 3 4.876442 3.661495
9 2 5.540988 4.297736
Berdasarkan rmse dan mae terkecil banyaknya interval yang terbaik adalah 6.
Natural Cubic Spline
ncspline = function(dataset){
set.seed(1501211036)
cross.val = vfold_cv(dataset,v=10,strata = "mpg")
df = 3:20
best.spline3 = map_dfr(df, function(i){
metric.spline3 = map_dfr(cross.val$splits,
function(x){
mod = lm(mpg ~ splines::ns(weight,df=i),data=dataset[x$in_id,])
pred = predict(mod,newdata=dataset[-x$in_id,])
truth = dataset[-x$in_id,]$mpg
rmse = mlr3measures::rmse(truth = truth,
response = pred)
mae = mlr3measures::mae(truth = truth,
response = pred)
metric = c(rmse,mae)
names(metric) = c("rmse","mae")
return(metric)
}
)
metric.spline3
mean.metric.spline3 = colMeans(metric.spline3) #rata-rata untuk 10 folds
mean.metric.spline3
}
)
best.spline3 = cbind(df=df,best.spline3)
basis=data.frame(rbind(best.spline3 %>% slice_min(rmse), #berdasarkan rmse
best.spline3 %>% slice_min(mae))) #berdasarkan mae
df=basis[,"df"]
knot1 = attr(ns(dataset$weight, df=df[1]),"knots")
knot2 = attr(ns(dataset$weight, df=df[2]),"knots")
return(list(basis=basis,knot1=knot1,knot2=knot2))
}
basisAuto = ncspline(Auto)
basisAuto$basis
df rmse mae
1 11 4.177159 3.071718
2 11 4.177159 3.071718
$knot1
9.090909% 18.18182% 27.27273% 36.36364% 45.45455% 54.54545% 63.63636% 72.72727%
1985.000 2130.000 2264.636 2491.818 2676.364 2936.273 3228.364 3521.818
81.81818% 90.90909%
3896.545 4317.909
$knot2
9.090909% 18.18182% 27.27273% 36.36364% 45.45455% 54.54545% 63.63636% 72.72727%
1985.000 2130.000 2264.636 2491.818 2676.364 2936.273 3228.364 3521.818
81.81818% 90.90909%
3896.545 4317.909
Berdasarkan hasil di atas model natural spline terbaik memiliki 11 fungsi basis atau 10 knots
Perbandingan Model
a = as.matrix(bestmpoly[1,])
b = as.matrix(tangga[1,])
c = as.matrix(basisAuto$basis[1,])
d = t(as.matrix(mean_metric_linear))
perbandingan.model = rbind(a[,2:3],b[,2:3],c[,2:3],d)
rownames(perbandingan.model) = c("Polinomial Derajat 2","Fungsi Tangga 6 Interval",
"Natural Cubic Spline 10 Knot", "Model Linier")
as.data.frame(perbandingan.model) rmse mae
Polinomial Derajat 2 4.171196 3.073073
Fungsi Tangga 6 Interval 4.225604 3.126638
Natural Cubic Spline 10 Knot 4.177159 3.071718
Model Linier 4.326813 3.293198
Berdasarkan data empiris perbandingan di atas, model terbaik yang dapat digunakan untuk memodelkan mpg Vs weight adalah Natural Cubic Spline 10 Knot