Tugas Praktikum 8: Data bisa diperoleh di package ISL, silahkan install dulu packagenya. library yang digunakan
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.0.5
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.3 v purrr 0.3.4
## v tibble 3.1.4 v dplyr 1.0.7
## v tidyr 1.1.3 v stringr 1.4.0
## v readr 1.4.0 v forcats 0.5.1
## Warning: package 'tibble' was built under R version 4.0.5
## Warning: package 'tidyr' was built under R version 4.0.5
## Warning: package 'dplyr' was built under R version 4.0.5
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(splines)
library(rsample)
## Warning: package 'rsample' was built under R version 4.0.5
library(ISLR)
## Warning: package 'ISLR' was built under R version 4.0.5
Pada data Auto gunakan peubah respon mpg dan pilih tiga kolom sebagai untuk dijadikan peubah prediktor. Peubah yang dipilih yaitu displacement, horsepower, dan weight sebagai berikut.
attach(Auto)
## The following object is masked from package:ggplot2:
##
## mpg
head(cbind(mpg, displacement, horsepower, weight))
## mpg displacement horsepower weight
## [1,] 18 307 130 3504
## [2,] 15 350 165 3693
## [3,] 18 318 150 3436
## [4,] 16 304 150 3433
## [5,] 17 302 140 3449
## [6,] 15 429 198 4341
Berikut adalah bukti untuk hubungan non-linear antara peubah respons dan peubah prediktor yang telah dipilih.
plot(displacement, mpg, pch=19, col="steelblue",
xlab="displacement", ylab="mile per gallon")
Dari plot di atas terlihat terdapat pola melengkung yang terbuka ke atas, sehingga membuktikan terdapat pola hubugan yang tak linear.
plot(horsepower, mpg, pch=19, col="cadetblue4",
xlab="Horse Power", ylab="mile per gallon")
Dari plot di atas terlihat terdapat pola melengkung yang terbuka ke atas, sehingga membuktikan terdapat pola hubugan yang tak linear.
plot(weight, mpg, pch=19, col="aquamarine4",
xlab="weight", ylab="mile per gallon")
Dari plot di atas terlihat terdapat pola melengkung yang terbuka ke atas, sehingga membuktikan terdapat pola hubugan yang tak linear.
Tentukan model non-linear terbaik untuk masing-masing pasangan peubah. Pada pembahasan ini, model akan dipilih secara empiris dan diakhir akan diberikan visualisasi dari model non-linear terbaiknya.
set.seed(1234)
cross_val <- vfold_cv(Auto,v=10,strata = "mpg")
d<-1:10
best_poly<-map_dfr(d, function(i){
metric_poly <- map_dfr(cross_val$splits,
function(x){
mod <- lm(mpg ~ poly(displacement, d = i),
data=Auto[x$in_id,])
pred <- predict(mod,newdata=Auto[-x$in_id,])
truth <- Auto[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth,response = pred)
mae <- mlr3measures::mae(truth = truth,response = pred)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
metric_poly
#menghitung rata rata untuk 10 folds
mean_metric_poly<-colMeans(metric_poly)
mean_metric_poly
}
)
best_poly <- cbind(d=d,best_poly)
best_poly
## d rmse mae
## 1 1 4.599198 3.514027
## 2 2 4.323459 3.160102
## 3 3 4.333839 3.173524
## 4 4 4.351392 3.182745
## 5 5 4.360522 3.194475
## 6 6 4.335131 3.213242
## 7 7 4.282913 3.216382
## 8 8 4.231056 3.203589
## 9 9 4.187063 3.149285
## 10 10 4.144569 3.127848
dengan visualisasi polinom terbaik yaitu polinom ordo 10
ggplot(Auto,aes(x=displacement, y=mpg)) +
geom_point(alpha=0.55, color="steelblue") +
stat_smooth(method = "lm",
formula = y~poly(x,10,raw=T),
lty = 1, col = "red",se = F)+
theme_bw()
set.seed(1234)
cross_val <- vfold_cv(Auto,v=10,strata = "mpg")
breaks <- 2:11
best_tangga <- map_dfr(breaks, function(i){
metric_tangga <- map_dfr(cross_val$splits,
function(x){
training <- Auto[x$in_id,]
training$displacement <- cut(training$displacement,i)
mod <- lm(mpg ~ displacement,
data=training)
labs_x <- levels(mod$model[,2])
labs_x_breaks <- cbind(lower = as.numeric( sub("\\((.+),.*", "\\1", labs_x) ),
upper = as.numeric( sub("[^,]*,([^]]*)\\]", "\\1", labs_x) ))
testing <- Auto[-x$in_id,]
displacement_new <- cut(testing$displacement,c(labs_x_breaks[1,1],labs_x_breaks[,2]))
pred <- predict(mod,
newdata=list(displacement=displacement_new))
truth <- testing$mpg
data_eval <- na.omit(data.frame(truth,pred))
rmse <- mlr3measures::rmse(truth = data_eval$truth,
response = data_eval$pred
)
mae <- mlr3measures::mae(truth = data_eval$truth,
response = data_eval$pred
)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
metric_tangga
# menghitung rata-rata untuk 10 folds
mean_metric_tangga <- colMeans(metric_tangga)
mean_metric_tangga
})
best_tangga <- cbind(d=breaks,best_tangga)
# menampilkan hasil all breaks
best_tangga
## d rmse mae
## 1 2 5.919030 4.624448
## 2 3 4.888232 3.670858
## 3 4 4.803193 3.614535
## 4 5 4.674843 3.507489
## 5 6 4.605366 3.415135
## 6 7 4.559773 3.367354
## 7 8 4.388631 3.245010
## 8 9 4.250727 3.110926
## 9 10 4.227783 3.091653
## 10 11 4.355140 3.233770
dengan visualisasi fungsi tangga terbaik yaitu dengan break=10 berikut.
ggplot(Auto,aes(x=displacement, y=mpg)) +
geom_point(alpha=0.55, color="steelblue") +
stat_smooth(method = "lm",
formula = y~cut(x,10),
lty = 1, col = "red",se = F)+
theme_bw()
set.seed(1234)
cross_val <- vfold_cv(Auto,v=10,strata = "mpg")
df <- 3:12
best_spline3 <- map_dfr(df, function(i){
metric_spline3 <- map_dfr(cross_val$splits,
function(x){
mod <- lm(mpg ~ ns(displacement,df=i),
data=Auto[x$in_id,])
pred <- predict(mod,
newdata=Auto[-x$in_id,])
truth <- Auto[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth,
response = pred
)
mae <- mlr3measures::mae(truth = truth,
response = pred
)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
metric_spline3
# menghitung rata-rata untuk 10 folds
mean_metric_spline3 <- colMeans(metric_spline3)
mean_metric_spline3
}
)
best_spline3 <- cbind(d=df,best_spline3)
# menampilkan hasil all breaks
best_spline3
## d rmse mae
## 1 3 4.339193 3.182233
## 2 4 4.358236 3.190241
## 3 5 4.318527 3.204870
## 4 6 4.239601 3.148019
## 5 7 4.188651 3.119848
## 6 8 4.178569 3.134165
## 7 9 4.153125 3.129453
## 8 10 4.142579 3.116624
## 9 11 4.115266 3.076424
## 10 12 4.106482 3.058926
nilai_metric <- rbind(best_poly %>% select(-1) %>% slice_min(mae),
best_tangga %>% select(-1) %>% slice_min(mae),
best_spline3 %>% select(-1)%>% slice_min(mae)
)
nama_model <- c("Polynomial10","Piecewise constant(breaks=10)",
"Natural cubic spline(df=12)"
)
data.frame(nama_model,nilai_metric)
## nama_model rmse mae
## 1 Polynomial10 4.144569 3.127848
## 2 Piecewise constant(breaks=10) 4.227783 3.091653
## 3 Natural cubic spline(df=12) 4.106482 3.058926
Dapat dilihat bahwa model terbaik yaitu natural cubic spline dengan df=12 karena memiliki rmse dan mae terkecil.
attr(ns(Auto$displacement, df=12),"knots")
## 8.333333% 16.66667% 25% 33.33333% 41.66667% 50% 58.33333% 66.66667%
## 89.0000 97.0000 105.0000 119.0000 130.9167 151.0000 200.0000 232.0000
## 75% 83.33333% 91.66667%
## 275.7500 318.0000 351.0000
ggplot(Auto,aes(x=displacement, y=mpg)) +
geom_point(alpha=0.55, color="steelblue") +
stat_smooth(method = "lm",
formula = y~ns(x, knots = c(89.0000, 97.0000, 105.0000, 119.0000, 130.9167, 151.0000, 200.0000, 232.0000, 275.7500, 318.0000, 351.0000)),
lty = 1,se = F, col="red")
Model spline ini terlihat lebih pas untuk menggambarkan data displacement vs mpg dibandingkan dengan visualisasi data model lain yang sudah dicantumkan sebelumnya.
set.seed(1234)
cross_val <- vfold_cv(Auto,v=10,strata = "mpg")
d<-1:10
best_poly<-map_dfr(d, function(i){
metric_poly <- map_dfr(cross_val$splits,
function(x){
mod <- lm(mpg ~ poly(horsepower, d = i),
data=Auto[x$in_id,])
pred <- predict(mod,newdata=Auto[-x$in_id,])
truth <- Auto[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth,response = pred)
mae <- mlr3measures::mae(truth = truth,response = pred)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
metric_poly
#menghitung rata rata untuk 10 folds
mean_metric_poly<-colMeans(metric_poly)
mean_metric_poly
}
)
best_poly <- cbind(d=d,best_poly)
best_poly
## d rmse mae
## 1 1 4.908935 3.858832
## 2 2 4.352806 3.261129
## 3 3 4.356807 3.264025
## 4 4 4.354783 3.274987
## 5 5 4.307652 3.238723
## 6 6 4.301354 3.241417
## 7 7 4.287438 3.222540
## 8 8 4.302908 3.240966
## 9 9 4.321277 3.247392
## 10 10 4.336836 3.257891
dengan visualisasi polinom terbaik yaitu polinom ordo 7
ggplot(Auto,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="cadetblue4") +
stat_smooth(method = "lm",
formula = y~poly(x,7,raw=T),
lty = 1, col = "red",se = F)+
theme_bw()
set.seed(1234)
cross_val <- vfold_cv(Auto,v=10,strata = "mpg")
breaks <- 2:10
best_tangga <- map_dfr(breaks, function(i){
metric_tangga <- map_dfr(cross_val$splits,
function(x){
training <- Auto[x$in_id,]
training$horsepower <- cut(training$horsepower,i)
mod <- lm(mpg ~ horsepower,
data=training)
labs_x <- levels(mod$model[,2])
labs_x_breaks <- cbind(lower = as.numeric( sub("\\((.+),.*", "\\1", labs_x) ),
upper = as.numeric( sub("[^,]*,([^]]*)\\]", "\\1", labs_x) ))
testing <- Auto[-x$in_id,]
horsepower_new <- cut(testing$horsepower,c(labs_x_breaks[1,1],labs_x_breaks[,2]))
pred <- predict(mod,
newdata=list(horsepower=horsepower_new))
truth <- testing$mpg
data_eval <- na.omit(data.frame(truth,pred))
rmse <- mlr3measures::rmse(truth = data_eval$truth,
response = data_eval$pred
)
mae <- mlr3measures::mae(truth = data_eval$truth,
response = data_eval$pred
)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
metric_tangga
# menghitung rata-rata untuk 10 folds
mean_metric_tangga <- colMeans(metric_tangga)
mean_metric_tangga
})
best_tangga <- cbind(d=breaks,best_tangga)
# menampilkan hasil all breaks
best_tangga
## d rmse mae
## 1 2 6.122422 4.776980
## 2 3 5.766535 4.520709
## 3 4 5.020265 3.805185
## 4 5 4.706711 3.572772
## 5 6 4.658138 3.539010
## 6 7 4.572175 3.405770
## 7 8 4.438716 3.398562
## 8 9 4.498963 3.435017
## 9 10 4.569136 3.437797
dengan visualisasi fungsi tangga terbaik yaitu dengan break=8 berikut.
ggplot(Auto,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="cadetblue4") +
stat_smooth(method = "lm",
formula = y~cut(x,8),
lty = 1, col = "red",se = F)+
theme_bw()
set.seed(1234)
cross_val <- vfold_cv(Auto,v=10,strata = "mpg")
df <- 3:12
best_spline3 <- map_dfr(df, function(i){
metric_spline3 <- map_dfr(cross_val$splits,
function(x){
mod <- lm(mpg ~ ns(horsepower,df=i),
data=Auto[x$in_id,])
pred <- predict(mod,
newdata=Auto[-x$in_id,])
truth <- Auto[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth,
response = pred
)
mae <- mlr3measures::mae(truth = truth,
response = pred
)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
metric_spline3
# menghitung rata-rata untuk 10 folds
mean_metric_spline3 <- colMeans(metric_spline3)
mean_metric_spline3
}
)
best_spline3 <- cbind(d=df,best_spline3)
# menampilkan hasil all breaks
best_spline3
## d rmse mae
## 1 3 4.342637 3.259033
## 2 4 4.312147 3.242972
## 3 5 4.300381 3.235026
## 4 6 4.298908 3.229285
## 5 7 4.275443 3.217522
## 6 8 4.303139 3.250454
## 7 9 4.293479 3.231677
## 8 10 4.277135 3.199777
## 9 11 4.341622 3.255280
## 10 12 4.314328 3.216725
nilai_metric <- rbind(best_poly %>% select(-1) %>% slice_min(mae),
best_tangga %>% select(-1) %>% slice_min(mae),
best_spline3 %>% select(-1)%>% slice_min(mae)
)
nama_model <- c("Polynomial7","Piecewise constant(breaks=8)",
"Natural cubic spline(df=10)"
)
data.frame(nama_model,nilai_metric)
## nama_model rmse mae
## 1 Polynomial7 4.287438 3.222540
## 2 Piecewise constant(breaks=8) 4.438716 3.398562
## 3 Natural cubic spline(df=10) 4.277135 3.199777
Dapat dilihat bahwa model terbaik yaitu natural cubic spline dengan df=10 karena memiliki rmse dan mae terkecil.
attr(ns(Auto$horsepower, df=10),"knots")
## 10% 20% 30% 40% 50% 60% 70% 80% 90%
## 67.0 72.0 80.0 88.0 93.5 100.0 110.0 140.0 157.7
ggplot(Auto,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="cadetblue4") +
stat_smooth(method = "lm",
formula = y~ns(x, knots = c(67.0, 72.0, 80.0, 88.0, 93.5, 100.0, 110.0, 140.0, 157.7)),
lty = 1,se = F, col="red")
Model spline ini terlihat lebih pas untuk menggambarkan data horsepower vs mpg dibandingkan dengan visualisasi data model lain yang sudah dicantumkan sebelumnya.
set.seed(1234)
cross_val <- vfold_cv(Auto,v=10,strata = "mpg")
d<-1:10
best_poly<-map_dfr(d, function(i){
metric_poly <- map_dfr(cross_val$splits,
function(x){
mod <- lm(mpg ~ poly(weight, d = i),
data=Auto[x$in_id,])
pred <- predict(mod,newdata=Auto[-x$in_id,])
truth <- Auto[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth,response = pred)
mae <- mlr3measures::mae(truth = truth,response = pred)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
metric_poly
#menghitung rata rata untuk 10 folds
mean_metric_poly<-colMeans(metric_poly)
mean_metric_poly
}
)
best_poly <- cbind(d=d,best_poly)
best_poly
## d rmse mae
## 1 1 4.306138 3.296115
## 2 2 4.147615 3.080702
## 3 3 4.156313 3.086989
## 4 4 4.161158 3.084023
## 5 5 4.165673 3.099482
## 6 6 4.182153 3.127319
## 7 7 4.179790 3.131916
## 8 8 4.208312 3.153829
## 9 9 4.268580 3.195660
## 10 10 4.234068 3.169642
set.seed(1234)
cross_val <- vfold_cv(Auto,v=10,strata = "mpg")
breaks <- 2:10
best_tangga <- map_dfr(breaks, function(i){
metric_tangga <- map_dfr(cross_val$splits,
function(x){
training <- Auto[x$in_id,]
training$weight <- cut(training$weight,i)
mod <- lm(mpg ~ weight,
data=training)
labs_x <- levels(mod$model[,2])
labs_x_breaks <- cbind(lower = as.numeric( sub("\\((.+),.*", "\\1", labs_x) ),
upper = as.numeric( sub("[^,]*,([^]]*)\\]", "\\1", labs_x) ))
testing <- Auto[-x$in_id,]
weight_new <- cut(testing$weight,c(labs_x_breaks[1,1],labs_x_breaks[,2]))
pred <- predict(mod,
newdata=list(weight=weight_new))
truth <- testing$mpg
data_eval <- na.omit(data.frame(truth,pred))
rmse <- mlr3measures::rmse(truth = data_eval$truth,
response = data_eval$pred
)
mae <- mlr3measures::mae(truth = data_eval$truth,
response = data_eval$pred
)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
metric_tangga
# menghitung rata-rata untuk 10 folds
mean_metric_tangga <- colMeans(metric_tangga)
mean_metric_tangga
})
best_tangga <- cbind(d=breaks,best_tangga)
# menampilkan hasil all breaks
best_tangga
## d rmse mae
## 1 2 5.617379 4.355730
## 2 3 4.825537 3.633163
## 3 4 4.716154 3.643745
## 4 5 4.417889 3.256344
## 5 6 4.183642 3.108463
## 6 7 4.296189 3.222659
## 7 8 4.390219 3.283052
## 8 9 4.314847 3.158501
## 9 10 4.339894 3.199744
dengan visualisasi fungsi tangga terbaik yaitu dengan break=6 berikut.
ggplot(Auto,aes(x=weight, y=mpg)) +
geom_point(alpha=0.55, color="aquamarine4") +
stat_smooth(method = "lm",
formula = y~cut(x,6),
lty = 1, col = "red",se = F)+
theme_bw()
set.seed(1234)
cross_val <- vfold_cv(Auto,v=10,strata = "mpg")
df <- 3:12
best_spline3 <- map_dfr(df, function(i){
metric_spline3 <- map_dfr(cross_val$splits,
function(x){
mod <- lm(mpg ~ ns(weight,df=i),
data=Auto[x$in_id,])
pred <- predict(mod,
newdata=Auto[-x$in_id,])
truth <- Auto[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth,
response = pred
)
mae <- mlr3measures::mae(truth = truth,
response = pred
)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
metric_spline3
# menghitung rata-rata untuk 10 folds
mean_metric_spline3 <- colMeans(metric_spline3)
mean_metric_spline3
}
)
best_spline3 <- cbind(d=df,best_spline3)
# menampilkan hasil all breaks
best_spline3
## d rmse mae
## 1 3 4.158586 3.083967
## 2 4 4.175023 3.094981
## 3 5 4.180656 3.109785
## 4 6 4.172720 3.123068
## 5 7 4.162708 3.089706
## 6 8 4.173852 3.105423
## 7 9 4.176375 3.096280
## 8 10 4.160552 3.096405
## 9 11 4.158496 3.096378
## 10 12 4.167825 3.105770
dengan visualisasi spline terbaik yaitu dengan df=3
attr(ns(Auto$weight, df=3),"knots")
## 33.33333% 66.66667%
## 2397.000 3333.667
ggplot(Auto,aes(x=weight, y=mpg)) +
geom_point(alpha=0.55, color="aquamarine4") +
stat_smooth(method = "lm",
formula = y~ns(x, knots = c(2397.000, 3333.667)),
lty = 1,se = F, col="red")
nilai_metric <- rbind(best_poly %>% select(-1) %>% slice_min(mae),
best_tangga %>% select(-1) %>% slice_min(mae),
best_spline3 %>% select(-1)%>% slice_min(mae)
)
nama_model <- c("Polynomial2","Piecewise constant(breaks=6)",
"Natural cubic spline(df=3)"
)
data.frame(nama_model,nilai_metric)
## nama_model rmse mae
## 1 Polynomial2 4.147615 3.080702
## 2 Piecewise constant(breaks=6) 4.183642 3.108463
## 3 Natural cubic spline(df=3) 4.158586 3.083967
Dapat dilihat bahwa model terbaik yaitu polinomial dengan ordo 2 karena memiliki rmse dan mae terkecil.
ggplot(Auto,aes(x=weight, y=mpg)) +
geom_point(alpha=0.55, color="aquamarine4") +
stat_smooth(method = "lm",
formula = y~poly(x,2,raw=T),
lty = 1, col = "red",se = F)+
theme_bw()
Model polinomial ini terlihat lebih pas untuk menggambarkan data weight vs mpg dibandingkan dengan visualisasi data model lain yang sudah dicantumkan sebelumnya.
Link RPubs: https://rpubs.com/annissanff/tugas8sta581