Tugas Praktikum 8: Data bisa diperoleh di package ISL, silahkan install dulu packagenya. library yang digunakan

library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.0.5
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.3     v purrr   0.3.4
## v tibble  3.1.4     v dplyr   1.0.7
## v tidyr   1.1.3     v stringr 1.4.0
## v readr   1.4.0     v forcats 0.5.1
## Warning: package 'tibble' was built under R version 4.0.5
## Warning: package 'tidyr' was built under R version 4.0.5
## Warning: package 'dplyr' was built under R version 4.0.5
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(splines)
library(rsample)
## Warning: package 'rsample' was built under R version 4.0.5
library(ISLR)
## Warning: package 'ISLR' was built under R version 4.0.5

Pada data Auto gunakan peubah respon mpg dan pilih tiga kolom sebagai untuk dijadikan peubah prediktor. Peubah yang dipilih yaitu displacement, horsepower, dan weight sebagai berikut.

attach(Auto)
## The following object is masked from package:ggplot2:
## 
##     mpg
head(cbind(mpg, displacement, horsepower, weight))
##      mpg displacement horsepower weight
## [1,]  18          307        130   3504
## [2,]  15          350        165   3693
## [3,]  18          318        150   3436
## [4,]  16          304        150   3433
## [5,]  17          302        140   3449
## [6,]  15          429        198   4341

Pola Hubungan Non-Linear

Berikut adalah bukti untuk hubungan non-linear antara peubah respons dan peubah prediktor yang telah dipilih.

Visualisasi displacement vs mpg

plot(displacement, mpg, pch=19, col="steelblue",
     xlab="displacement", ylab="mile per gallon")

Dari plot di atas terlihat terdapat pola melengkung yang terbuka ke atas, sehingga membuktikan terdapat pola hubugan yang tak linear.

Visualisasi horsepower vs mpg

plot(horsepower, mpg, pch=19, col="cadetblue4",
     xlab="Horse Power", ylab="mile per gallon")

Dari plot di atas terlihat terdapat pola melengkung yang terbuka ke atas, sehingga membuktikan terdapat pola hubugan yang tak linear.

Visualisasi weight vs mpg

plot(weight, mpg, pch=19, col="aquamarine4",
     xlab="weight", ylab="mile per gallon")

Dari plot di atas terlihat terdapat pola melengkung yang terbuka ke atas, sehingga membuktikan terdapat pola hubugan yang tak linear.

Model Non-Linear Terbaik

Tentukan model non-linear terbaik untuk masing-masing pasangan peubah. Pada pembahasan ini, model akan dipilih secara empiris dan diakhir akan diberikan visualisasi dari model non-linear terbaiknya.

Model Non-Linear displacement vs mpg

Regresi Polinomial

set.seed(1234)
cross_val <- vfold_cv(Auto,v=10,strata = "mpg")
d<-1:10
best_poly<-map_dfr(d, function(i){
 metric_poly <- map_dfr(cross_val$splits,
                        function(x){
                           mod <- lm(mpg ~ poly(displacement, d = i),
                                     data=Auto[x$in_id,])
                           pred <- predict(mod,newdata=Auto[-x$in_id,])
                           truth <- Auto[-x$in_id,]$mpg
                           rmse <- mlr3measures::rmse(truth = truth,response = pred)
                           mae <- mlr3measures::mae(truth = truth,response = pred)
                           metric <- c(rmse,mae)
                           names(metric) <- c("rmse","mae")
                           return(metric)
                        } 
                        )
metric_poly
#menghitung rata rata untuk 10 folds
mean_metric_poly<-colMeans(metric_poly)
mean_metric_poly
}
)
best_poly <- cbind(d=d,best_poly)
best_poly
##     d     rmse      mae
## 1   1 4.599198 3.514027
## 2   2 4.323459 3.160102
## 3   3 4.333839 3.173524
## 4   4 4.351392 3.182745
## 5   5 4.360522 3.194475
## 6   6 4.335131 3.213242
## 7   7 4.282913 3.216382
## 8   8 4.231056 3.203589
## 9   9 4.187063 3.149285
## 10 10 4.144569 3.127848

dengan visualisasi polinom terbaik yaitu polinom ordo 10

 ggplot(Auto,aes(x=displacement, y=mpg)) + 
  geom_point(alpha=0.55, color="steelblue") +
  stat_smooth(method = "lm", 
               formula = y~poly(x,10,raw=T), 
               lty = 1, col = "red",se = F)+
  theme_bw()

Regresi Fungsi Tangga

set.seed(1234)
cross_val <- vfold_cv(Auto,v=10,strata = "mpg")

breaks <- 2:11

best_tangga <- map_dfr(breaks, function(i){

metric_tangga <- map_dfr(cross_val$splits,
    function(x){

training <- Auto[x$in_id,]
training$displacement <- cut(training$displacement,i)

mod <- lm(mpg ~ displacement,
         data=training)

labs_x <- levels(mod$model[,2])

labs_x_breaks <- cbind(lower = as.numeric( sub("\\((.+),.*", "\\1", labs_x) ),
                  upper = as.numeric( sub("[^,]*,([^]]*)\\]", "\\1", labs_x) ))


testing <- Auto[-x$in_id,]

displacement_new <- cut(testing$displacement,c(labs_x_breaks[1,1],labs_x_breaks[,2]))

pred <- predict(mod,
               newdata=list(displacement=displacement_new))
truth <- testing$mpg

data_eval <- na.omit(data.frame(truth,pred))

rmse <- mlr3measures::rmse(truth = data_eval$truth,
                           response = data_eval$pred
                           )
mae <- mlr3measures::mae(truth = data_eval$truth,
                           response = data_eval$pred
                           )
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)

metric_tangga



# menghitung rata-rata untuk 10 folds
mean_metric_tangga <- colMeans(metric_tangga)

mean_metric_tangga
})
best_tangga <- cbind(d=breaks,best_tangga)
# menampilkan hasil all breaks
best_tangga
##     d     rmse      mae
## 1   2 5.919030 4.624448
## 2   3 4.888232 3.670858
## 3   4 4.803193 3.614535
## 4   5 4.674843 3.507489
## 5   6 4.605366 3.415135
## 6   7 4.559773 3.367354
## 7   8 4.388631 3.245010
## 8   9 4.250727 3.110926
## 9  10 4.227783 3.091653
## 10 11 4.355140 3.233770

dengan visualisasi fungsi tangga terbaik yaitu dengan break=10 berikut.

ggplot(Auto,aes(x=displacement, y=mpg)) +
                 geom_point(alpha=0.55, color="steelblue") +
  stat_smooth(method = "lm", 
               formula = y~cut(x,10), 
               lty = 1, col = "red",se = F)+
  theme_bw()

Regresi Spline

set.seed(1234)
cross_val <- vfold_cv(Auto,v=10,strata = "mpg")

df <- 3:12

best_spline3 <- map_dfr(df, function(i){
metric_spline3 <- map_dfr(cross_val$splits,
    function(x){
mod <- lm(mpg ~ ns(displacement,df=i),
         data=Auto[x$in_id,])
pred <- predict(mod,
               newdata=Auto[-x$in_id,])
truth <- Auto[-x$in_id,]$mpg

rmse <- mlr3measures::rmse(truth = truth,
                           response = pred
                           )
mae <- mlr3measures::mae(truth = truth,
                           response = pred
                           )
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)

metric_spline3

# menghitung rata-rata untuk 10 folds
mean_metric_spline3 <- colMeans(metric_spline3)

mean_metric_spline3
}
)

best_spline3 <- cbind(d=df,best_spline3)
# menampilkan hasil all breaks
best_spline3
##     d     rmse      mae
## 1   3 4.339193 3.182233
## 2   4 4.358236 3.190241
## 3   5 4.318527 3.204870
## 4   6 4.239601 3.148019
## 5   7 4.188651 3.119848
## 6   8 4.178569 3.134165
## 7   9 4.153125 3.129453
## 8  10 4.142579 3.116624
## 9  11 4.115266 3.076424
## 10 12 4.106482 3.058926

Model Terbaik

nilai_metric <- rbind(best_poly %>% select(-1) %>% slice_min(mae),
                      best_tangga %>% select(-1) %>% slice_min(mae),
                      best_spline3 %>%  select(-1)%>% slice_min(mae)
                      )

nama_model <- c("Polynomial10","Piecewise constant(breaks=10)",
                "Natural cubic spline(df=12)"
)

data.frame(nama_model,nilai_metric)
##                      nama_model     rmse      mae
## 1                  Polynomial10 4.144569 3.127848
## 2 Piecewise constant(breaks=10) 4.227783 3.091653
## 3   Natural cubic spline(df=12) 4.106482 3.058926

Dapat dilihat bahwa model terbaik yaitu natural cubic spline dengan df=12 karena memiliki rmse dan mae terkecil.

Visualisasi Model Terbaik (Natural Cubic Spline dengan knot = (89, 97, 105, 119, 130.9167, 151, 200, 232, 275.75, 318, 351))

attr(ns(Auto$displacement, df=12),"knots")
## 8.333333% 16.66667%       25% 33.33333% 41.66667%       50% 58.33333% 66.66667% 
##   89.0000   97.0000  105.0000  119.0000  130.9167  151.0000  200.0000  232.0000 
##       75% 83.33333% 91.66667% 
##  275.7500  318.0000  351.0000
ggplot(Auto,aes(x=displacement, y=mpg)) +
                 geom_point(alpha=0.55, color="steelblue") +
  stat_smooth(method = "lm", 
               formula = y~ns(x, knots = c(89.0000,   97.0000,  105.0000,  119.0000,  130.9167,  151.0000,  200.0000,  232.0000,  275.7500,  318.0000, 351.0000)), 
               lty = 1,se = F, col="red")

Model spline ini terlihat lebih pas untuk menggambarkan data displacement vs mpg dibandingkan dengan visualisasi data model lain yang sudah dicantumkan sebelumnya.

Model Non-Linear horsepower vs mpg

Regresi Polinomial

set.seed(1234)
cross_val <- vfold_cv(Auto,v=10,strata = "mpg")
d<-1:10
best_poly<-map_dfr(d, function(i){
 metric_poly <- map_dfr(cross_val$splits,
                        function(x){
                           mod <- lm(mpg ~ poly(horsepower, d = i),
                                     data=Auto[x$in_id,])
                           pred <- predict(mod,newdata=Auto[-x$in_id,])
                           truth <- Auto[-x$in_id,]$mpg
                           rmse <- mlr3measures::rmse(truth = truth,response = pred)
                           mae <- mlr3measures::mae(truth = truth,response = pred)
                           metric <- c(rmse,mae)
                           names(metric) <- c("rmse","mae")
                           return(metric)
                        } 
                        )
metric_poly
#menghitung rata rata untuk 10 folds
mean_metric_poly<-colMeans(metric_poly)
mean_metric_poly
}
)
best_poly <- cbind(d=d,best_poly)
best_poly
##     d     rmse      mae
## 1   1 4.908935 3.858832
## 2   2 4.352806 3.261129
## 3   3 4.356807 3.264025
## 4   4 4.354783 3.274987
## 5   5 4.307652 3.238723
## 6   6 4.301354 3.241417
## 7   7 4.287438 3.222540
## 8   8 4.302908 3.240966
## 9   9 4.321277 3.247392
## 10 10 4.336836 3.257891

dengan visualisasi polinom terbaik yaitu polinom ordo 7

 ggplot(Auto,aes(x=horsepower, y=mpg)) + 
  geom_point(alpha=0.55, color="cadetblue4") +
  stat_smooth(method = "lm", 
               formula = y~poly(x,7,raw=T), 
               lty = 1, col = "red",se = F)+
  theme_bw()

Regresi Fungsi Tangga

set.seed(1234)
cross_val <- vfold_cv(Auto,v=10,strata = "mpg")

breaks <- 2:10

best_tangga <- map_dfr(breaks, function(i){

metric_tangga <- map_dfr(cross_val$splits,
    function(x){

training <- Auto[x$in_id,]
training$horsepower <- cut(training$horsepower,i)

mod <- lm(mpg ~ horsepower,
         data=training)

labs_x <- levels(mod$model[,2])

labs_x_breaks <- cbind(lower = as.numeric( sub("\\((.+),.*", "\\1", labs_x) ),
                  upper = as.numeric( sub("[^,]*,([^]]*)\\]", "\\1", labs_x) ))


testing <- Auto[-x$in_id,]

horsepower_new <- cut(testing$horsepower,c(labs_x_breaks[1,1],labs_x_breaks[,2]))

pred <- predict(mod,
               newdata=list(horsepower=horsepower_new))
truth <- testing$mpg

data_eval <- na.omit(data.frame(truth,pred))

rmse <- mlr3measures::rmse(truth = data_eval$truth,
                           response = data_eval$pred
                           )
mae <- mlr3measures::mae(truth = data_eval$truth,
                           response = data_eval$pred
                           )
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)

metric_tangga



# menghitung rata-rata untuk 10 folds
mean_metric_tangga <- colMeans(metric_tangga)

mean_metric_tangga
})
best_tangga <- cbind(d=breaks,best_tangga)
# menampilkan hasil all breaks
best_tangga
##    d     rmse      mae
## 1  2 6.122422 4.776980
## 2  3 5.766535 4.520709
## 3  4 5.020265 3.805185
## 4  5 4.706711 3.572772
## 5  6 4.658138 3.539010
## 6  7 4.572175 3.405770
## 7  8 4.438716 3.398562
## 8  9 4.498963 3.435017
## 9 10 4.569136 3.437797

dengan visualisasi fungsi tangga terbaik yaitu dengan break=8 berikut.

ggplot(Auto,aes(x=horsepower, y=mpg)) +
                 geom_point(alpha=0.55, color="cadetblue4") +
  stat_smooth(method = "lm", 
               formula = y~cut(x,8), 
               lty = 1, col = "red",se = F)+
  theme_bw()

Regresi Spline

set.seed(1234)
cross_val <- vfold_cv(Auto,v=10,strata = "mpg")

df <- 3:12

best_spline3 <- map_dfr(df, function(i){
metric_spline3 <- map_dfr(cross_val$splits,
    function(x){
mod <- lm(mpg ~ ns(horsepower,df=i),
         data=Auto[x$in_id,])
pred <- predict(mod,
               newdata=Auto[-x$in_id,])
truth <- Auto[-x$in_id,]$mpg

rmse <- mlr3measures::rmse(truth = truth,
                           response = pred
                           )
mae <- mlr3measures::mae(truth = truth,
                           response = pred
                           )
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)

metric_spline3

# menghitung rata-rata untuk 10 folds
mean_metric_spline3 <- colMeans(metric_spline3)

mean_metric_spline3
}
)

best_spline3 <- cbind(d=df,best_spline3)
# menampilkan hasil all breaks
best_spline3
##     d     rmse      mae
## 1   3 4.342637 3.259033
## 2   4 4.312147 3.242972
## 3   5 4.300381 3.235026
## 4   6 4.298908 3.229285
## 5   7 4.275443 3.217522
## 6   8 4.303139 3.250454
## 7   9 4.293479 3.231677
## 8  10 4.277135 3.199777
## 9  11 4.341622 3.255280
## 10 12 4.314328 3.216725

Model Terbaik

nilai_metric <- rbind(best_poly %>% select(-1) %>% slice_min(mae),
                      best_tangga %>% select(-1) %>% slice_min(mae),
                      best_spline3 %>%  select(-1)%>% slice_min(mae)
                      )

nama_model <- c("Polynomial7","Piecewise constant(breaks=8)",
                "Natural cubic spline(df=10)"
)

data.frame(nama_model,nilai_metric)
##                     nama_model     rmse      mae
## 1                  Polynomial7 4.287438 3.222540
## 2 Piecewise constant(breaks=8) 4.438716 3.398562
## 3  Natural cubic spline(df=10) 4.277135 3.199777

Dapat dilihat bahwa model terbaik yaitu natural cubic spline dengan df=10 karena memiliki rmse dan mae terkecil.

Visualisasi Model Terbaik (Natural Cubic Spline dengan knot = (67, 72, 80, 88, 93.5, 100, 110, 140, 157.7))

attr(ns(Auto$horsepower, df=10),"knots")
##   10%   20%   30%   40%   50%   60%   70%   80%   90% 
##  67.0  72.0  80.0  88.0  93.5 100.0 110.0 140.0 157.7
ggplot(Auto,aes(x=horsepower, y=mpg)) +
                 geom_point(alpha=0.55, color="cadetblue4") +
  stat_smooth(method = "lm", 
               formula = y~ns(x, knots = c(67.0,  72.0,  80.0,  88.0,  93.5, 100.0, 110.0, 140.0, 157.7)), 
               lty = 1,se = F, col="red")

Model spline ini terlihat lebih pas untuk menggambarkan data horsepower vs mpg dibandingkan dengan visualisasi data model lain yang sudah dicantumkan sebelumnya.

Model Non-Linear weight vs mpg

Regresi Polinomial

set.seed(1234)
cross_val <- vfold_cv(Auto,v=10,strata = "mpg")
d<-1:10
best_poly<-map_dfr(d, function(i){
 metric_poly <- map_dfr(cross_val$splits,
                        function(x){
                           mod <- lm(mpg ~ poly(weight, d = i),
                                     data=Auto[x$in_id,])
                           pred <- predict(mod,newdata=Auto[-x$in_id,])
                           truth <- Auto[-x$in_id,]$mpg
                           rmse <- mlr3measures::rmse(truth = truth,response = pred)
                           mae <- mlr3measures::mae(truth = truth,response = pred)
                           metric <- c(rmse,mae)
                           names(metric) <- c("rmse","mae")
                           return(metric)
                        } 
                        )
metric_poly
#menghitung rata rata untuk 10 folds
mean_metric_poly<-colMeans(metric_poly)
mean_metric_poly
}
)
best_poly <- cbind(d=d,best_poly)
best_poly
##     d     rmse      mae
## 1   1 4.306138 3.296115
## 2   2 4.147615 3.080702
## 3   3 4.156313 3.086989
## 4   4 4.161158 3.084023
## 5   5 4.165673 3.099482
## 6   6 4.182153 3.127319
## 7   7 4.179790 3.131916
## 8   8 4.208312 3.153829
## 9   9 4.268580 3.195660
## 10 10 4.234068 3.169642

Regresi Fungsi Tangga

set.seed(1234)
cross_val <- vfold_cv(Auto,v=10,strata = "mpg")

breaks <- 2:10

best_tangga <- map_dfr(breaks, function(i){

metric_tangga <- map_dfr(cross_val$splits,
    function(x){

training <- Auto[x$in_id,]
training$weight <- cut(training$weight,i)

mod <- lm(mpg ~ weight,
         data=training)

labs_x <- levels(mod$model[,2])

labs_x_breaks <- cbind(lower = as.numeric( sub("\\((.+),.*", "\\1", labs_x) ),
                  upper = as.numeric( sub("[^,]*,([^]]*)\\]", "\\1", labs_x) ))


testing <- Auto[-x$in_id,]

weight_new <- cut(testing$weight,c(labs_x_breaks[1,1],labs_x_breaks[,2]))

pred <- predict(mod,
               newdata=list(weight=weight_new))
truth <- testing$mpg

data_eval <- na.omit(data.frame(truth,pred))

rmse <- mlr3measures::rmse(truth = data_eval$truth,
                           response = data_eval$pred
                           )
mae <- mlr3measures::mae(truth = data_eval$truth,
                           response = data_eval$pred
                           )
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)

metric_tangga



# menghitung rata-rata untuk 10 folds
mean_metric_tangga <- colMeans(metric_tangga)

mean_metric_tangga
})
best_tangga <- cbind(d=breaks,best_tangga)
# menampilkan hasil all breaks
best_tangga
##    d     rmse      mae
## 1  2 5.617379 4.355730
## 2  3 4.825537 3.633163
## 3  4 4.716154 3.643745
## 4  5 4.417889 3.256344
## 5  6 4.183642 3.108463
## 6  7 4.296189 3.222659
## 7  8 4.390219 3.283052
## 8  9 4.314847 3.158501
## 9 10 4.339894 3.199744

dengan visualisasi fungsi tangga terbaik yaitu dengan break=6 berikut.

ggplot(Auto,aes(x=weight, y=mpg)) +
                 geom_point(alpha=0.55, color="aquamarine4") +
  stat_smooth(method = "lm", 
               formula = y~cut(x,6), 
               lty = 1, col = "red",se = F)+
  theme_bw()

Regresi Spline

set.seed(1234)
cross_val <- vfold_cv(Auto,v=10,strata = "mpg")

df <- 3:12

best_spline3 <- map_dfr(df, function(i){
metric_spline3 <- map_dfr(cross_val$splits,
    function(x){
mod <- lm(mpg ~ ns(weight,df=i),
         data=Auto[x$in_id,])
pred <- predict(mod,
               newdata=Auto[-x$in_id,])
truth <- Auto[-x$in_id,]$mpg

rmse <- mlr3measures::rmse(truth = truth,
                           response = pred
                           )
mae <- mlr3measures::mae(truth = truth,
                           response = pred
                           )
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)

metric_spline3

# menghitung rata-rata untuk 10 folds
mean_metric_spline3 <- colMeans(metric_spline3)

mean_metric_spline3
}
)

best_spline3 <- cbind(d=df,best_spline3)
# menampilkan hasil all breaks
best_spline3
##     d     rmse      mae
## 1   3 4.158586 3.083967
## 2   4 4.175023 3.094981
## 3   5 4.180656 3.109785
## 4   6 4.172720 3.123068
## 5   7 4.162708 3.089706
## 6   8 4.173852 3.105423
## 7   9 4.176375 3.096280
## 8  10 4.160552 3.096405
## 9  11 4.158496 3.096378
## 10 12 4.167825 3.105770

dengan visualisasi spline terbaik yaitu dengan df=3

attr(ns(Auto$weight, df=3),"knots")
## 33.33333% 66.66667% 
##  2397.000  3333.667
ggplot(Auto,aes(x=weight, y=mpg)) +
                 geom_point(alpha=0.55, color="aquamarine4") +
  stat_smooth(method = "lm", 
               formula = y~ns(x, knots = c(2397.000,  3333.667)), 
               lty = 1,se = F, col="red")

Model Terbaik

nilai_metric <- rbind(best_poly %>% select(-1) %>% slice_min(mae),
                      best_tangga %>% select(-1) %>% slice_min(mae),
                      best_spline3 %>%  select(-1)%>% slice_min(mae)
                      )

nama_model <- c("Polynomial2","Piecewise constant(breaks=6)",
                "Natural cubic spline(df=3)"
)

data.frame(nama_model,nilai_metric)
##                     nama_model     rmse      mae
## 1                  Polynomial2 4.147615 3.080702
## 2 Piecewise constant(breaks=6) 4.183642 3.108463
## 3   Natural cubic spline(df=3) 4.158586 3.083967

Dapat dilihat bahwa model terbaik yaitu polinomial dengan ordo 2 karena memiliki rmse dan mae terkecil.

Visualisasi Model Terbaik (Polinomial ordo 2)

 ggplot(Auto,aes(x=weight, y=mpg)) + 
  geom_point(alpha=0.55, color="aquamarine4") +
  stat_smooth(method = "lm", 
               formula = y~poly(x,2,raw=T), 
               lty = 1, col = "red",se = F)+
  theme_bw()

Model polinomial ini terlihat lebih pas untuk menggambarkan data weight vs mpg dibandingkan dengan visualisasi data model lain yang sudah dicantumkan sebelumnya.

Link RPubs: https://rpubs.com/annissanff/tugas8sta581