Moving Beyond Linearity

library(tidyverse)

## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --

## v ggplot2 3.3.5     v purrr   0.3.4
## v tibble  3.1.4     v dplyr   1.0.7
## v tidyr   1.1.3     v stringr 1.4.0
## v readr   2.0.1     v forcats 0.5.1

## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

library(splines)
library(rsample)
library(ISLR)

A. Data MPG DAN Horsepower

#input data
library(ISLR)
data <- Auto
datampg <- data %>% select(mpg, horsepower)
head(datampg)

##   mpg horsepower
## 1  18        130
## 2  15        165
## 3  18        150
## 4  16        150
## 5  17        140
## 6  15        198

melihat sebaran data di scatter plot antara variabel respon dan variabel penjelas

ggplot(datampg,aes(x=horsepower, y=mpg)) +
                 geom_point(alpha=0.55, color="black") +
  theme_bw()

berdasarkan data terlihat hubungan antara x dan y tidak linear, maka akan dicoba menggunakan regresi polynomial, fungsi tangga, dan regresi spline.

regresi polynomial

set.seed(123)
cross_val <- vfold_cv(datampg,v=10,strata = "mpg")

df <- 2:4

best_poly <- map_dfr(df, function(i){

metric_poly2 <- map_dfr(cross_val$splits,
    function(x){
mod <- lm(mpg ~ poly(horsepower,df = i),
         data=datampg[x$in_id,])
pred <- predict(mod,
               newdata=datampg[-x$in_id,])
truth <- datampg[-x$in_id,]$mpg

rmse <- mlr3measures::rmse(truth = truth,
                           response = pred
                           )
mae <- mlr3measures::mae(truth = truth,
                           response = pred
                           )
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)

metric_poly2

# menghitung rata-rata untuk 10 folds
mean_metric_poly2 <- colMeans(metric_poly2)

mean_metric_poly2

}
)

best_poly <- cbind(df=df,best_poly)
# menampilkan hasil all breaks
best_poly

##   df     rmse      mae
## 1  2 4.351129 3.266460
## 2  3 4.355430 3.266945
## 3  4 4.367009 3.280452

#berdasarkan rmse
best_poly %>% slice_min(rmse)

##   df     rmse     mae
## 1  2 4.351129 3.26646

#berdasarkan mae
best_poly %>% slice_min(mae)

##   df     rmse     mae
## 1  2 4.351129 3.26646

model_poly_df2 <- best_poly %>% slice_min(mae)

regresi spline

set.seed(123)
cross_val <- vfold_cv(datampg,v=10,strata = "mpg")

df <- 3:10

best_spline3 <- map_dfr(df, function(i){
metric_spline3 <- map_dfr(cross_val$splits,
    function(x){
mod <- lm(mpg ~ bs(horsepower,df=i),
         data=datampg[x$in_id,])
pred <- predict(mod,
               newdata=datampg[-x$in_id,])
truth <- datampg[-x$in_id,]$horsepower

rmse <- mlr3measures::rmse(truth = truth,
                           response = pred
                           )
mae <- mlr3measures::mae(truth = truth,
                           response = pred
                           )
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)

metric_spline3

# menghitung rata-rata untuk 10 folds
mean_metric_spline3 <- colMeans(metric_spline3)

mean_metric_spline3
}
)

## Warning in bs(horsepower, degree = 3L, knots = numeric(0), Boundary.knots =
## c(46, : some 'x' values beyond boundary knots may cause ill-conditioned bases

## Warning in bs(horsepower, degree = 3L, knots = c(`50%` = 92.5), Boundary.knots =
## c(46, : some 'x' values beyond boundary knots may cause ill-conditioned bases

## Warning in bs(horsepower, degree = 3L, knots = c(`33.33333%` = 84, `66.66667%` =
## 110: some 'x' values beyond boundary knots may cause ill-conditioned bases

## Warning in bs(horsepower, degree = 3L, knots = c(`25%` = 75, `50%` = 92.5, :
## some 'x' values beyond boundary knots may cause ill-conditioned bases

## Warning in bs(horsepower, degree = 3L, knots = c(`20%` = 72, `40%` = 88, : some
## 'x' values beyond boundary knots may cause ill-conditioned bases

## Warning in bs(horsepower, degree = 3L, knots = c(`16.66667%` = 70, `33.33333%` =
## 84, : some 'x' values beyond boundary knots may cause ill-conditioned bases

## Warning in bs(horsepower, degree = 3L, knots = c(`14.28571%` = 68, `28.57143%`
## = 78.8571428571428, : some 'x' values beyond boundary knots may cause ill-
## conditioned bases

## Warning in bs(horsepower, degree = 3L, knots = c(`12.5%` = 67, `25%` = 75, :
## some 'x' values beyond boundary knots may cause ill-conditioned bases

best_spline3 <- cbind(df=df,best_spline3)
# menampilkan hasil all breaks
best_spline3

##   df     rmse      mae
## 1  3 92.41658 81.02794
## 2  4 92.42895 81.03525
## 3  5 92.42668 81.03694
## 4  6 92.42253 81.03160
## 5  7 92.42530 81.02817
## 6  8 92.43221 81.03391
## 7  9 92.43452 81.03576
## 8 10 92.42940 81.02872

#berdasarkan rmse
best_spline3 %>% slice_min(rmse)

##   df     rmse      mae
## 1  3 92.41658 81.02794

#berdasarkan mae
best_spline3 %>% slice_min(mae)

##   df     rmse      mae
## 1  3 92.41658 81.02794

model_spline_df10 <- best_spline3 %>% slice_min(mae)

fungsi tangga

# cross validation
set.seed(123)
cross_val <- vfold_cv(datampg,v=10,strata = "mpg")

breaks <- 2:10

best_tangga <- map_dfr(breaks, function(i){

metric_tangga <- map_dfr(cross_val$splits,
    function(x){

training <- datampg[x$in_id,]
training$horsepower <- cut(training$horsepower,i)

mod <- lm(mpg ~ horsepower,
         data=training)

labs_x <- levels(mod$model[,2])

labs_x_breaks <- cbind(lower = as.numeric( sub("\\((.+),.*", "\\1", labs_x) ),
                  upper = as.numeric( sub("[^,]*,([^]]*)\\]", "\\1", labs_x) ))


testing <- datampg[-x$in_id,]

horsepower_new <- cut(testing$horsepower,c(labs_x_breaks[1,1],labs_x_breaks[,2]))

pred <- predict(mod,
               newdata=list(horsepower=horsepower_new))
truth <- testing$mpg

data_eval <- na.omit(data.frame(truth,pred))

rmse <- mlr3measures::rmse(truth = data_eval$truth,
                           response = data_eval$pred
                           )
mae <- mlr3measures::mae(truth = data_eval$truth,
                           response = data_eval$pred
                           )
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)

metric_tangga



# menghitung rata-rata untuk 10 folds
mean_metric_tangga <- colMeans(metric_tangga)

mean_metric_tangga
})
best_tangga <- cbind(df=breaks,best_tangga)
# menampilkan hasil all breaks
best_tangga

##   df     rmse      mae
## 1  2 6.147626 4.790234
## 2  3 5.775792 4.521829
## 3  4 4.985270 3.774549
## 4  5 4.712623 3.571614
## 5  6 4.644383 3.548375
## 6  7 4.554116 3.379980
## 7  8 4.437597 3.405433
## 8  9 4.549668 3.471999
## 9 10 4.589172 3.455531

#berdasarkan rmse
best_tangga %>% slice_min(rmse)

##   df     rmse      mae
## 1  8 4.437597 3.405433

#berdasarkan mae
best_tangga %>% slice_min(mae)

##   df     rmse     mae
## 1  7 4.554116 3.37998

model_tangga_k7 <- best_tangga %>% slice_min(mae)
model_tangga_k8 <- best_tangga %>% slice_min(rmse)

regresi natural spline

set.seed(123)
cross_val <- vfold_cv(datampg,v=10,strata = "mpg")

df <- 3:10

best_spline3 <- map_dfr(df, function(i){
metric_spline3 <- map_dfr(cross_val$splits,
    function(x){
mod <- lm(mpg ~ ns(horsepower,df=i),
         data=datampg[x$in_id,])
pred <- predict(mod,
               newdata=datampg[-x$in_id,])
truth <- datampg[-x$in_id,]$mpg

rmse <- mlr3measures::rmse(truth = truth,
                           response = pred
                           )
mae <- mlr3measures::mae(truth = truth,
                           response = pred
                           )
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)

metric_spline3

# menghitung rata-rata untuk 10 folds
mean_metric_spline3 <- colMeans(metric_spline3)

mean_metric_spline3
}
)

best_spline3 <- cbind(df=df,best_spline3)
# menampilkan hasil all breaks
best_spline3

##   df     rmse      mae
## 1  3 4.348006 3.261538
## 2  4 4.329461 3.257926
## 3  5 4.313312 3.246412
## 4  6 4.310011 3.237990
## 5  7 4.294671 3.229174
## 6  8 4.293369 3.232221
## 7  9 4.281877 3.211860
## 8 10 4.260316 3.192010

#berdasarkan rmse
best_spline3 %>% slice_min(rmse)

##   df     rmse     mae
## 1 10 4.260316 3.19201

#berdasarkan mae
best_spline3 %>% slice_min(mae)

##   df     rmse     mae
## 1 10 4.260316 3.19201

model_spline_df10n <- best_spline3 %>% slice_min(mae)

perbandingan model

nilai_metric <- rbind(model_poly_df2,
                      model_tangga_k7,
                      model_tangga_k8,
                      model_spline_df10,
                      model_spline_df10n) 
nama_model <- c("Regresi Polynomial (df=2)",
                "Fungsi Tangga (k=7)",
                "Fungsi Tangga (k=8)",
                "Regresi Spline (df=3)",
                "Regresi Natural Spline (df=10)"
                )

data.frame(nama_model,nilai_metric)

##                       nama_model df      rmse       mae
## 1      Regresi Polynomial (df=2)  2  4.351129  3.266460
## 2            Fungsi Tangga (k=7)  7  4.554116  3.379980
## 3            Fungsi Tangga (k=8)  8  4.437597  3.405433
## 4          Regresi Spline (df=3)  3 92.416576 81.027940
## 5 Regresi Natural Spline (df=10) 10  4.260316  3.192010

berikut bentuk kurva model terbaik untuk masing - masing metode . polynom df=2

  ggplot(datampg,aes(x=horsepower, y=mpg)) + 
  geom_point(alpha=0.55, color="black") +
  stat_smooth(method = "lm", 
               formula = y~poly(x,2,raw=T), 
               lty = 1, col = "blue",se = F)+
  theme_bw()

Fungsi Tangga (k=7)

ggplot(datampg,aes(x=horsepower, y=mpg)) +
                 geom_point(alpha=0.55, color="black") +
  stat_smooth(method = "lm", 
               formula = y~cut(x,7), 
               lty = 1, col = "blue",se = F)+
  theme_bw()

Fungsi Tangga (k=8)

ggplot(datampg,aes(x=horsepower, y=mpg)) +
                 geom_point(alpha=0.55, color="black") +
  stat_smooth(method = "lm", 
               formula = y~cut(x,8), 
               lty = 1, col = "blue",se = F)+
  theme_bw()

Regresi Spline (df=3)

ggplot(datampg,aes(x=horsepower, y=mpg)) +
                 geom_point(alpha=0.55, color="black")+
    stat_smooth(method = "lm", 
               formula = y~bs(x,3), 
               lty = 1,se=F)

Regresi Natural Spline (df=10)

ggplot(datampg,aes(x=horsepower, y=mpg)) +
                 geom_point(alpha=0.55, color="black")+
    stat_smooth(method = "lm", 
               formula = y~ns(x,10), 
               lty = 1,se=F)

Berdasarkan hasil analisis model Regresi Polynomial (df=2) merupakan model terbaik karena memiliki nilai RMSE dan MAE yang cukup kecil dan bentuk plotnya yang lebih mulus dibandingkan dengan model-model yang lain

B. Data MPG dan WEIGHT

#input data
library(ISLR)
data <- Auto
dataw <- data %>% select(mpg, weight)
head(dataw)

##   mpg weight
## 1  18   3504
## 2  15   3693
## 3  18   3436
## 4  16   3433
## 5  17   3449
## 6  15   4341

melihat sebaran data di scatter plot

ggplot(dataw,aes(x=weight, y=mpg)) +
                 geom_point(alpha=0.55, color="black") +
  theme_bw()

berdasarkan data terlihat hubungan antara x dan y tidak linear, maka akan dicoba menggunakan regresi polynomial, fungsi tangga, dan regresi spline, dan regresi natural spline.

regresi polynomial

set.seed(123)
cross_val <- vfold_cv(dataw,v=10,strata = "mpg")

df <- 2:4

best_poly <- map_dfr(df, function(i){

metric_poly2 <- map_dfr(cross_val$splits,
    function(x){
mod <- lm(mpg ~ poly(weight,df = i),
         data=dataw[x$in_id,])
pred <- predict(mod,
               newdata=dataw[-x$in_id,])
truth <- dataw[-x$in_id,]$weight

rmse <- mlr3measures::rmse(truth = truth,
                           response = pred
                           )
mae <- mlr3measures::mae(truth = truth,
                           response = pred
                           )
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)

metric_poly2

# menghitung rata-rata untuk 10 folds
mean_metric_poly2 <- colMeans(metric_poly2)

mean_metric_poly2

}
)

best_poly <- cbind(df=df,best_poly)
# menampilkan hasil all breaks
best_poly

##   df     rmse      mae
## 1  2 3074.796 2954.365
## 2  3 3074.796 2954.367
## 3  4 3074.796 2954.368

#berdasarkan rmse
best_poly %>% slice_min(rmse)

##   df     rmse      mae
## 1  4 3074.796 2954.368

#berdasarkan mae
best_poly %>% slice_min(mae)

##   df     rmse      mae
## 1  2 3074.796 2954.365

model_poly_df2 <- best_poly %>% slice_min(mae)

fungsi tangga

# cross validation
set.seed(123)
cross_val <- vfold_cv(dataw,v=10,strata = "mpg")

breaks <- 2:10

best_tangga <- map_dfr(breaks, function(i){

metric_tangga <- map_dfr(cross_val$splits,
    function(x){

training <- dataw[x$in_id,]
training$weight <- cut(training$weight,i)

mod <- lm(mpg ~ weight,
         data=training)

labs_x <- levels(mod$model[,2])

labs_x_breaks <- cbind(lower = as.numeric( sub("\\((.+),.*", "\\1", labs_x) ),
                  upper = as.numeric( sub("[^,]*,([^]]*)\\]", "\\1", labs_x) ))


testing <- dataw[-x$in_id,]

weight_new <- cut(testing$weight,c(labs_x_breaks[1,1],labs_x_breaks[,2]))

pred <- predict(mod,
               newdata=list(weight=weight_new))
truth <- testing$mpg

data_eval <- na.omit(data.frame(truth,pred))

rmse <- mlr3measures::rmse(truth = data_eval$truth,
                           response = data_eval$pred
                           )
mae <- mlr3measures::mae(truth = data_eval$truth,
                           response = data_eval$pred
                           )
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)

metric_tangga



# menghitung rata-rata untuk 10 folds
mean_metric_tangga <- colMeans(metric_tangga)

mean_metric_tangga
})
best_tangga <- cbind(df=breaks,best_tangga)
# menampilkan hasil all breaks
best_tangga

##   df     rmse      mae
## 1  2 5.570088 4.345368
## 2  3 4.868847 3.657505
## 3  4 4.706593 3.633584
## 4  5 4.389515 3.221556
## 5  6 4.201264 3.112221
## 6  7 4.279395 3.191726
## 7  8 4.370964 3.249765
## 8  9 4.341743 3.182396
## 9 10 4.329114 3.176390

#berdasarkan rmse
best_tangga %>% slice_min(rmse)

##   df     rmse      mae
## 1  6 4.201264 3.112221

#berdasarkan mae
best_tangga %>% slice_min(mae)

##   df     rmse      mae
## 1  6 4.201264 3.112221

model_tangga_k7 <- best_tangga %>% slice_min(mae)
model_tangga_k8 <- best_tangga %>% slice_min(rmse)

regresi spline

set.seed(123)
cross_val <- vfold_cv(dataw,v=10,strata = "mpg")

df <- 3:10

best_spline3 <- map_dfr(df, function(i){
metric_spline3 <- map_dfr(cross_val$splits,
    function(x){
mod <- lm(mpg ~ bs(weight,df=i),
         data=dataw[x$in_id,])
pred <- predict(mod,
               newdata=dataw[-x$in_id,])
truth <- dataw[-x$in_id,]$weight

rmse <- mlr3measures::rmse(truth = truth,
                           response = pred
                           )
mae <- mlr3measures::mae(truth = truth,
                           response = pred
                           )
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)

metric_spline3

# menghitung rata-rata untuk 10 folds
mean_metric_spline3 <- colMeans(metric_spline3)

mean_metric_spline3
}
)

## Warning in bs(weight, degree = 3L, knots = numeric(0), Boundary.knots =
## c(1613, : some 'x' values beyond boundary knots may cause ill-conditioned bases

## Warning in bs(weight, degree = 3L, knots = numeric(0), Boundary.knots =
## c(1649, : some 'x' values beyond boundary knots may cause ill-conditioned bases

## Warning in bs(weight, degree = 3L, knots = c(`50%` = 2792.5), Boundary.knots =
## c(1613, : some 'x' values beyond boundary knots may cause ill-conditioned bases

## Warning in bs(weight, degree = 3L, knots = c(`50%` = 2835), Boundary.knots =
## c(1649, : some 'x' values beyond boundary knots may cause ill-conditioned bases

## Warning in bs(weight, degree = 3L, knots = c(`33.33333%` = 2391, `66.66667%` =
## 3302: some 'x' values beyond boundary knots may cause ill-conditioned bases

## Warning in bs(weight, degree = 3L, knots = c(`33.33333%` = 2406, `66.66667%`
## = 3380.66666666667: some 'x' values beyond boundary knots may cause ill-
## conditioned bases

## Warning in bs(weight, degree = 3L, knots = c(`25%` = 2227.5, `50%` = 2792.5, :
## some 'x' values beyond boundary knots may cause ill-conditioned bases

## Warning in bs(weight, degree = 3L, knots = c(`25%` = 2234, `50%` = 2835, : some
## 'x' values beyond boundary knots may cause ill-conditioned bases

## Warning in bs(weight, degree = 3L, knots = c(`20%` = 2155, `40%` = 2577.8, :
## some 'x' values beyond boundary knots may cause ill-conditioned bases

## Warning in bs(weight, degree = 3L, knots = c(`20%` = 2166.8, `40%` = 2604, :
## some 'x' values beyond boundary knots may cause ill-conditioned bases

## Warning in bs(weight, degree = 3L, knots = c(`16.66667%` = 2125, `33.33333%` =
## 2391, : some 'x' values beyond boundary knots may cause ill-conditioned bases

## Warning in bs(weight, degree = 3L, knots = c(`16.66667%` = 2130, `33.33333%` =
## 2406, : some 'x' values beyond boundary knots may cause ill-conditioned bases

## Warning in bs(weight, degree = 3L, knots = c(`14.28571%` = 2076.42857142857, :
## some 'x' values beyond boundary knots may cause ill-conditioned bases

## Warning in bs(weight, degree = 3L, knots = c(`14.28571%` = 2102.28571428571, :
## some 'x' values beyond boundary knots may cause ill-conditioned bases

## Warning in bs(weight, degree = 3L, knots = c(`12.5%` = 2049.375, `25%` =
## 2227.5, : some 'x' values beyond boundary knots may cause ill-conditioned bases

## Warning in bs(weight, degree = 3L, knots = c(`12.5%` = 2051, `25%` = 2234, :
## some 'x' values beyond boundary knots may cause ill-conditioned bases

best_spline3 <- cbind(df=df,best_spline3)
# menampilkan hasil all breaks
best_spline3

##   df     rmse      mae
## 1  3 3074.796 2954.367
## 2  4 3074.795 2954.368
## 3  5 3074.793 2954.365
## 4  6 3074.798 2954.371
## 5  7 3074.797 2954.372
## 6  8 3074.800 2954.372
## 7  9 3074.798 2954.367
## 8 10 3074.787 2954.355

#berdasarkan rmse
best_spline3 %>% slice_min(rmse)

##   df     rmse      mae
## 1 10 3074.787 2954.355

#berdasarkan mae
best_spline3 %>% slice_min(mae)

##   df     rmse      mae
## 1 10 3074.787 2954.355

model_spline_df10 <- best_spline3 %>% slice_min(mae)

regresi natural spline

set.seed(123)
cross_val <- vfold_cv(dataw,v=10,strata = "mpg")

df <- 3:10

best_spline3 <- map_dfr(df, function(i){
metric_spline3 <- map_dfr(cross_val$splits,
    function(x){
mod <- lm(mpg ~ ns(weight,df=i),
         data=dataw[x$in_id,])
pred <- predict(mod,
               newdata=dataw[-x$in_id,])
truth <- dataw[-x$in_id,]$weight

rmse <- mlr3measures::rmse(truth = truth,
                           response = pred
                           )
mae <- mlr3measures::mae(truth = truth,
                           response = pred
                           )
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)

metric_spline3

# menghitung rata-rata untuk 10 folds
mean_metric_spline3 <- colMeans(metric_spline3)

mean_metric_spline3
}
)

best_spline3 <- cbind(df=df,best_spline3)
# menampilkan hasil all breaks
best_spline3

##   df     rmse      mae
## 1  3 3074.798 2954.369
## 2  4 3074.795 2954.367
## 3  5 3074.793 2954.365
## 4  6 3074.797 2954.367
## 5  7 3074.796 2954.368
## 6  8 3074.789 2954.362
## 7  9 3074.794 2954.365
## 8 10 3074.786 2954.358

#berdasarkan rmse
best_spline3 %>% slice_min(rmse)

##   df     rmse      mae
## 1 10 3074.786 2954.358

#berdasarkan mae
best_spline3 %>% slice_min(mae)

##   df     rmse      mae
## 1 10 3074.786 2954.358

model_spline_df10n <- best_spline3 %>% slice_min(mae)

perbandingan model

nilai_metric <- rbind(model_poly_df2,
                      model_tangga_k7,
                      model_spline_df10,
                      model_spline_df10n) 
nama_model <- c("Regresi Polynomial (df=2)",
                "Fungsi Tangga (k=6)",
                "Regresi spline (df=10)",
                "Regresi Natural Spline (df=10)"
                )

data.frame(nama_model,nilai_metric)

##                       nama_model df        rmse         mae
## 1      Regresi Polynomial (df=2)  2 3074.796402 2954.365281
## 2            Fungsi Tangga (k=6)  6    4.201264    3.112221
## 3         Regresi spline (df=10) 10 3074.787064 2954.354640
## 4 Regresi Natural Spline (df=10) 10 3074.785702 2954.357867

bentuk kurvanya sebagai berikut. polynom df=2

  ggplot(dataw,aes(x=weight, y=mpg)) + 
  geom_point(alpha=0.55, color="black") +
  stat_smooth(method = "lm", 
               formula = y~poly(x,2,raw=T), 
               lty = 1, col = "blue",se = F)+
  theme_bw()

Fungsi Tangga (k=6)

ggplot(dataw,aes(x=weight, y=mpg)) +
                 geom_point(alpha=0.55, color="black") +
  stat_smooth(method = "lm", 
               formula = y~cut(x,6), 
               lty = 1, col = "blue",se = F)+
  theme_bw()

Regresi Spline (df=8)

ggplot(dataw,aes(x=weight, y=mpg)) +
                 geom_point(alpha=0.55, color="black")+
    stat_smooth(method = "lm", 
               formula = y~bs(x,8), 
               lty = 1,se=F)

Regresi Naturan Spline (df=10)

ggplot(dataw,aes(x=weight, y=mpg)) +
                 geom_point(alpha=0.55, color="black")+
    stat_smooth(method = "lm", 
               formula = y~ns(x,10), 
               lty = 1,se=F)

berdasarkan hasil analisis model dari fungsi tangga dengan nilai knots = 6 merupakan model terbaik karena memiliki nilai RMSE dan MAE terkecil dan bentuk dari kurvanya cenderung lebih mulus

C. Data MPG dan Displecement

#input data
library(ISLR)
data <- Auto
datad <- data %>% select(mpg, displacement)
head(datad)

##   mpg displacement
## 1  18          307
## 2  15          350
## 3  18          318
## 4  16          304
## 5  17          302
## 6  15          429

melihat sebaran data di scatter plot

ggplot(datad,aes(x=displacement, y=mpg)) +
                 geom_point(alpha=0.55, color="black") +
  theme_bw()

berdasarkan data terlihat hubungan antara x dan y tidak linear, maka akan dicoba menggunakan regresi polynomial, fungsi tangga, dan regresi spline.

regresi polynomial

set.seed(123)
cross_val <- vfold_cv(datad,v=10,strata = "mpg")

df <- 2:4

best_poly <- map_dfr(df, function(i){

metric_poly2 <- map_dfr(cross_val$splits,
    function(x){
mod <- lm(mpg ~ poly(displacement,df = i),
         data=datad[x$in_id,])
pred <- predict(mod,
               newdata=datad[-x$in_id,])
truth <- datad[-x$in_id,]$mpg

rmse <- mlr3measures::rmse(truth = truth,
                           response = pred
                           )
mae <- mlr3measures::mae(truth = truth,
                           response = pred
                           )
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)

metric_poly2

# menghitung rata-rata untuk 10 folds
mean_metric_poly2 <- colMeans(metric_poly2)

mean_metric_poly2

}
)

best_poly <- cbind(df=df,best_poly)
# menampilkan hasil all breaks
best_poly

##   df     rmse      mae
## 1  2 4.283164 3.159670
## 2  3 4.284465 3.165588
## 3  4 4.288491 3.164067

#berdasarkan rmse
best_poly %>% slice_min(rmse)

##   df     rmse     mae
## 1  2 4.283164 3.15967

#berdasarkan mae
best_poly %>% slice_min(mae)

##   df     rmse     mae
## 1  2 4.283164 3.15967

model_poly_df2 <- best_poly %>% slice_min(mae)

fungsi tangga

# cross validation
set.seed(123)
cross_val <- vfold_cv(datad,v=10,strata = "mpg")

breaks <- 2:10

best_tangga <- map_dfr(breaks, function(i){

metric_tangga <- map_dfr(cross_val$splits,
    function(x){

training <- datad[x$in_id,]
training$displacement <- cut(training$displacement,i)

mod <- lm(mpg ~ displacement,
         data=training)

labs_x <- levels(mod$model[,2])

labs_x_breaks <- cbind(lower = as.numeric( sub("\\((.+),.*", "\\1", labs_x) ),
                  upper = as.numeric( sub("[^,]*,([^]]*)\\]", "\\1", labs_x) ))


testing <- datad[-x$in_id,]

displacement_new <- cut(testing$displacement,c(labs_x_breaks[1,1],labs_x_breaks[,2]))

pred <- predict(mod,
               newdata=list(displacement=displacement_new))
truth <- testing$mpg

data_eval <- na.omit(data.frame(truth,pred))

rmse <- mlr3measures::rmse(truth = data_eval$truth,
                           response = data_eval$pred
                           )
mae <- mlr3measures::mae(truth = data_eval$truth,
                           response = data_eval$pred
                           )
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)

metric_tangga



# menghitung rata-rata untuk 10 folds
mean_metric_tangga <- colMeans(metric_tangga)

mean_metric_tangga
})
best_tangga <- cbind(df=breaks,best_tangga)
# menampilkan hasil all breaks
best_tangga

##   df     rmse      mae
## 1  2 5.915956 4.627152
## 2  3 4.865081 3.676866
## 3  4 4.813915 3.619919
## 4  5 4.661119 3.496662
## 5  6 4.561127 3.389387
## 6  7 4.552850 3.372940
## 7  8 4.363903 3.230513
## 8  9 4.218350 3.109206
## 9 10 4.194571 3.097485

#berdasarkan rmse
best_tangga %>% slice_min(rmse)

##   df     rmse      mae
## 1 10 4.194571 3.097485

#berdasarkan mae
best_tangga %>% slice_min(mae)

##   df     rmse      mae
## 1 10 4.194571 3.097485

model_tangga_k7 <- best_tangga %>% slice_min(mae)
model_tangga_k8 <- best_tangga %>% slice_min(rmse)

regresi spline

set.seed(123)
cross_val <- vfold_cv(datad,v=10,strata = "mpg")

df <- 4:10

best_spline3 <- map_dfr(df, function(i){
metric_spline3 <- map_dfr(cross_val$splits,
    function(x){
mod <- lm(mpg ~ bs(displacement,df=i),
         data=datad[x$in_id,])
pred <- predict(mod,
               newdata=datad[-x$in_id,])
truth <- datad[-x$in_id,]$displacement

rmse <- mlr3measures::rmse(truth = truth,
                           response = pred
                           )
mae <- mlr3measures::mae(truth = truth,
                           response = pred
                           )
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)

metric_spline3

# menghitung rata-rata untuk 10 folds
mean_metric_spline3 <- colMeans(metric_spline3)

mean_metric_spline3
}
)

## Warning in bs(displacement, degree = 3L, knots = c(`50%` = 148.5),
## Boundary.knots = c(70, : some 'x' values beyond boundary knots may cause ill-
## conditioned bases

## Warning in bs(displacement, degree = 3L, knots = c(`33.33333%` = 119,
## `66.66667%` = 232: some 'x' values beyond boundary knots may cause ill-
## conditioned bases

## Warning in bs(displacement, degree = 3L, knots = c(`25%` = 104.75, `50%` =
## 148.5, : some 'x' values beyond boundary knots may cause ill-conditioned bases

## Warning in bs(displacement, degree = 3L, knots = c(`20%` = 98, `40%` = 121.4, :
## some 'x' values beyond boundary knots may cause ill-conditioned bases

## Warning in bs(displacement, degree = 3L, knots = c(`16.66667%` = 97, `33.33333%`
## = 119, : some 'x' values beyond boundary knots may cause ill-conditioned bases

## Warning in bs(displacement, degree = 3L, knots = c(`14.28571%` = 97, `28.57143%`
## = 108.571428571429, : some 'x' values beyond boundary knots may cause ill-
## conditioned bases

## Warning in bs(displacement, degree = 3L, knots = c(`12.5%` = 91, `25%` =
## 104.75, : some 'x' values beyond boundary knots may cause ill-conditioned bases

best_spline3 <- cbind(df=df,best_spline3)
# menampilkan hasil all breaks
best_spline3

##   df     rmse      mae
## 1  4 203.4562 170.8607
## 2  5 203.4477 170.8566
## 3  6 203.4635 170.8661
## 4  7 203.4620 170.8728
## 5  8 203.4630 170.8735
## 6  9 203.4628 170.8710
## 7 10 203.4672 170.8799

#berdasarkan rmse
best_spline3 %>% slice_min(rmse)

##   df     rmse      mae
## 1  5 203.4477 170.8566

#berdasarkan mae
best_spline3 %>% slice_min(mae)

##   df     rmse      mae
## 1  5 203.4477 170.8566

model_spline_df10 <- best_spline3 %>% slice_min(mae)

regresi natural spline

set.seed(123)
cross_val <- vfold_cv(datad,v=10,strata = "mpg")

df <- 3:10

best_spline3 <- map_dfr(df, function(i){
metric_spline3 <- map_dfr(cross_val$splits,
    function(x){
mod <- lm(mpg ~ ns(displacement,df=i),
         data=datad[x$in_id,])
pred <- predict(mod,
               newdata=datad[-x$in_id,])
truth <- datad[-x$in_id,]$displacement

rmse <- mlr3measures::rmse(truth = truth,
                           response = pred
                           )
mae <- mlr3measures::mae(truth = truth,
                           response = pred
                           )
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)

metric_spline3

# menghitung rata-rata untuk 10 folds
mean_metric_spline3 <- colMeans(metric_spline3)

mean_metric_spline3
}
)

best_spline3 <- cbind(df=df,best_spline3)
# menampilkan hasil all breaks
best_spline3

##   df     rmse      mae
## 1  3 203.4609 170.8690
## 2  4 203.4506 170.8558
## 3  5 203.4583 170.8610
## 4  6 203.4630 170.8632
## 5  7 203.4539 170.8633
## 6  8 203.4566 170.8635
## 7  9 203.4734 170.8807
## 8 10 203.4635 170.8657

#berdasarkan rmse
best_spline3 %>% slice_min(rmse)

##   df     rmse      mae
## 1  4 203.4506 170.8558

#berdasarkan mae
best_spline3 %>% slice_min(mae)

##   df     rmse      mae
## 1  4 203.4506 170.8558

model_spline_df10n <- best_spline3 %>% slice_min(mae)

perbandingan model

nilai_metric <- rbind(model_poly_df2,
                      model_tangga_k7,
                      model_spline_df10,
                      model_spline_df10n) 
nama_model <- c("Regresi Polynomial (df=2)",
                "Fungsi Tangga (k=10)",
                "Regresi spline (df=5)",
                "Regresi Natural Spline (df=4)"
                )

data.frame(nama_model,nilai_metric)

##                      nama_model df       rmse        mae
## 1     Regresi Polynomial (df=2)  2   4.283164   3.159670
## 2          Fungsi Tangga (k=10) 10   4.194571   3.097485
## 3         Regresi spline (df=5)  5 203.447721 170.856585
## 4 Regresi Natural Spline (df=4)  4 203.450620 170.855830

berikut bentuk kurva model terbaik untuk masing - masing metode . polynom df=2

  ggplot(datad,aes(x=displacement, y=mpg)) + 
  geom_point(alpha=0.55, color="black") +
  stat_smooth(method = "lm", 
               formula = y~poly(x,2,raw=T), 
               lty = 1, col = "blue",se = F)+
  theme_bw()

Fungsi Tangga (k=10)

ggplot(datad,aes(x=displacement, y=mpg)) +
                 geom_point(alpha=0.55, color="black") +
  stat_smooth(method = "lm", 
               formula = y~cut(x,10), 
               lty = 1, col = "blue",se = F)+
  theme_bw()

Regresi Spline (df=5)

ggplot(datad,aes(x=displacement, y=mpg)) +
                 geom_point(alpha=0.55, color="black")+
    stat_smooth(method = "lm", 
               formula = y~bs(x,5), 
               lty = 1,se=F)

Regresi Natural Spline (df=4)

ggplot(datad,aes(x=displacement, y=mpg)) +
                 geom_point(alpha=0.55, color="black")+
    stat_smooth(method = "lm", 
               formula = y~ns(x,4), 
               lty = 1,se=F)

berdasarkan hasil analisis model, model dari regresi polinomial dengan df=2 merupakan model terbaik karena memiliki nilai RMSE dan MAE yang kecil dan bentuk dari kurvanya cenderung lebih mulus.