Praktikum STA581

Tugas Pert. 8

Library

Berikut adalah daftar library() yang digunakan.

library(ISLR)
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5     v purrr   0.3.4
## v tibble  3.1.4     v dplyr   1.0.7
## v tidyr   1.1.3     v stringr 1.4.0
## v readr   2.0.1     v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(dplyr)
library(MultiKink)
library(splines)
library(rsample)
library(mlr3measures)
## In order to avoid name clashes, do not attach 'mlr3measures'. Instead, only load the namespace with `requireNamespace("mlrmeasures")` and access the measures directly via `::`, e.g. `mlr3measures::auc()`.
library(ggplot2)
library(gridExtra)
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine

Data

attach(Auto)
## The following object is masked from package:ggplot2:
## 
##     mpg
head(cbind(mpg, horsepower,acceleration,weight))
##      mpg horsepower acceleration weight
## [1,]  18        130         12.0   3504
## [2,]  15        165         11.5   3693
## [3,]  18        150         11.0   3436
## [4,]  16        150         12.0   3433
## [5,]  17        140         10.5   3449
## [6,]  15        198         10.0   4341

Fungsi

# Fungsi untuk 3 model

# Fungsi CV Polynomial Regression
CVPoly <- function(Data, pangkat, k){
cross_val <- vfold_cv(Data, v=k)

metric_poly <- function(x=cross_val$splits, pangkat){
  result <- data.frame()
  
  for(foldth in 1:k){
    x_data <- x[[foldth]]
    
    # 1. memodelkan data latih cv fold ke-k
    mod <- lm(mpg ~ poly(exp, pangkat, raw = T), data=Data[x_data$in_id,])
    
    # 2. predict data uji ke-k
    pred <- predict(mod, newdata=Data[-x_data$in_id,])
    
    # 3. data uji 
    truth <- Data[-x_data$in_id,]$mpg
    
    # 4. hitung rmse
    rmse <- mlr3measures::rmse(truth = truth, response = pred)
    
    # 5. hitung mae
    #mae <- mlr3measures::mae(truth = truth, response = pred)
    
    # 6. hitung aic
    aic <- AIC(mod)
    
    # 7. add rmse and aic to vector
    metric <- data.frame(RMSE = rmse, AIC = aic)
    
    result <- rbind(result,metric)
  }
  
  return(colMeans(result))
  
}
Polinomial <- data.frame()
poli <- data.frame()
for(i in pangkat){
  poli <- data.frame(RMSE = metric_poly(cross_val$splits,i)[1],
                     AIC = metric_poly(cross_val$splits,i)[2],
                     row.names = NULL)
  Polinomial <- rbind(Polinomial,poli)
}
Polinomial <- cbind(pangkat, Polinomial)
return(Polinomial)
}

# Fungsi CV Piecewise Constant
CVPieCons <- function(Data, breaks, k){
  cross_val <- vfold_cv(Data, v=k)
  
  best_tangga <- map_dfr(breaks, function(i) {
    metric_tangga <- map_dfr(cross_val$splits, function(x){
      training <- Data[x$in_id,]
      training$exp <- cut(training$exp, i)
      
      mod <- lm(mpg ~ exp, data=training)
      
      labs_x <- levels(mod$model[,2])
      
      labs_x_breaks <- cbind(
        lower = as.numeric( sub("\\((.+),.*", "\\1", labs_x) ),
        upper = as.numeric( sub("[^,]*,([^]]*)\\]", "\\1", labs_x))
      )
      
      testing <- Data[-x$in_id,]
      exp_new <- cut(testing$exp, c(labs_x_breaks[1,1], labs_x_breaks[,2]))
      
      pred <- predict(mod, newdata=list(exp=exp_new))
      truth <- testing$mpg
      data_eval <- na.omit(data.frame(truth,pred))
      
      RMSE <- mlr3measures::rmse(truth = data_eval$truth, response = data_eval$pred)
      AIC <- AIC(mod)
      #mae <- mlr3measures::mae(truth = data_eval$truth, response = data_eval$pred)
      metric <- c(RMSE,AIC)
      names(metric) <- c("RMSE","AIC")
      return(metric)
    })
    
    # menghitung rata-rata untuk 10 folds
    mean_metric_tangga <- colMeans(metric_tangga)
    return(mean_metric_tangga)
  })
  best_tangga <- cbind(breaks=breaks,best_tangga)
  return(best_tangga)
}
  
# Fungsi CV Natural Cubic Splines
CVNatCubicSpl <- function(Data, df, k){
  cross_val <- vfold_cv(Data, v=k)
  best_spline3 <- map_dfr(df, function(i){
    metric_spline3 <- map_dfr(cross_val$splits, function(x){
      mod <- lm(mpg ~ ns(exp, df=i), data=Data[x$in_id,])
      pred <- predict(mod, newdata=Data[-x$in_id,])
      truth <- Data[-x$in_id,]$mpg
      
      RMSE <- mlr3measures::rmse(truth = truth, response = pred)
      #mae <- mlr3measures::mae(truth = truth, response = pred)
      AIC <- AIC(mod)
      #mae <- mlr3measures::mae(truth = data_eval$truth, response = data_eval$pred)
      metric <- c(RMSE,AIC)
      names(metric) <- c("RMSE","AIC")
      return(metric)
    })
    # menghitung rata-rata untuk 10 folds
    mean_metric_spline3 <- colMeans(metric_spline3)
    return(mean_metric_spline3)
  })
  best_spline3 <- cbind(df=df, best_spline3)
  return(best_spline3)
}

mpg vs horsepower

Data

Auto_h <- data.frame()
mpg <- Auto$mpg
exp <- Auto$horsepower
Auto_h <- data.frame(mpg,exp)
head(Auto_h)
##   mpg exp
## 1  18 130
## 2  15 165
## 3  18 150
## 4  16 150
## 5  17 140
## 6  15 198

Plot

# Plot Data mpg vs horsepower
plot(Auto$horsepower, Auto$mpg, pch=19, col="black",
     xlab="horsepower", ylab="mile per gallon")

Pemodelan

set.seed(123)

# Hasil Polynomial Regression
Polinomial <- CVPoly(Auto_h, 2:8, 10)
Polinomial
##   pangkat     RMSE      AIC
## 1       2 4.328239 2047.195
## 2       3 4.328164 2048.395
## 3       4 4.327918 2049.059
## 4       5 4.281743 2042.481
## 5       6 4.274428 2040.807
## 6       7 4.255062 2039.645
## 7       8 4.267810 2041.321
(Best_RMSE <- (Polinomial %>% arrange(RMSE))[1,])
##   pangkat     RMSE      AIC
## 1       7 4.255062 2039.645
(Best_AIC <- (Polinomial %>% arrange(AIC))[1,])
##   pangkat     RMSE      AIC
## 1       7 4.255062 2039.645

Berdasarkan nilai RMSE dan AIC terkecil, maka model Polynomial Regression yang paling baik adalah model dengan derajat 7. Namun terlihat pada plot model dengan derajat 7 terdapat kurva yang tidak teratur pada bagian ujung-ujung. Selain itu, perbedaan RMSE dan AIC antar derajat polinomial yang digunakan tidak jauh berbeda. Oleh karena itu, dipilih model terbaik yang memiliki RMSE dan AIC terkecil dengan kurva yang teratur pada bagian ujung-ujungnya, yaitu model dengan derajat 4.

BestPoly.h <- Polinomial[Polinomial$pangkat==4,2:3]
BestPoly.h
##       RMSE      AIC
## 3 4.327918 2049.059
p.poly.h <- ggplot(Auto_h, aes(x=exp, y=mpg)) + 
  geom_point(alpha=0.55, color="black") +
  stat_smooth(method = "lm", formula = y ~ poly(x, 7), lty = 1, col = "blue",se = F) +
  labs(title = "mpg vs horsepower", subtitle = "Polynomial d = 7") +
  theme_bw()
p.polybest.h <- ggplot(Auto_h, aes(x=exp, y=mpg)) + 
  geom_point(alpha=0.55, color="black") +
  stat_smooth(method = "lm", formula = y ~ poly(x, 4), lty = 1, col = "blue",se = F) +
  labs(title = "mpg vs horsepower", subtitle = "Polynomial d = 4") +
  theme_bw()
grid.arrange(p.poly.h, p.polybest.h, ncol=2)

set.seed(123)

# Hasil Piecewise Constant
PieCons <- CVPieCons(Auto_h, 3:10, 10)
PieCons
##   breaks     RMSE      AIC
## 1      3 5.740349 2245.134
## 2      4 4.943248 2143.313
## 3      5 4.688531 2103.087
## 4      6 4.642824 2097.658
## 5      7 4.513490 2081.595
## 6      8 4.394890 2063.486
## 7      9 4.525875 2080.801
## 8     10 4.543401 2084.919
(Best_RMSE <- (PieCons %>% arrange(RMSE))[1,])
##   breaks    RMSE      AIC
## 1      8 4.39489 2063.486
(Best_AIC <- (PieCons %>% arrange(AIC))[1,])
##   breaks    RMSE      AIC
## 1      8 4.39489 2063.486

Berdasarkan nilai RMSE dan AIC terkecil, serta dilihat dari bentuk kurvanya, maka model terbaik adalah Piecewise Constant dengan breaks = 8.

BestPie.h <- PieCons[PieCons$breaks==8,2:3]
BestPie.h
##      RMSE      AIC
## 6 4.39489 2063.486
p.pieconsbest.h <- ggplot(Auto_h, aes(x=exp, y=mpg)) + 
  geom_point(alpha=0.55, color="black") +
  stat_smooth(method = "lm", formula = y ~ cut(x, 8), lty = 1, col = "blue",se = F) +
  labs(title = "mpg vs horsepower", subtitle = "PC breaks = 8") +
  theme_bw()
p.pieconsbest.h

set.seed(123)

# Hasil Natural Cubic Splines
NatCubicSpl <- CVNatCubicSpl(Auto_h, 2:11, 10)
NatCubicSpl
##    df     RMSE      AIC
## 1   2 4.307281 2043.990
## 2   3 4.316587 2046.246
## 3   4 4.291779 2041.820
## 4   5 4.274236 2039.912
## 5   6 4.283307 2039.328
## 6   7 4.250278 2035.696
## 7   8 4.273412 2037.512
## 8   9 4.253688 2036.594
## 9  10 4.225783 2034.172
## 10 11 4.271550 2038.978
(Best_RMSE <- (NatCubicSpl %>% arrange(RMSE))[1,])
##   df     RMSE      AIC
## 1 10 4.225783 2034.172
(Best_AIC <- (NatCubicSpl %>% arrange(AIC))[1,])
##   df     RMSE      AIC
## 1 10 4.225783 2034.172
# Ordinary knots yang ditentukan oleh komputer
attr(ns(Auto_h$exp, df=10),"knots")
##   10%   20%   30%   40%   50%   60%   70%   80%   90% 
##  67.0  72.0  80.0  88.0  93.5 100.0 110.0 140.0 157.7

Berdasarkan nilai RMSE dan AIC terkecil, maka dipilih model Natural Cubic Spline yang paling baik adalah model dengan df = 10 atau knot = (67, 72, 80, 88, 93.5, 100, 110, 140, 157.7).

p.natcubicspl.h <- ggplot(Auto_h, aes(x=exp, y=mpg)) + 
  geom_point(alpha=0.55, color="black") +
  stat_smooth(method = "lm", formula = y ~ ns(x, df = 10), lty = 1, col = "blue",se = F) +
  labs(title = "mpg vs horsepower", subtitle = "Natural Cubic Splines df = 10") +
  theme_bw()
p.natcubicspl.h

Namun, dari hasil visualisasi data, ternyata terdapat bagian kurva yang tampak tidak teratur. Berdasarkan visualisasi model menggunakan knots yang sudah diubah dan dikurangi berdasarkan knots dari df = 10 yang terbaik adalah model dengan knots = c(85.00,105.00,180.00). Kurva yang terbentuk lebih teratur dibandingkan dengan kurva model dengan df = 10.

p.natcubicspl.h1 <- ggplot(Auto_h, aes(x=exp, y=mpg)) + 
  geom_point(alpha=0.55, color="black") +
  stat_smooth(method = "lm", formula = y ~ ns(x, knots = c(80, 100, 110, 157.7)), lty = 1, col = "blue",se = F) +
  labs(title = "mpg vs horsepower", subtitle = "NcS Knots 80, 100, 110, 157.7") +
  theme_bw()
p.natcubicsplbest.h <- ggplot(Auto_h, aes(x=exp, y=mpg)) + 
  geom_point(alpha=0.55, color="black") +
  stat_smooth(method = "lm", formula = y ~ ns(x, knots = c(85.00,105.00,180.00)), lty = 1, col = "blue",se = F) +
  labs(title = "mpg vs horsepower", subtitle = "Knots 85.00,105.00,180.00") +
  theme_bw()
grid.arrange(p.natcubicspl.h1, p.natcubicsplbest.h, ncol=2)

set.seed(123)
cross_valh <- vfold_cv(Auto_h, v=10)
metric_spline3 <- map_dfr(cross_valh$splits, function(x){
      mod <- lm(mpg ~ ns(exp, knots = c(85.00,105.00,180.00)), data=Auto_h[x$in_id,])
      pred <- predict(mod, newdata=Auto_h[-x$in_id,])
      truth <- Auto_h[-x$in_id,]$mpg
      
      RMSE <- mlr3measures::rmse(truth = truth, response = pred)
      #mae <- mlr3measures::mae(truth = truth, response = pred)
      AIC <- AIC(mod)
      #mae <- mlr3measures::mae(truth = data_eval$truth, response = data_eval$pred)
      metric <- c(RMSE,AIC)
      names(metric) <- c("RMSE","AIC")
      return(metric)
    })
(Best3.h <- (mean_metric_spline3 <- colMeans(metric_spline3)))
##        RMSE         AIC 
##    4.318277 2047.706586

mpg vs acceleration

Data

Auto_d <- data.frame()
mpg <- Auto$mpg
exp <- Auto$displacement
Auto_d <- data.frame(mpg,exp)
head(Auto_d)
##   mpg exp
## 1  18 307
## 2  15 350
## 3  18 318
## 4  16 304
## 5  17 302
## 6  15 429

Plot

# Plot Data mpg vs acceleration
plot(Auto$displacement, Auto$mpg, pch=19, col="black",
     xlab="discplacement", ylab="mile per gallon")

Pemodelan

set.seed(123)

# Hasil Polynomial Regression
Polinomial <- CVPoly(Auto_d, 2:10, 10)
Polinomial
##   pangkat     RMSE      AIC
## 1       2 4.268993 2045.484
## 2       3 4.280184 2046.406
## 3       4 4.297514 2048.227
## 4       5 4.313349 2049.587
## 5       6 4.299478 2046.224
## 6       7 4.263486 2039.280
## 7       8 4.212933 2032.596
## 8       9 4.155368 2025.211
## 9      10 4.115678 2020.407
(Best_RMSE <- (Polinomial %>% arrange(RMSE))[1,])
##   pangkat     RMSE      AIC
## 1      10 4.115678 2020.407
(Best_AIC <- (Polinomial %>% arrange(AIC))[1,])
##   pangkat     RMSE      AIC
## 1      10 4.115678 2020.407

Berdasarkan nilai RMSE dan AIC terkecil, maka model Polynomial Regression yang paling baik adalah model dengan derajat 10. Namun terlihat pada plot model dengan derajat 10 terdapat bagian kurva yang tidak teratur. Selain itu, perbedaan RMSE dan AIC antar derajat polinomial yang digunakan tidak jauh berbeda. Oleh karena itu, dipilih model terbaik yang memiliki RMSE dan AIC terkecil dengan bentuk kurva yang teratur, yaitu model dengan derajat 4.

BestPoly.d <- Polinomial[Polinomial$pangkat==4,2:3]
BestPoly.d
##       RMSE      AIC
## 3 4.297514 2048.227
p.polybest.d <- ggplot(Auto_d, aes(x=exp, y=mpg)) + 
  geom_point(alpha=0.55, color="black") +
  stat_smooth(method = "lm", formula = y ~ poly(x, 4), lty = 1, col = "blue",se = F) +
  labs(title = "mpg vs displacement", subtitle = "Polynomial d = 4") +
  theme_bw()
p.poly.d2 <- ggplot(Auto_d, aes(x=exp, y=mpg)) + 
  geom_point(alpha=0.55, color="black") +
  stat_smooth(method = "lm", formula = y ~ poly(x, 5), lty = 1, col = "blue",se = F) +
  labs(title = "mpg vs displacement", subtitle = "Polynomial d = 5") +
  theme_bw()
p.poly.d3 <- ggplot(Auto_d, aes(x=exp, y=mpg)) + 
  geom_point(alpha=0.55, color="black") +
  stat_smooth(method = "lm", formula = y ~ poly(x, 6), lty = 1, col = "blue",se = F) +
  labs(title = "mpg vs displacement", subtitle = "Polynomial d = 6") +
  theme_bw()
grid.arrange(p.polybest.d, p.poly.d2, p.poly.d3, ncol=3)

set.seed(123)

# Hasil Piecewise Constant
PieCons <- CVPieCons(Auto_d, 3:10, 10)
PieCons
##   breaks     RMSE      AIC
## 1      3 4.852760 2134.269
## 2      4 4.773902 2129.232
## 3      5 4.601289 2101.715
## 4      6 4.538270 2097.144
## 5      7 4.523365 2087.522
## 6      8 4.342332 2062.307
## 7      9 4.209561 2032.645
## 8     10 4.262702 2035.780
(Best_RMSE <- (PieCons %>% arrange(RMSE))[1,])
##   breaks     RMSE      AIC
## 1      9 4.209561 2032.645
(Best_AIC <- (PieCons %>% arrange(AIC))[1,])
##   breaks     RMSE      AIC
## 1      9 4.209561 2032.645

Berdasarkan nilai RMSE dan AIC terkecil, serta dilihat dari bentuk kurvanya, maka model terbaik adalah Piecewise Constant dengan breaks = 9.

BestPie.d <- PieCons[PieCons$breaks==9,2:3]
BestPie.d
##       RMSE      AIC
## 7 4.209561 2032.645
p.pieconsbest.d <- ggplot(Auto_d, aes(x=exp, y=mpg)) + 
  geom_point(alpha=0.55, color="black") +
  stat_smooth(method = "lm", formula = y ~ cut(x, 9), lty = 1, col = "blue",se = F) +
  labs(title = "mpg vs displacement", subtitle = "PC breaks = 9") +
  theme_bw()
p.pieconsbest.d

set.seed(123)

# Hasil Natural Cubic Splines
NatCubicSpl <- CVNatCubicSpl(Auto_d, 2:15, 10)
NatCubicSpl
##    df     RMSE      AIC
## 1   2 4.265964 2044.248
## 2   3 4.284071 2045.618
## 3   4 4.319907 2047.561
## 4   5 4.296884 2041.463
## 5   6 4.213630 2028.309
## 6   7 4.160873 2023.425
## 7   8 4.156561 2022.296
## 8   9 4.129745 2018.862
## 9  10 4.126147 2016.601
## 10 11 4.083069 2009.308
## 11 12 4.061449 2008.255
## 12 13 4.077891 2011.270
## 13 14 4.111637 2014.261
## 14 15 4.119116 2012.548
(Best_RMSE <- (NatCubicSpl %>% arrange(RMSE))[1,])
##   df     RMSE      AIC
## 1 12 4.061449 2008.255
(Best_AIC <- (NatCubicSpl %>% arrange(AIC))[1,])
##   df     RMSE      AIC
## 1 12 4.061449 2008.255
# Ordinary knots yang ditentukan oleh komputer
attr(ns(Auto_d$exp, df=12),"knots")
## 8.333333% 16.66667%       25% 33.33333% 41.66667%       50% 58.33333% 66.66667% 
##   89.0000   97.0000  105.0000  119.0000  130.9167  151.0000  200.0000  232.0000 
##       75% 83.33333% 91.66667% 
##  275.7500  318.0000  351.0000

Berdasarkan nilai RMSE dan AIC terkecil, maka dipilih model Natural Cubic Spline yang paling baik adalah model dengan df = 12 atau knot = (89.0000, 97.0000, 105.0000, 119.0000, 130.9167, 151.0000, 200.0000, 232.0000, 275.7500, 318.0000, 351.0000).

p.natcubicspl.d <- ggplot(Auto_d, aes(x=exp, y=mpg)) + 
  geom_point(alpha=0.55, color="black") +
  stat_smooth(method = "lm", formula = y ~ ns(x, df = 12), lty = 1, col = "blue",se = F) +
  labs(title = "mpg vs displacement", subtitle = "Natural Cubic Splines df = 12") +
  theme_bw()
p.natcubicspl.d

Namun, dari hasil visualisasi data, ternyata terdapat bagian kurva yang tampak tidak teratur. Berdasarkan visualisasi model menggunakan knots = c(120,140,231.15,400), kurva yang terbentuk lebih teratur dibandingkan dengan kurva model dengan df = 10. Jadi, model terbaik yang dipilih adalah model dengan knots = c(120,140,231.15,400).

p.natcubicspl.d1 <- ggplot(Auto_d, aes(x=exp, y=mpg)) + 
  geom_point(alpha=0.55, color="black") +
  stat_smooth(method = "lm", formula = y ~ ns(x, knots = c(120,140,151,351)), lty = 1, col = "blue",se = F) +
  labs(title = "mpg vs displacement", subtitle = "NCS Knots 120, 140, 151, 351") +
  theme_bw()
p.natcubicsplbest.d <- ggplot(Auto_d, aes(x=exp, y=mpg)) + 
  geom_point(alpha=0.55, color="black") +
  stat_smooth(method = "lm", formula = y ~ ns(x, knots = c(120,140,231.15,400)), lty = 1, col = "blue",se = F) +
  labs(title = "mpg vs displacement", subtitle = "NCS Knots 120 140 231.15 400") +
  theme_bw()
grid.arrange(p.natcubicspl.d1, p.natcubicsplbest.d, ncol=2)

set.seed(123)
cross_vald <- vfold_cv(Auto_d, v=10)
metric_spline3 <- map_dfr(cross_vald$splits, function(x){
      mod <- lm(mpg ~ ns(exp, knots = c(120,140,231.15,400)), data=Auto_d[x$in_id,])
      pred <- predict(mod, newdata=Auto_d[-x$in_id,])
      truth <- Auto_d[-x$in_id,]$mpg
      
      RMSE <- mlr3measures::rmse(truth = truth, response = pred)
      #mae <- mlr3measures::mae(truth = truth, response = pred)
      AIC <- AIC(mod)
      #mae <- mlr3measures::mae(truth = data_eval$truth, response = data_eval$pred)
      metric <- c(RMSE,AIC)
      names(metric) <- c("RMSE","AIC")
      return(metric)
    })
(Best3.d <- (mean_metric_spline3 <- colMeans(metric_spline3)))
##       RMSE        AIC 
##    4.31826 2048.79815

mpg vs weight

Data

Auto_w <- data.frame()
mpg <- Auto$mpg
exp <- Auto$weight
Auto_w <- data.frame(mpg,exp)
head(Auto_w)
##   mpg  exp
## 1  18 3504
## 2  15 3693
## 3  18 3436
## 4  16 3433
## 5  17 3449
## 6  15 4341

Plot

# Plot Data mpg vs weight
plot(Auto$weight, Auto$mpg, pch=19, col="black",
     xlab="weight", ylab="mile per gallon")

Pemodelan

set.seed(123)

# Hasil Polynomial Regression
Polinomial <- CVPoly(Auto_w, 2:8, 10)
Polinomial
##   pangkat     RMSE      AIC
## 1       2 4.083225 2014.527
## 2       3 4.086215 2016.491
## 3       4 4.096702 2017.995
## 4       5 4.110388 2019.112
## 5       6 4.121985 2020.423
## 6       7 4.115721 2020.775
## 7       8 4.125211 2022.722
(Best_RMSE <- (Polinomial %>% arrange(RMSE))[1,])
##   pangkat     RMSE      AIC
## 1       2 4.083225 2014.527
(Best_AIC <- (Polinomial %>% arrange(AIC))[1,])
##   pangkat     RMSE      AIC
## 1       2 4.083225 2014.527

Berdasarkan nilai RMSE dan AIC terkecil serta hasil visualisasi model polinomial dengan derajat 2, maka model Polynomial Regression yang paling baik adalah model dengan derajat 2.

BestPoly.w <- Polinomial[Polinomial$pangkat==2,2:3]
BestPoly.w
##       RMSE      AIC
## 1 4.083225 2014.527
p.polybest.w <- ggplot(Auto_w, aes(x=exp, y=mpg)) + 
  geom_point(alpha=0.55, color="black") +
  stat_smooth(method = "lm", formula = y ~ poly(x, 2), lty = 1, col = "blue",se = F) +
  labs(title = "mpg vs weight", subtitle = "Polynomial d = 2") +
  theme_bw()
p.polybest.w

set.seed(123)

# Hasil Piecewise Constant
PieCons <- CVPieCons(Auto_w, 3:10, 10)
PieCons
##   breaks     RMSE      AIC
## 1      3 4.831174 2122.146
## 2      4 4.668478 2096.702
## 3      5 4.356145 2055.344
## 4      6 4.172338 2023.892
## 5      7 4.246013 2038.817
## 6      8 4.312464 2053.980
## 7      9 4.263321 2042.495
## 8     10 4.261454 2047.200
(Best_RMSE <- (PieCons %>% arrange(RMSE))[1,])
##   breaks     RMSE      AIC
## 1      6 4.172338 2023.892
(Best_AIC <- (PieCons %>% arrange(AIC))[1,])
##   breaks     RMSE      AIC
## 1      6 4.172338 2023.892

Berdasarkan nilai RMSE dan AIC terkecil, serta dilihat dari bentuk kurvanya, maka model terbaik adalah Piecewise Constant dengan breaks = 6.

BestPie.w <- PieCons[PieCons$breaks==6,2:3]
BestPie.w
##       RMSE      AIC
## 4 4.172338 2023.892
p.pieconsbest.w <- ggplot(Auto_w, aes(x=exp, y=mpg)) + 
  geom_point(alpha=0.55, color="black") +
  stat_smooth(method = "lm", formula = y ~ cut(x, 6), lty = 1, col = "blue",se = F) +
  labs(title = "mpg vs weight", subtitle = "PC breaks = 6") +
  theme_bw()
p.pieconsbest.w

set.seed(123)

# Hasil Natural Cubic Splines
NatCubicSpl <- CVNatCubicSpl(Auto_w, 2:10, 10)
NatCubicSpl
##   df     RMSE      AIC
## 1  2 4.085148 2014.337
## 2  3 4.088267 2016.159
## 3  4 4.112470 2017.890
## 4  5 4.117601 2019.397
## 5  6 4.109858 2016.819
## 6  7 4.094677 2013.657
## 7  8 4.105949 2017.100
## 8  9 4.114397 2018.312
## 9 10 4.114696 2017.134
(Best_RMSE <- (NatCubicSpl %>% arrange(RMSE))[1,])
##   df     RMSE      AIC
## 1  2 4.085148 2014.337
(Best_AIC <- (NatCubicSpl %>% arrange(AIC))[1,])
##   df     RMSE      AIC
## 1  7 4.094677 2013.657
# Ordinary knots yang ditentukan oleh komputer
attr(ns(Auto_w$exp, df=7),"knots")
## 14.28571% 28.57143% 42.85714% 57.14286% 71.42857% 85.71429% 
##  2074.857  2285.429  2635.000  2986.571  3446.143  4096.286

Berdasarkan nilai RMSE dan AIC terkecil, maka dipilih model Natural Cubic Spline yang paling baik adalah model dengan df = 7 atau knot = (2074.857, 2285.429, 2635.000, 2986.571, 3446.143, 4096.286).

p.natcubicspl.w <- ggplot(Auto_w, aes(x=exp, y=mpg)) + 
  geom_point(alpha=0.55, color="black") +
  stat_smooth(method = "lm", formula = y ~ ns(x, df = 7), lty = 1, col = "blue",se = F) +
  labs(title = "mpg vs weight", subtitle = "NCS df = 7") +
  theme_bw()
p.natcubicspl.w

Namun, dari hasil visualisasi data, ternyata terdapat bagian kurva yang tampak tidak teratur. Berdasarkan visualisasi model menggunakan knots = c(120,140,231.15,400), kurva yang terbentuk lebih teratur dibandingkan dengan kurva model dengan df = 10. Jadi, model terbaik yang dipilih adalah model dengan knots = c(120,140,231.15,400).

p.natcubicsplbest.w <- ggplot(Auto_w, aes(x=exp, y=mpg)) + 
  geom_point(alpha=0.55, color="black") +
  stat_smooth(method = "lm", formula = y ~ ns(x, knots = c(2285.429, 2635.000, 2986.571, 3446.143, 4096.286)), lty = 1, col = "blue",se = F) +
  labs(title = "mpg vs weight", subtitle = "NCS Knots 2285.429, 2635.000, 2986.571, 3446.143, 4096.286") +
  theme_bw()
p.natcubicsplbest.w

set.seed(123)
cross_valw <- vfold_cv(Auto_w, v=10)
metric_spline3 <- map_dfr(cross_valw$splits, function(x){
      mod <- lm(mpg ~ ns(exp, knots = c(2285.429, 2635.000, 2986.571, 3446.143, 4096.286)), data=Auto_w[x$in_id,])
      pred <- predict(mod, newdata=Auto_w[-x$in_id,])
      truth <- Auto_w[-x$in_id,]$mpg
      
      RMSE <- mlr3measures::rmse(truth = truth, response = pred)
      #mae <- mlr3measures::mae(truth = truth, response = pred)
      AIC <- AIC(mod)
      #mae <- mlr3measures::mae(truth = data_eval$truth, response = data_eval$pred)
      metric <- c(RMSE,AIC)
      names(metric) <- c("RMSE","AIC")
      return(metric)
    })
(Best3.w <- (mean_metric_spline3 <- colMeans(metric_spline3)))
##        RMSE         AIC 
##    4.126551 2021.601414

Perbandingan Model

mpg vs horsepower

grid.arrange(p.polybest.h, p.pieconsbest.h, p.natcubicsplbest.h, ncol=3)

data.h <- rbind(BestPoly.h,BestPie.h,Best3.h)
data.h$nama_metode <- c("Polynomial d = 4", "PC breaks = 8", "NCS Knots = 85 105 180")
data.h %>% arrange(RMSE)
##        RMSE      AIC            nama_metode
## 31 4.318277 2047.707 NCS Knots = 85 105 180
## 3  4.327918 2049.059       Polynomial d = 4
## 6  4.394890 2063.486          PC breaks = 8

mpg vs displacement

grid.arrange(p.polybest.d, p.pieconsbest.d, p.natcubicsplbest.d, ncol=3)

data.d <- rbind(BestPoly.d,BestPie.d,Best3.d)
data.d$nama_metode <- c("Polynomial d = 4", "PC breaks = 9", "NCS Knots = 120 140 231.15 400")
data.d %>% arrange(RMSE)
##        RMSE      AIC                    nama_metode
## 7  4.209561 2032.645                  PC breaks = 9
## 3  4.297514 2048.227               Polynomial d = 4
## 31 4.318260 2048.798 NCS Knots = 120 140 231.15 400

mpg vs weight

grid.arrange(p.polybest.w, p.pieconsbest.w, p.natcubicsplbest.w, ncol=3)

data.w <- rbind(BestPoly.w,BestPie.w,Best3.w)
data.w$nama_metode <- c("Polynomial d = 2", "PC breaks = 6", "NCS Knots 2285.429, 2635.000, 2986.571, 3446.143, 4096.286")
data.w %>% arrange(RMSE)
##       RMSE      AIC                                                nama_metode
## 1 4.083225 2014.527                                           Polynomial d = 2
## 2 4.126551 2021.601 NCS Knots 2285.429, 2635.000, 2986.571, 3446.143, 4096.286
## 3 4.172338 2023.892                                              PC breaks = 6

Kesimpulan

Terdapat pola hubungan non linier antara mpg dengan horsepower, displacement, dan weight dilihat dari beberapa grafik sebelumnya. Model terbaik untuk masing-masing hubungan antara mpg vs horsepower, mpg vs displacement, mpg vs displacement secara berurutan adalah model natural cubic splines dengan knots 85 105 180, piecewise constant dengan breaks 9, dan polynomial dengan derajat 2.

grid.arrange(p.natcubicsplbest.h, p.pieconsbest.d, p.polybest.w, ncol=3)

```