SOAL

Lakukan cross validation untuk menghasilkan pemodelan mpg vs horsepower yang optimal menggunakan 3 metode berikut, kemudian bandingkan hasilnya.

Lakukan hal ditas untuk masing-masing subset data berdasarkan asal negara produsennya (Amerika, Eropa, Jepang). Plot model terbaik dalam satu frame.

Berikan ulasan terhadap hasil anda.

Susun bahan presentasi (dengan durasi 5-7 menit), untuk dipresentasikan minggu depan. Presenter akan diacak, sehingga semua orang harus siap presentasi.

Library

library(ISLR)
library(rsample)
library(splines)
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.1.2
## Warning: package 'tibble' was built under R version 4.1.2
## Warning: package 'tidyr' was built under R version 4.1.2
## Warning: package 'readr' was built under R version 4.1.2
## Warning: package 'purrr' was built under R version 4.1.2
## Warning: package 'dplyr' was built under R version 4.1.2
## Warning: package 'forcats' was built under R version 4.1.2
## Warning: package 'lubridate' was built under R version 4.1.2
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.2     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.4.4     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggpubr)
## Warning: package 'ggpubr' was built under R version 4.1.2
library(DT)
library(ggplot2)
library(cowplot)
## 
## Attaching package: 'cowplot'
## 
## The following object is masked from 'package:ggpubr':
## 
##     get_legend
## 
## The following object is masked from 'package:lubridate':
## 
##     stamp
attach(Auto)
## The following object is masked from package:lubridate:
## 
##     origin
## 
## The following object is masked from package:ggplot2:
## 
##     mpg
library(patchwork)
## 
## Attaching package: 'patchwork'
## 
## The following object is masked from 'package:cowplot':
## 
##     align_plots

Eksplorasi Data

Data Keseluruhan

Data yang digunakan adalah Auto dari package ISLR yang terdiri dari 392 observasi dan 9 variabel. Namun, variabel yang digunakan kali ini hanya 3, yaitu:

mpg : miles per gallon

horsepower: Engine horsepower

origin: origin of car (1. American, 2. European, 3. Japanese)

Auto = Auto %>%  select(mpg, horsepower, origin)
datatable(Auto)
Auto$origin <- factor(Auto$origin, levels = c('1', '2', '3'))
ggplot(Auto, aes(x = horsepower, y = mpg, color = origin)) +
  geom_point(alpha = 0.55) +
  theme_bw() +
  scale_color_manual(values = c("1" = "red", "2" = "blue", "3" = "green")) +
  labs(
    title = "Scatter Plot of `mpg` and `horsepower`",
    x = "Horse Power",
    y = "Mile per Gallon"
  )

Plot Pemodelan mpg vs horsepower

ggplot(Auto, aes(x = horsepower, y = mpg, color = factor(origin))) +
  geom_point(alpha = 0.55) +
  stat_smooth(method = "lm", formula = y ~ x, lty = 1, col = "black", se = FALSE) +
  theme_bw() +
  scale_color_manual(values = c("1" = "red", "2" = "blue", "3" = "green"))

Data Masing-Masing Negara

Amerika

AutoAmerika = Auto[Auto$origin==1,] %>%  select(mpg, horsepower, origin)
datatable(AutoAmerika)
ggplot(AutoAmerika,aes(x=horsepower, y=mpg)) + 
  geom_point(alpha=0.55, color="red") +
  theme_bw() +
   labs(y="Mile per Gallon", x="Horse Power", title = "Scatter Plot `mpg` dan `horsepower`",
        subtitle = "Amerika")

Eropa

AutoEropa = Auto[Auto$origin==2,] %>%  select(mpg, horsepower, origin)
datatable(AutoEropa)
ggplot(AutoEropa,aes(x=horsepower, y=mpg)) + 
  geom_point(alpha=0.55, color="blue") +
  theme_bw() +
   labs(y="Mile per Gallon", x="Horse Power", title = "Scatter Plot `mpg` dan `horsepower`",
        subtitle = "Eropa")

Jepang

AutoJepang = Auto[Auto$origin==3,] %>%  select(mpg, horsepower, origin)
datatable(AutoJepang)
ggplot(AutoJepang,aes(x=horsepower, y=mpg)) + 
  geom_point(alpha=0.55, color="green") +
  theme_bw() +
   labs(y="Mile per Gallon", x="Horse Power", title = "Scatter Plot `mpg` dan `horsepower`",
        subtitle = "Jepang")

plot1 <- ggplot(AutoAmerika, aes(x = horsepower, y = mpg)) + 
  geom_point(alpha = 0.55, color = "red") +
  theme_bw() +
  labs(y = "Mile per Gallon", x = "Horse Power", title = "Scatter Plot",
       subtitle = "Amerika")

plot2 <- ggplot(AutoEropa, aes(x = horsepower, y = mpg)) + 
  geom_point(alpha = 0.55, color = "blue") +
  theme_bw() +
  labs(y = "Mile per Gallon", x = "Horse Power", title = "Scatter Plot",
       subtitle = "Eropa")

plot3 <- ggplot(AutoJepang, aes(x = horsepower, y = mpg)) + 
  geom_point(alpha = 0.55, color = "green") +
  theme_bw() +
  labs(y = "Mile per Gallon", x = "Horse Power", title = "Scatter Plot",
       subtitle = "Jepang")

combined_plot <- plot1 + plot2 + plot3
combined_plot

Metode Analisis

Data Keseluruhan

Regresi Polinomial

Cross Validation

set.seed(123)
cross_val <- vfold_cv(Auto,v=10,strata = "mpg")

ordo <- 2:10

best_poly <- map_dfr(ordo, function(i) {
  metric_poly <- map_dfr(cross_val$splits,
                            function(x) {
                              mod <- lm(mpg ~ poly(horsepower,ordo=i), data=Auto[x$in_id,])
                              pred <- predict(mod, newdata=Auto[-x$in_id,])
                              truth <- Auto[-x$in_id,]$mpg
                              rmse <- mlr3measures::rmse(truth = truth, response = pred)
                              mae <- mlr3measures::mae(truth = truth, response = pred)
                              metric <- c(rmse,mae)
                              names(metric) <- c("rmse","mae")
                              return(metric)
                              }
                            )
  metric_poly

# menghitung rata-rata untuk 10 folds
  mean_metric_poly <- colMeans(metric_poly)
  mean_metric_poly
  }
)

best_poly <- cbind(ordo=ordo,best_poly)
best_poly
##   ordo     rmse      mae
## 1    2 4.351129 3.266460
## 2    3 4.355430 3.266945
## 3    4 4.367009 3.280452
## 4    5 4.318767 3.249003
## 5    6 4.317431 3.257272
## 6    7 4.293242 3.233376
## 7    8 4.309511 3.251758
## 8    9 4.323321 3.244919
## 9   10 4.371903 3.274887

Berdasarkan rmse dan mae

best_poly %>% slice_min(rmse)
##   ordo     rmse      mae
## 1    7 4.293242 3.233376
best_poly %>% slice_min(rmse)
##   ordo     rmse      mae
## 1    7 4.293242 3.233376

Plot CV

best_poly$ordo <- as.factor(best_poly$ordo)
ggplot(data=best_poly, aes(x=ordo, y=rmse, group=1)) +
  geom_line(linetype = "dashed")+
  geom_point()+
  ggtitle("K-Fold Cross Validation") +
  xlab("Degree of Polynomial") + 
  ylab("Root Mean Square Error")

Plot

Auto$origin <- factor(Auto$origin, levels = c('1', '2', '3'))

A <- ggplot(Auto, aes(x = horsepower, y = mpg, color = origin)) + 
  geom_point(alpha = 0.55) +
  stat_smooth(method = "lm", 
              formula = y ~ poly(x, 2, raw = TRUE), 
              lty = 1, col = "black", se = FALSE) +
  theme_bw() +
  scale_color_manual(values = c("1" = "red", "2" = "blue", "3" = "green")) +
  ggtitle("Regresi Polinomial Ordo 2") +
  xlab("Horse Power") + 
  ylab("Mile per Gallon")

B <- ggplot(Auto, aes(x = horsepower, y = mpg, color = origin)) + 
  geom_point(alpha = 0.55) +
  stat_smooth(method = "lm", 
              formula = y ~ poly(x, 7, raw = TRUE), 
              lty = 1, col = "black", se = FALSE) +
  theme_bw() +
  scale_color_manual(values = c("1" = "red", "2" = "blue", "3" = "green")) +
  ggtitle("Regresi Polinomial Ordo 7") +
  xlab("Horse Power") + 
  ylab("Mile per Gallon")

ggarrange(A, B, ncol = 2, nrow = 1)

Summary Model

polinom = lm(mpg ~ poly(horsepower,2))
summary(polinom)
## 
## Call:
## lm(formula = mpg ~ poly(horsepower, 2))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -14.7135  -2.5943  -0.0859   2.2868  15.8961 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)            23.4459     0.2209  106.13   <2e-16 ***
## poly(horsepower, 2)1 -120.1377     4.3739  -27.47   <2e-16 ***
## poly(horsepower, 2)2   44.0895     4.3739   10.08   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.374 on 389 degrees of freedom
## Multiple R-squared:  0.6876, Adjusted R-squared:  0.686 
## F-statistic:   428 on 2 and 389 DF,  p-value: < 2.2e-16

Piecewise Constant

Cross Validation

set.seed(123)
cross_val <- vfold_cv(Auto,v=10,strata = "mpg")

breaks <- 2:10

best_tangga <- map_dfr(breaks, function(i){
  metric_tangga <- map_dfr(cross_val$splits,
                           function(x){
                             training <- Auto[x$in_id,]
                             training$horsepower <- cut(training$horsepower,i)
                             mod <- lm(mpg ~ horsepower, data=training)
                             labs_x <- levels(mod$model[,2])
                             labs_x_breaks <- cbind(lower = as.numeric( sub("\\((.+),.*", "\\1", labs_x) ),
                                                    upper = as.numeric( sub("[^,]*,([^]]*)\\]", "\\1", labs_x)))
                             testing <- Auto[-x$in_id,]
                             horsepower_new <- cut(testing$horsepower,c(labs_x_breaks[1,1],labs_x_breaks[,2]))
                             pred <- predict(mod,   newdata=list(horsepower=horsepower_new))
                             truth <- testing$mpg
                             data_eval <- na.omit(data.frame(truth,pred))
                             rmse <- mlr3measures::rmse(truth = data_eval$truth,response = data_eval$pred)
                             mae <- mlr3measures::mae(truth = data_eval$truth,response = data_eval$pred)
                             metric <- c(rmse,mae)
                             names(metric) <- c("rmse","mae")
                             return(metric)
                             }
                           )

metric_tangga

# menghitung rata-rata untuk 10 folds
mean_metric_tangga <- colMeans(metric_tangga)

mean_metric_tangga
})
best_tangga <- cbind(breaks=breaks,best_tangga)
best_tangga
##   breaks     rmse      mae
## 1      2 6.147626 4.790234
## 2      3 5.775792 4.521829
## 3      4 4.985270 3.774549
## 4      5 4.712623 3.571614
## 5      6 4.644383 3.548375
## 6      7 4.554116 3.379980
## 7      8 4.437597 3.405433
## 8      9 4.549668 3.471999
## 9     10 4.589172 3.455531

Berdasarkan rmse dan mae

best_tangga %>% slice_min(rmse)
##   breaks     rmse      mae
## 1      8 4.437597 3.405433
best_tangga %>% slice_min(mae)
##   breaks     rmse     mae
## 1      7 4.554116 3.37998

Plot CV

# Assuming best_tangga$breaks is already converted to a factor
best_tangga$breaks <- as.factor(best_tangga$breaks)

# First plot
plot_rmse <- ggplot(data = best_tangga, aes(x = breaks, y = rmse, group = 1)) +
  geom_line(linetype = "dashed") +
  geom_point() +
  ggtitle("K-Fold Cross Validation - RMSE") +
  xlab("Breaks") + 
  ylab("Root Mean Square Error")

# Second plot
plot_mae <- ggplot(data = best_tangga, aes(x = breaks, y = mae, group = 1)) +
  geom_line(linetype = "dashed") +
  geom_point() +
  ggtitle("K-Fold Cross Validation - MAE") +
  xlab("Breaks") + 
  ylab("MAE")

# Combine the plots into one row
combined_plot <- plot_rmse + plot_mae

# Display the combined plot
combined_plot

Plot

pc7 <- ggplot(Auto, aes(x = horsepower, y = mpg, color = origin)) +
  geom_point(alpha = 0.55) +
  stat_smooth(method = "lm", 
              formula = y ~ cut(x, 7), 
              lty = 1, col = "black", se = FALSE) +
  theme_bw() +
  scale_color_manual(values = c("1" = "red", "2" = "blue", "3" = "green")) +
  ggtitle("K=7") +
  xlab("Horse Power") + 
  ylab("Mile per Gallon")

pc8 <- ggplot(Auto, aes(x = horsepower, y = mpg, color = origin)) +
  geom_point(alpha = 0.55) +
  stat_smooth(method = "lm", 
              formula = y ~ cut(x, 8), 
              lty = 1, col = "black", se = FALSE) +
  theme_bw() +
  scale_color_manual(values = c("1" = "red", "2" = "blue", "3" = "green")) +
  ggtitle("K=8") +
  xlab("Horse Power") + 
  ylab("Mile per Gallon")

ggarrange(pc7, pc8, ncol = 2, nrow = 1)

Summary Model

tangga = lm(mpg ~ cut(horsepower,7))
summary(tangga)
## 
## Call:
## lm(formula = mpg ~ cut(horsepower, 7))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -17.5280  -2.5530  -0.3758   2.2881  16.7482 
## 
## Coefficients:
##                               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                    32.5280     0.5069   64.17   <2e-16 ***
## cut(horsepower, 7)(72.3,98.6]  -6.8162     0.6358  -10.72   <2e-16 ***
## cut(horsepower, 7)(98.6,125]  -12.1523     0.7591  -16.01   <2e-16 ***
## cut(horsepower, 7)(125,151]   -16.5763     0.7957  -20.83   <2e-16 ***
## cut(horsepower, 7)(151,177]   -18.5020     1.0831  -17.08   <2e-16 ***
## cut(horsepower, 7)(177,204]   -19.4447     1.4187  -13.71   <2e-16 ***
## cut(horsepower, 7)(204,230]   -19.6280     1.5375  -12.77   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.59 on 385 degrees of freedom
## Multiple R-squared:  0.6594, Adjusted R-squared:  0.6541 
## F-statistic: 124.2 on 6 and 385 DF,  p-value: < 2.2e-16

Natural Cubic Splines

Cross Validation

set.seed(123)
cross_val <- vfold_cv(Auto,v=10,strata = "mpg")

df <- 2:15

best_spline3 <- map_dfr(df, function(i) {
  metric_spline3 <- map_dfr(cross_val$splits,
                            function(x) {
                              mod <- lm(mpg ~ ns(horsepower,df=i), data=Auto[x$in_id,])
                              pred <- predict(mod, newdata=Auto[-x$in_id,])
                              truth <- Auto[-x$in_id,]$mpg
                              rmse <- mlr3measures::rmse(truth = truth, response = pred)
                              mae <- mlr3measures::mae(truth = truth, response = pred)
                              metric <- c(rmse,mae)
                              names(metric) <- c("rmse","mae")
                              return(metric)
                              }
                            )
  metric_spline3

# menghitung rata-rata untuk 10 folds
  mean_metric_spline3 <- colMeans(metric_spline3)
  mean_metric_spline3
  }
)

best_spline3 <- cbind(df=df,best_spline3)
best_spline3
##    df     rmse      mae
## 1   2 4.330108 3.252908
## 2   3 4.348006 3.261538
## 3   4 4.329461 3.257926
## 4   5 4.313312 3.246412
## 5   6 4.310011 3.237990
## 6   7 4.294671 3.229174
## 7   8 4.293369 3.232221
## 8   9 4.281877 3.211860
## 9  10 4.260316 3.192010
## 10 11 4.288667 3.220958
## 11 12 4.285063 3.192363
## 12 13 4.302071 3.220998
## 13 14 4.322821 3.244848
## 14 15 4.305928 3.250204

Berdasarkan rmse dan mae

best_spline3 %>% slice_min(rmse)
##   df     rmse     mae
## 1 10 4.260316 3.19201
best_spline3 %>% slice_min(mae)
##   df     rmse     mae
## 1 10 4.260316 3.19201

Plot CV

best_spline3$df <- as.factor(best_spline3$df)
ggplot(data=best_spline3, aes(x=df, y=rmse, group=1)) +
  geom_line(linetype = "dashed")+
  geom_point()+
  ggtitle("K-Fold Cross Validation") +
  xlab("DF") + ylab("Root Mean Square Error")

Knots

attr(ns(Auto$horsepower, df=10),"knots")
##   10%   20%   30%   40%   50%   60%   70%   80%   90% 
##  67.0  72.0  80.0  88.0  93.5 100.0 110.0 140.0 157.7

Plot

Auto$origin <- factor(Auto$origin, levels = c('1', '2', '3'))

ncs <- ggplot(Auto, aes(x = horsepower, y = mpg, color = origin)) +
  geom_point(alpha = 0.55) +
  stat_smooth(method = "lm", 
              formula = y ~ ns(x, df = 10), 
              lty = 1, col = "black", se = FALSE) +
  theme_bw() +
  ggtitle("DF = 10") +
  xlab("Horse Power") + 
  ylab("Mile per Gallon") +
  geom_vline(xintercept = c(67, 72, 80, 88, 93.5, 100, 110, 140, 157.7), col = "black", lty = 2)

ncs

Summary Model

regspline = lm(mpg ~ ns(horsepower, df=10))
summary(regspline)
## 
## Call:
## lm(formula = mpg ~ ns(horsepower, df = 10))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -14.5003  -2.4956  -0.2086   2.2577  14.6400 
## 
## Coefficients:
##                           Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                 32.462      1.680  19.324  < 2e-16 ***
## ns(horsepower, df = 10)1    -4.600      1.789  -2.572 0.010495 *  
## ns(horsepower, df = 10)2    -3.007      2.555  -1.177 0.239996    
## ns(horsepower, df = 10)3    -8.663      2.043  -4.239 2.82e-05 ***
## ns(horsepower, df = 10)4    -7.170      2.188  -3.278 0.001142 ** 
## ns(horsepower, df = 10)5   -14.722      2.125  -6.927 1.84e-11 ***
## ns(horsepower, df = 10)6    -8.446      2.467  -3.424 0.000684 ***
## ns(horsepower, df = 10)7   -16.947      2.080  -8.148 5.38e-15 ***
## ns(horsepower, df = 10)8   -21.715      1.891 -11.483  < 2e-16 ***
## ns(horsepower, df = 10)9   -14.384      4.116  -3.494 0.000531 ***
## ns(horsepower, df = 10)10  -22.326      1.946 -11.476  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.24 on 381 degrees of freedom
## Multiple R-squared:  0.7124, Adjusted R-squared:  0.7049 
## F-statistic: 94.39 on 10 and 381 DF,  p-value: < 2.2e-16

Komparasi Model

Berdasarkan Nilai RMSE dan MAE

nilai_metric <- rbind(best_poly %>% select(-1) %>% slice_min(rmse),
                      best_tangga %>% select(-1) %>% slice_min(rmse),
                      best_spline3 %>%  select(-1)%>% slice_min(rmse)
                      )

nama_model <- c("Polinomial",
                "Piecewise Constant",
                "Natural Cubic Splines")

komparasi <- data.frame(nama_model,nilai_metric)
komparasi
##              nama_model     rmse      mae
## 1            Polinomial 4.293242 3.233376
## 2    Piecewise Constant 4.437597 3.405433
## 3 Natural Cubic Splines 4.260316 3.192010

Berdasarkan Plot

library(gridExtra)
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
# Assuming 'origin' is a factor variable
Auto$origin <- factor(Auto$origin, levels = c('1', '2', '3'))

# Create the poly2 plot without origin legend
poly2 <- ggplot(Auto, aes(x = horsepower, y = mpg, color = origin)) +
  geom_point(alpha = 0.55) +
  stat_smooth(method = "lm", 
              formula = y ~ poly(x, 2, raw = TRUE), 
              lty = 1, col = "black", se = FALSE) +
  theme_bw() +
  ggtitle("poly2") +
  guides(color = FALSE)
## Warning: The `<scale>` argument of `guides()` cannot be `FALSE`. Use "none" instead as
## of ggplot2 3.3.4.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# Create the pc7 plot without origin legend
pc7 <- ggplot(Auto, aes(x = horsepower, y = mpg, color = origin)) +
  geom_point(alpha = 0.55) +
  stat_smooth(method = "lm", 
              formula = y ~ cut(x, 7), 
              lty = 1, col = "black", se = FALSE) +
  theme_bw() +
  ggtitle("pc7") +
  guides(color = FALSE)

# Create the nc10 plot without origin legend
nc10 <- ggplot(Auto, aes(x = horsepower, y = mpg, color = origin)) +
  geom_point(alpha = 0.55) +
  stat_smooth(method = "lm", 
              formula = y ~ ns(x, knots = c(67, 72, 80, 88, 93.5, 100, 110, 140, 157.7)), 
              col = "black", se = FALSE) +
  theme_bw() +
  ggtitle("nc10") +
  guides(color = FALSE)

# Arrange the plots in a grid
grid.arrange(poly2, pc7, nc10, ncol = 3)

Berdasarkan rmse dan mae

Data Amerika

Regresi Polinomial

Cross Validation

set.seed(123)
cross_val_amerika <- vfold_cv(AutoAmerika,v=10,strata = "mpg")

ordo <- 2:10

best_poly_amerika <- map_dfr(ordo, function(i) {
  metric_poly_amerika <- map_dfr(cross_val_amerika$splits,
                            function(x) {
                              mod <- lm(mpg ~ poly(horsepower,ordo=i),
                                        data=AutoAmerika[x$in_id,])
                              pred <- predict(mod, newdata=AutoAmerika[-x$in_id,])
                              truth <- AutoAmerika[-x$in_id,]$mpg
                              rmse <- mlr3measures::rmse(truth = truth, response = pred)
                              mae <- mlr3measures::mae(truth = truth, response = pred)
                              metric <- c(rmse,mae)
                              names(metric) <- c("rmse","mae")
                              return(metric)
                              }
                            )
  metric_poly_amerika

# menghitung rata-rata untuk 10 folds
  mean_metric_poly_amerika <- colMeans(metric_poly_amerika)
  mean_metric_poly_amerika
  }
)

best_poly_amerika <- cbind(ordo=ordo,best_poly_amerika)
best_poly_amerika
##   ordo     rmse      mae
## 1    2 3.799433 2.887688
## 2    3 3.804542 2.901911
## 3    4 3.823584 2.900848
## 4    5 3.808271 2.892205
## 5    6 3.832598 2.912551
## 6    7 3.781019 2.853644
## 7    8 3.791754 2.866697
## 8    9 3.890987 2.898198
## 9   10 4.021537 2.953468

Berdasarkan rmse dan mae

best_poly_amerika %>% slice_min(rmse)
##   ordo     rmse      mae
## 1    7 3.781019 2.853644
best_poly_amerika %>% slice_min(rmse)
##   ordo     rmse      mae
## 1    7 3.781019 2.853644

Plot CV

best_poly_amerika$ordo <- as.factor(best_poly_amerika$ordo)
ggplot(data=best_poly_amerika, aes(x=ordo, y=rmse, group=1)) +
  geom_line(linetype = "dashed")+
  geom_point()+
  ggtitle("K-Fold Cross Validation") +
  xlab("Degree of Polynomial") + 
  ylab("Root Mean Square Error")

Plot

H <- ggplot(AutoAmerika,aes(x=horsepower, y=mpg)) + 
  geom_point(alpha=0.55, color="red") +
  stat_smooth(method = "lm", 
              formula = y~poly(x,2,raw=T), 
              lty = 1, col = "black",se = F)+
  theme_bw()+
  ggtitle("Regresi Polinomial Ordo 2") +
  xlab("Horse Power") + 
  ylab("mile per gallon")  

I <- ggplot(AutoAmerika,aes(x=horsepower, y=mpg)) + 
  geom_point(alpha=0.55, color="red") +
  stat_smooth(method = "lm", 
              formula = y~poly(x,7,raw=T), 
              lty = 1, col = "black",se = F)+
  theme_bw()+
  ggtitle("Regresi Polinomial Ordo 7") +
  xlab("Horse Power") + 
  ylab("mile per gallon")

ggarrange(H, I, ncol = 2, nrow = 1)

Summary Model

polinom_amerika = lm(mpg ~ poly(horsepower,2))
summary(polinom_amerika)
## 
## Call:
## lm(formula = mpg ~ poly(horsepower, 2))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -14.7135  -2.5943  -0.0859   2.2868  15.8961 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)            23.4459     0.2209  106.13   <2e-16 ***
## poly(horsepower, 2)1 -120.1377     4.3739  -27.47   <2e-16 ***
## poly(horsepower, 2)2   44.0895     4.3739   10.08   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.374 on 389 degrees of freedom
## Multiple R-squared:  0.6876, Adjusted R-squared:  0.686 
## F-statistic:   428 on 2 and 389 DF,  p-value: < 2.2e-16

Piecewise Constant

Cross Validation

set.seed(123)
cross_val_amerika <- vfold_cv(AutoAmerika,v=10,strata = "mpg")

breaks <- 2:10

best_tangga_amerika <- map_dfr(breaks, function(i){
  metric_tangga_amerika <- map_dfr(cross_val_amerika$splits,
                           function(x){
                             training <- AutoAmerika[x$in_id,]
                             training$horsepower <- cut(training$horsepower,i)
                             mod <- lm(mpg ~ horsepower, data=training)
                             labs_x <- levels(mod$model[,2])
                             labs_x_breaks <- cbind(lower = as.numeric( sub("\\((.+),.*", "\\1", labs_x) ),
                                                    upper = as.numeric( sub("[^,]*,([^]]*)\\]", "\\1", labs_x)))
                             testing <- AutoAmerika[-x$in_id,]
                             horsepower_new <- cut(testing$horsepower,c(labs_x_breaks[1,1],labs_x_breaks[,2]))
                             pred <- predict(mod,   newdata=list(horsepower=horsepower_new))
                             truth <- testing$mpg
                             data_eval <- na.omit(data.frame(truth,pred))
                             rmse <- mlr3measures::rmse(truth = data_eval$truth,response = data_eval$pred)
                             mae <- mlr3measures::mae(truth = data_eval$truth,response = data_eval$pred)
                             metric <- c(rmse,mae)
                             names(metric) <- c("rmse","mae")
                             return(metric)
                             }
                           )

metric_tangga_amerika

# menghitung rata-rata untuk 10 folds
mean_metric_tangga_amerika <- colMeans(metric_tangga_amerika)

mean_metric_tangga_amerika
})
best_tangga_amerika <- cbind(breaks=breaks,best_tangga_amerika)
best_tangga_amerika
##   breaks     rmse      mae
## 1      2 4.997476 3.818722
## 2      3 4.759908 3.710652
## 3      4 4.139362 3.076968
## 4      5 3.966889 3.067326
## 5      6 4.222039 3.206859
## 6      7 4.150320 3.162726
## 7      8 3.945534 2.949718
## 8      9 3.737522 2.856986
## 9     10 3.816464 2.963023

Berdasarkan rmse dan mae

best_tangga_amerika %>% slice_min(rmse)
##   breaks     rmse      mae
## 1      9 3.737522 2.856986
best_tangga_amerika %>% slice_min(mae)
##   breaks     rmse      mae
## 1      9 3.737522 2.856986

Plot CV

best_tangga_amerika$breaks <- as.factor(best_tangga_amerika$breaks)

ggplot(data = best_tangga_amerika, aes(x = breaks, group = 1)) +
  geom_line(aes(y = rmse), linetype = "dashed", color = "blue") +
  geom_point(aes(y = rmse), color = "blue") +
  geom_line(aes(y = mae), linetype = "dashed", color = "red") +
  geom_point(aes(y = mae), color = "red") +
  ggtitle("K-Fold Cross Validation") +
  xlab("Breaks") +
  ylab("Metric Value")

Plot

pc9<- ggplot(AutoAmerika,aes(x=horsepower, y=mpg)) +
                 geom_point(alpha=0.55, color="red") +
  stat_smooth(method = "lm", 
              formula = y~cut(x,9), 
              lty = 1, col = "black",se = F)+
  theme_bw()+
  ggtitle("K=9") +
  xlab("Horse Power") + ylab("mile per gallon")

ggarrange(pc9, ncol = 1, nrow = 1)

Summary Model

tangga_amerika = lm(mpg ~ cut(horsepower,9))
summary(tangga_amerika)
## 
## Call:
## lm(formula = mpg ~ cut(horsepower, 9))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -14.0593  -2.8829  -0.0593   2.4977  16.2133 
## 
## Coefficients:
##                               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                    34.3711     0.7379  46.577  < 2e-16 ***
## cut(horsepower, 9)(66.4,86.9]  -5.3118     0.8580  -6.191 1.54e-09 ***
## cut(horsepower, 9)(86.9,107]  -11.4882     0.8550 -13.437  < 2e-16 ***
## cut(horsepower, 9)(107,128]   -13.7332     1.0506 -13.071  < 2e-16 ***
## cut(horsepower, 9)(128,148]   -17.8844     1.1110 -16.097  < 2e-16 ***
## cut(horsepower, 9)(148,169]   -19.8544     1.0580 -18.766  < 2e-16 ***
## cut(horsepower, 9)(169,189]   -20.5711     1.3871 -14.830  < 2e-16 ***
## cut(horsepower, 9)(189,210]   -21.8086     1.7695 -12.324  < 2e-16 ***
## cut(horsepower, 9)(210,230]   -21.2599     1.6864 -12.607  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.549 on 383 degrees of freedom
## Multiple R-squared:  0.6673, Adjusted R-squared:  0.6603 
## F-statistic:    96 on 8 and 383 DF,  p-value: < 2.2e-16

Natural Cubic Splines

Cross Validation

set.seed(123)
cross_val_amerika <- vfold_cv(AutoAmerika,v=10,strata = "mpg")

df <- 2:15

best_spline3_amerika <- map_dfr(df, function(i) {
  metric_spline3_amerika <- map_dfr(cross_val_amerika$splits,
                            function(x) {
                              mod <- lm(mpg ~ ns(horsepower,df=i), data=AutoAmerika[x$in_id,])
                              pred <- predict(mod, newdata=AutoAmerika[-x$in_id,])
                              truth <- AutoAmerika[-x$in_id,]$mpg
                              rmse <- mlr3measures::rmse(truth = truth, response = pred)
                              mae <- mlr3measures::mae(truth = truth, response = pred)
                              metric <- c(rmse,mae)
                              names(metric) <- c("rmse","mae")
                              return(metric)
                              }
                            )
  metric_spline3_amerika

# menghitung rata-rata untuk 10 folds
  mean_metric_spline3_amerika <- colMeans(metric_spline3_amerika)
  mean_metric_spline3_amerika
  }
)

best_spline3_amerika <- cbind(df=df,best_spline3_amerika)
best_spline3_amerika
##    df     rmse      mae
## 1   2 3.786961 2.889484
## 2   3 3.797055 2.892392
## 3   4 3.815970 2.903793
## 4   5 3.796834 2.852705
## 5   6 3.800473 2.869119
## 6   7 3.769022 2.814088
## 7   8 3.774172 2.816419
## 8   9 3.804854 2.843316
## 9  10 3.814011 2.838197
## 10 11 3.906193 2.898857
## 11 12 3.919305 2.885175
## 12 13 3.930376 2.888267
## 13 14 3.946494 2.921426
## 14 15 3.858799 2.880199

Berdasarkan rmse dan mae

best_spline3_amerika %>% slice_min(rmse)
##   df     rmse      mae
## 1  7 3.769022 2.814088
best_spline3_amerika %>% slice_min(mae)
##   df     rmse      mae
## 1  7 3.769022 2.814088

Plot CV

best_spline3_amerika$df <- as.factor(best_spline3_amerika$df)
ggplot(data=best_spline3_amerika, aes(x=df, y=rmse, group=1)) +
  geom_line(linetype = "dashed")+
  geom_point()+
  ggtitle("K-Fold Cross Validation") +
  xlab("DF") + ylab("Root Mean Square Error")

Knots

attr(ns(AutoAmerika$horsepower, df=7),"knots")
## 14.28571% 28.57143% 42.85714% 57.14286% 71.42857% 85.71429% 
##  82.85714  90.00000 100.00000 110.00000 145.00000 165.00000

Plot

ncs <- ggplot(AutoAmerika,aes(x=horsepower, y=mpg)) +
                 geom_point(alpha=0.55, color="red")+
    stat_smooth(method = "lm", 
               formula = y~ns(x, df=7), 
               lty = 1,col = "black", se=F)+
    theme_bw()+
    ggtitle("DF =7") +
  xlab("Horse Power") + 
  ylab("mile per gallon") +
  geom_vline(xintercept = c(82.5, 90, 100, 110, 145, 165), col="grey50", lty=2)
ncs

Summary Model

regspline_amerika = lm(mpg ~ ns(horsepower, df=7))
summary(regspline_amerika)
## 
## Call:
## lm(formula = mpg ~ ns(horsepower, df = 7))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -15.6158  -2.5868  -0.1339   2.2474  14.4829 
## 
## Coefficients:
##                         Estimate Std. Error t value Pr(>|t|)    
## (Intercept)               33.748      1.563  21.592  < 2e-16 ***
## ns(horsepower, df = 7)1   -7.888      1.694  -4.657 4.43e-06 ***
## ns(horsepower, df = 7)2   -7.844      1.923  -4.078 5.52e-05 ***
## ns(horsepower, df = 7)3  -14.226      1.812  -7.853 4.11e-14 ***
## ns(horsepower, df = 7)4  -12.527      2.077  -6.032 3.81e-09 ***
## ns(horsepower, df = 7)5  -24.049      1.715 -14.021  < 2e-16 ***
## ns(horsepower, df = 7)6  -19.387      3.664  -5.291 2.05e-07 ***
## ns(horsepower, df = 7)7  -20.803      1.761 -11.816  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.272 on 384 degrees of freedom
## Multiple R-squared:  0.7057, Adjusted R-squared:  0.7004 
## F-statistic: 131.6 on 7 and 384 DF,  p-value: < 2.2e-16

Komparasi Model

Berdasarkan Nilai RMSE dan MAE

nilai_metric_amerika <- rbind(best_poly_amerika %>% select(-1) %>% slice_min(rmse),
                      best_tangga_amerika %>% select(-1) %>% slice_min(rmse),
                      best_spline3_amerika %>%  select(-1)%>% slice_min(rmse)
                      )

nama_model_amerika <- c("Polinomial",
                        "Piecewise Constant",
                        "Natural Cubic Splines")

komparasi_amerika <- data.frame(nama_model_amerika,nilai_metric_amerika)
komparasi_amerika
##      nama_model_amerika     rmse      mae
## 1            Polinomial 3.781019 2.853644
## 2    Piecewise Constant 3.737522 2.856986
## 3 Natural Cubic Splines 3.769022 2.814088

Berdasarkan Plot

rl_A <- ggplot(AutoAmerika, aes(x = horsepower, y = mpg)) + 
  geom_point(alpha = 0.55, color = "red") +
  theme_bw() +
  labs(y = "mpg", x = "Horse Power")

poly2 <- ggplot(AutoAmerika,aes(x=horsepower, y=mpg), add=TRUE) + 
  geom_point(alpha=0.55, color="red") +
  stat_smooth(method = "lm", 
               formula = y~poly(x,2,raw=T), 
               lty = 1, col = "black",se = F)+
  theme_bw()


pc9 <- ggplot(AutoAmerika,aes(x=horsepower, y=mpg), add=TRUE) +
                 geom_point(alpha=0.55, color="red") +
  stat_smooth(method = "lm", 
               formula = y~cut(x,9), 
               lty = 1, col = "black",se = F)+
  theme_bw()

nc7 <- ggplot(AutoAmerika, aes(x = horsepower, y = mpg), add=TRUE) + 
  geom_point(alpha = 0.55, color="red") + 
  stat_smooth(method = "lm", formula = y~ns(x, knots = c(82.5, 90, 100, 110, 145, 165)), col = "black", se = F)+
  theme_bw()

plot_grid(rl_A, poly2, pc9, nc7, labels = c("rl_A", "poly2","pc9", "nc7"), label_size = 6)

Data Eropa

Regresi Polinomial

Cross Validation

set.seed(123)
cross_val_eropa <- vfold_cv(AutoEropa,v=10,strata = "mpg")
## Warning: The number of observations in each quantile is below the recommended threshold of 20.
## • Stratification will use 3 breaks instead.
ordo <- 2:10

best_poly_eropa <- map_dfr(ordo, function(i) {
  metric_poly_eropa <- map_dfr(cross_val_eropa$splits,
                            function(x) {
                              mod <- lm(mpg ~ poly(horsepower,ordo=i),
                                        data=AutoEropa[x$in_id,])
                              pred <- predict(mod, newdata=AutoEropa[-x$in_id,])
                              truth <- AutoEropa[-x$in_id,]$mpg
                              rmse <- mlr3measures::rmse(truth = truth, response = pred)
                              mae <- mlr3measures::mae(truth = truth, response = pred)
                              metric <- c(rmse,mae)
                              names(metric) <- c("rmse","mae")
                              return(metric)
                              }
                            )
  metric_poly_eropa

# menghitung rata-rata untuk 10 folds
  mean_metric_poly_eropa <- colMeans(metric_poly_eropa)
  mean_metric_poly_eropa
  }
)

best_poly_eropa <- cbind(ordo=ordo,best_poly_eropa)
best_poly_eropa
##   ordo      rmse      mae
## 1    2  4.805676 3.906728
## 2    3  4.832309 3.944713
## 3    4  4.878846 3.984634
## 4    5  5.386692 4.428764
## 5    6  5.906675 4.692453
## 6    7  6.286173 4.754299
## 7    8  6.530337 4.785631
## 8    9 16.788402 8.940515
## 9   10 12.961372 7.304674

Berdasarkan rmse dan mae

best_poly_eropa %>% slice_min(rmse)
##   ordo     rmse      mae
## 1    2 4.805676 3.906728
best_poly_eropa %>% slice_min(rmse)
##   ordo     rmse      mae
## 1    2 4.805676 3.906728

Plot CV

best_poly_eropa$ordo <- as.factor(best_poly_eropa$ordo)
ggplot(data=best_poly_eropa, aes(x=ordo, y=rmse, group=1)) +
  geom_line(linetype = "dashed")+
  geom_point()+
  ggtitle("K-Fold Cross Validation") +
  xlab("Degree of Polynomial") + 
  ylab("Root Mean Square Error")

Plot

J <- ggplot(AutoEropa,aes(x=horsepower, y=mpg)) + 
  geom_point(alpha=0.55, color="blue") +
  stat_smooth(method = "lm", 
              formula = y~poly(x,2,raw=T), 
              lty = 1, col = "black",se = F)+
  theme_bw()+
  ggtitle("Regresi Polinomial Ordo 2") +
  xlab("Horse Power") + 
  ylab("mile per gallon")  

K <- ggplot(AutoEropa,aes(x=horsepower, y=mpg)) + 
  geom_point(alpha=0.55, color="blue") +
  stat_smooth(method = "lm", 
              formula = y~poly(x,3,raw=T), 
              lty = 1, col = "black",se = F)+
  theme_bw()+
  ggtitle("Regresi Polinomial Ordo 3") +
  xlab("Horse Power") + 
  ylab("mile per gallon")

ggarrange(J, K, ncol = 2, nrow = 1)

Summary Model

polinom_eropa = lm(mpg ~ poly(horsepower,2))
summary(polinom_eropa)
## 
## Call:
## lm(formula = mpg ~ poly(horsepower, 2))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -14.7135  -2.5943  -0.0859   2.2868  15.8961 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)            23.4459     0.2209  106.13   <2e-16 ***
## poly(horsepower, 2)1 -120.1377     4.3739  -27.47   <2e-16 ***
## poly(horsepower, 2)2   44.0895     4.3739   10.08   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.374 on 389 degrees of freedom
## Multiple R-squared:  0.6876, Adjusted R-squared:  0.686 
## F-statistic:   428 on 2 and 389 DF,  p-value: < 2.2e-16

Piecewise Constant

Cross Validation

set.seed(123)
cross_val_eropa <- vfold_cv(AutoEropa,v=10,strata = "mpg")
## Warning: The number of observations in each quantile is below the recommended threshold of 20.
## • Stratification will use 3 breaks instead.
breaks <- 2:9

best_tangga_eropa <- map_dfr(breaks, function(i){
  metric_tangga_eropa <- map_dfr(cross_val_eropa$splits,
                           function(x){
                             training <- AutoEropa[x$in_id,]
                             training$horsepower <- cut(training$horsepower,i)
                             mod <- lm(mpg ~ horsepower, data=training)
                             labs_x <- levels(mod$model[,2])
                             labs_x_breaks <- cbind(lower = as.numeric( sub("\\((.+),.*", "\\1", labs_x) ),
                                                    upper = as.numeric( sub("[^,]*,([^]]*)\\]", "\\1", labs_x)))
                             testing <- AutoEropa[-x$in_id,]
                             horsepower_new <- cut(testing$horsepower,c(labs_x_breaks[1,1],labs_x_breaks[,2]))
                             pred <- predict(mod,   newdata=list(horsepower=horsepower_new))
                             truth <- testing$mpg
                             data_eval <- na.omit(data.frame(truth,pred))
                             rmse <- mlr3measures::rmse(truth = data_eval$truth,response = data_eval$pred)
                             mae <- mlr3measures::mae(truth = data_eval$truth,response = data_eval$pred)
                             metric <- c(rmse,mae)
                             names(metric) <- c("rmse","mae")
                             return(metric)
                             }
                           )

metric_tangga_eropa

# menghitung rata-rata untuk 10 folds
mean_metric_tangga_eropa <- colMeans(metric_tangga_eropa)

mean_metric_tangga_eropa
})
best_tangga_eropa <- cbind(breaks=breaks,best_tangga_eropa)
best_tangga_eropa
##   breaks     rmse      mae
## 1      2 5.462277 4.313951
## 2      3 5.395409 4.482648
## 3      4 5.226264 4.308287
## 4      5 4.963259 3.982939
## 5      6 5.133934 4.101179
## 6      7 4.949469 3.951007
## 7      8 5.116465 4.154967
## 8      9 5.225155 4.293021

Berdasarkan rmse dan mae

best_tangga_eropa %>% slice_min(rmse)
##   breaks     rmse      mae
## 1      7 4.949469 3.951007
best_tangga_eropa %>% slice_min(mae)
##   breaks     rmse      mae
## 1      7 4.949469 3.951007

Plot CV

best_tangga_eropa$breaks <- as.factor(best_tangga_eropa$breaks)

ggplot(data = best_tangga_eropa, aes(x = breaks, group = 1)) +
  geom_line(aes(y = rmse), linetype = "dashed", color = "blue") +
  geom_point(aes(y = rmse), color = "blue") +
  geom_line(aes(y = mae), linetype = "dashed", color = "red") +
  geom_point(aes(y = mae), color = "red") +
  ggtitle("K-Fold Cross Validation") +
  xlab("Breaks") +
  ylab("Metric Value")

Plot

pc5<- ggplot(AutoEropa,aes(x=horsepower, y=mpg)) +
                 geom_point(alpha=0.55, color="blue") +
  stat_smooth(method = "lm", 
              formula = y~cut(x,5), 
              lty = 1, col = "black",se = F)+
  theme_bw()+
  ggtitle("K=5") +
  xlab("Horse Power") + ylab("mile per gallon")

pc7<- ggplot(AutoEropa,aes(x=horsepower, y=mpg)) +
                 geom_point(alpha=0.55, color="blue") +
  stat_smooth(method = "lm", 
              formula = y~cut(x,7), 
              lty = 1, col = "black",se = F)+
  theme_bw()+
  ggtitle("K=7") +
  xlab("Horse Power") + ylab("mile per gallon")

ggarrange(pc5, pc7, ncol = 2, nrow = 1)

Summary Model

tangga_eropa = lm(mpg ~ cut(horsepower,7))
summary(tangga_eropa)
## 
## Call:
## lm(formula = mpg ~ cut(horsepower, 7))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -17.5280  -2.5530  -0.3758   2.2881  16.7482 
## 
## Coefficients:
##                               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                    32.5280     0.5069   64.17   <2e-16 ***
## cut(horsepower, 7)(72.3,98.6]  -6.8162     0.6358  -10.72   <2e-16 ***
## cut(horsepower, 7)(98.6,125]  -12.1523     0.7591  -16.01   <2e-16 ***
## cut(horsepower, 7)(125,151]   -16.5763     0.7957  -20.83   <2e-16 ***
## cut(horsepower, 7)(151,177]   -18.5020     1.0831  -17.08   <2e-16 ***
## cut(horsepower, 7)(177,204]   -19.4447     1.4187  -13.71   <2e-16 ***
## cut(horsepower, 7)(204,230]   -19.6280     1.5375  -12.77   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.59 on 385 degrees of freedom
## Multiple R-squared:  0.6594, Adjusted R-squared:  0.6541 
## F-statistic: 124.2 on 6 and 385 DF,  p-value: < 2.2e-16

Natural Cubic Splines

Cross Validation

set.seed(123)
cross_val_eropa <- vfold_cv(AutoEropa,v=10,strata = "mpg")
## Warning: The number of observations in each quantile is below the recommended threshold of 20.
## • Stratification will use 3 breaks instead.
df <- 2:15

best_spline3_eropa <- map_dfr(df, function(i) {
  metric_spline3_eropa <- map_dfr(cross_val_eropa$splits,
                            function(x) {
                              mod <- lm(mpg ~ ns(horsepower,df=i), data=AutoEropa[x$in_id,])
                              pred <- predict(mod, newdata=AutoEropa[-x$in_id,])
                              truth <- AutoEropa[-x$in_id,]$mpg
                              rmse <- mlr3measures::rmse(truth = truth, response = pred)
                              mae <- mlr3measures::mae(truth = truth, response = pred)
                              metric <- c(rmse,mae)
                              names(metric) <- c("rmse","mae")
                              return(metric)
                              }
                            )
  metric_spline3_eropa

# menghitung rata-rata untuk 10 folds
  mean_metric_spline3_eropa <- colMeans(metric_spline3_eropa)
  mean_metric_spline3_eropa
  }
)

best_spline3_eropa <- cbind(df=df,best_spline3_eropa)
best_spline3_eropa
##    df     rmse      mae
## 1   2 4.807144 3.901102
## 2   3 4.830632 3.946643
## 3   4 4.921306 4.013528
## 4   5 5.000935 4.056378
## 5   6 5.065292 4.049801
## 6   7 5.126662 4.096608
## 7   8 5.217695 4.177479
## 8   9 5.382172 4.306452
## 9  10 5.465415 4.374886
## 10 11 5.564501 4.422779
## 11 12 5.537463 4.408483
## 12 13 5.513853 4.343441
## 13 14 5.588090 4.477874
## 14 15 5.659180 4.506912

Berdasarkan rmse dan mae

best_spline3_eropa %>% slice_min(rmse)
##   df     rmse      mae
## 1  2 4.807144 3.901102
best_spline3_eropa %>% slice_min(mae)
##   df     rmse      mae
## 1  2 4.807144 3.901102

Plot CV

best_spline3_eropa$df <- as.factor(best_spline3_eropa$df)
ggplot(data=best_spline3_eropa, aes(x=df, y=rmse, group=1)) +
  geom_line(linetype = "dashed")+
  geom_point()+
  ggtitle("K-Fold Cross Validation") +
  xlab("DF") + ylab("Root Mean Square Error")

Knots

attr(ns(AutoEropa$horsepower, df=2),"knots")
##  50% 
## 76.5

Plot

ncs <- ggplot(AutoEropa,aes(x=horsepower, y=mpg)) +
                 geom_point(alpha=0.55, color="blue")+
    stat_smooth(method = "lm", 
               formula = y~ns(x, df=2), 
               lty = 1,col = "black", se=F)+
    theme_bw()+
    ggtitle("DF =2") +
  xlab("Horse Power") + 
  ylab("mile per gallon") +
  geom_vline(xintercept = c(76.5), col="black", lty=2)
ncs

Summary Model

regspline_eropa = lm(mpg ~ ns(horsepower, df=2))
summary(regspline_eropa)
## 
## Call:
## lm(formula = mpg ~ ns(horsepower, df = 2))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -14.8454  -2.4171  -0.1837   2.2563  15.8921 
## 
## Coefficients:
##                         Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              38.3090     0.6779   56.52   <2e-16 ***
## ns(horsepower, df = 2)1 -41.4766     1.5831  -26.20   <2e-16 ***
## ns(horsepower, df = 2)2 -16.0003     1.2308  -13.00   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.354 on 389 degrees of freedom
## Multiple R-squared:  0.6904, Adjusted R-squared:  0.6888 
## F-statistic: 433.7 on 2 and 389 DF,  p-value: < 2.2e-16

Komparasi Model

Berdasarkan Nilai RMSE dan MAE

nilai_metric_eropa <- rbind(best_poly_eropa %>% select(-1) %>% slice_min(rmse),
                      best_tangga_eropa %>% select(-1) %>% slice_min(rmse),
                      best_spline3_eropa %>%  select(-1)%>% slice_min(rmse)
                      )

nama_model_eropa <- c("Polinomial",
                        "Piecewise Constant",
                        "Natural Cubic Splines")

komparasi_eropa <- data.frame(nama_model_eropa,nilai_metric_eropa)
komparasi_eropa
##        nama_model_eropa     rmse      mae
## 1            Polinomial 4.805676 3.906728
## 2    Piecewise Constant 4.949469 3.951007
## 3 Natural Cubic Splines 4.807144 3.901102

Berdasarkan Plot

rl_E <- ggplot(AutoEropa, aes(x = horsepower, y = mpg)) + 
  geom_point(alpha = 0.55, color = "blue") +
  theme_bw() +
  labs(y = "mpg", x = "Horse Power")

poly2 <- ggplot(AutoEropa,aes(x=horsepower, y=mpg), add=TRUE) + 
  geom_point(alpha=0.55, color="blue") +
  stat_smooth(method = "lm", 
               formula = y~poly(x,2,raw=T), 
               lty = 1, col = "black",se = F)+
  theme_bw()


pc7 <- ggplot(AutoEropa,aes(x=horsepower, y=mpg), add=TRUE) +
                 geom_point(alpha=0.55, color="blue") +
  stat_smooth(method = "lm", 
               formula = y~cut(x,7), 
               lty = 1, col = "black",se = F)+
  theme_bw()

nc2 <- ggplot(AutoEropa, aes(x = horsepower, y = mpg), add=TRUE) + 
  geom_point(alpha = 0.55, color="blue") + 
  stat_smooth(method = "lm", formula = y~ns(x, knots = c(76.5)), col = "black", se = F)+
  theme_bw()

plot_grid(rl_E, poly2, pc7, nc2, labels = c("rl_E", "poly2","pc7", "nc2"), label_size = 6)

Data Jepang

Regresi Polinomial

Cross Validation

set.seed(123)
cross_val_jepang <- vfold_cv(AutoJepang,v=10,strata = "mpg")
## Warning: The number of observations in each quantile is below the recommended threshold of 20.
## • Stratification will use 3 breaks instead.
ordo <- 2:10

best_poly_jepang <- map_dfr(ordo, function(i) {
  metric_poly_jepang <- map_dfr(cross_val_jepang$splits,
                            function(x) {
                              mod <- lm(mpg ~ poly(horsepower,ordo=i),
                                        data=AutoJepang[x$in_id,])
                              pred <- predict(mod, newdata=AutoJepang[-x$in_id,])
                              truth <- AutoJepang[-x$in_id,]$mpg
                              rmse <- mlr3measures::rmse(truth = truth, response = pred)
                              mae <- mlr3measures::mae(truth = truth, response = pred)
                              metric <- c(rmse,mae)
                              names(metric) <- c("rmse","mae")
                              return(metric)
                              }
                            )
  metric_poly_jepang

# menghitung rata-rata untuk 10 folds
  mean_metric_poly_jepang <- colMeans(metric_poly_jepang)
  mean_metric_poly_jepang
  }
)

best_poly_jepang <- cbind(ordo=ordo,best_poly_jepang)
best_poly_jepang
##   ordo      rmse       mae
## 1    2  4.522705  3.624496
## 2    3  3.962162  3.024320
## 3    4  4.380986  3.311081
## 4    5  3.963580  3.013920
## 5    6  5.441724  3.816592
## 6    7  8.538692  5.108307
## 7    8 12.147461  6.621785
## 8    9 18.314340  9.158679
## 9   10 33.718613 15.414116

Berdasarkan rmse dan mae

best_poly_jepang %>% slice_min(rmse)
##   ordo     rmse     mae
## 1    3 3.962162 3.02432
best_poly_jepang %>% slice_min(rmse)
##   ordo     rmse     mae
## 1    3 3.962162 3.02432

Plot CV

best_poly_jepang$ordo <- as.factor(best_poly_jepang$ordo)
ggplot(data=best_poly_jepang, aes(x=ordo, y=rmse, group=1)) +
  geom_line(linetype = "dashed")+
  geom_point()+
  ggtitle("K-Fold Cross Validation") +
  xlab("Degree of Polynomial") + 
  ylab("Root Mean Square Error")

Plot

M <- ggplot(AutoJepang,aes(x=horsepower, y=mpg)) + 
  geom_point(alpha=0.55, color="green") +
  stat_smooth(method = "lm", 
              formula = y~poly(x,3,raw=T), 
              lty = 1, col = "black",se = F)+
  theme_bw()+
  ggtitle("Regresi Polinomial Ordo 3") +
  xlab("Horse Power") + 
  ylab("mile per gallon")  

N <- ggplot(AutoJepang,aes(x=horsepower, y=mpg)) + 
  geom_point(alpha=0.55, color="green") +
  stat_smooth(method = "lm", 
              formula = y~poly(x,5,raw=T), 
              lty = 1, col = "black",se = F)+
  theme_bw()+
  ggtitle("Regresi Polinomial Ordo 5") +
  xlab("Horse Power") + 
  ylab("mile per gallon")

ggarrange(M, N, ncol = 2, nrow = 1)

Summary Model

polinom_jepang = lm(mpg ~ poly(horsepower,3))
summary(polinom_jepang)
## 
## Call:
## lm(formula = mpg ~ poly(horsepower, 3))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -14.7039  -2.4491  -0.1519   2.2035  15.8159 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)            23.446      0.221 106.105   <2e-16 ***
## poly(horsepower, 3)1 -120.138      4.375 -27.460   <2e-16 ***
## poly(horsepower, 3)2   44.090      4.375  10.078   <2e-16 ***
## poly(horsepower, 3)3   -3.949      4.375  -0.903    0.367    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.375 on 388 degrees of freedom
## Multiple R-squared:  0.6882, Adjusted R-squared:  0.6858 
## F-statistic: 285.5 on 3 and 388 DF,  p-value: < 2.2e-16

Piecewise Constant

Cross Validation

set.seed(123)
cross_val_jepang <- vfold_cv(AutoJepang,v=10,strata = "mpg")
## Warning: The number of observations in each quantile is below the recommended threshold of 20.
## • Stratification will use 3 breaks instead.
breaks <- 2:5

best_tangga_jepang <- map_dfr(breaks, function(i){
  metric_tangga_jepang <- map_dfr(cross_val_jepang$splits,
                           function(x){
                             training <- AutoJepang[x$in_id,]
                             training$horsepower <- cut(training$horsepower,i)
                             mod <- lm(mpg ~ horsepower, data=training)
                             labs_x <- levels(mod$model[,2])
                             labs_x_breaks <- cbind(lower = as.numeric( sub("\\((.+),.*", "\\1", labs_x) ),
                                                    upper = as.numeric( sub("[^,]*,([^]]*)\\]", "\\1", labs_x)))
                             testing <- AutoJepang[-x$in_id,]
                             horsepower_new <- cut(testing$horsepower,c(labs_x_breaks[1,1],labs_x_breaks[,2]))
                             pred <- predict(mod,   newdata=list(horsepower=horsepower_new))
                             truth <- testing$mpg
                             data_eval <- na.omit(data.frame(truth,pred))
                             rmse <- mlr3measures::rmse(truth = data_eval$truth,response = data_eval$pred)
                             mae <- mlr3measures::mae(truth = data_eval$truth,response = data_eval$pred)
                             metric <- c(rmse,mae)
                             names(metric) <- c("rmse","mae")
                             return(metric)
                             }
                           )

metric_tangga_jepang

# menghitung rata-rata untuk 10 folds
mean_metric_tangga_jepang <- colMeans(metric_tangga_jepang)

mean_metric_tangga_jepang
})
best_tangga_jepang <- cbind(breaks=breaks,best_tangga_jepang)
best_tangga_jepang
##   breaks     rmse      mae
## 1      2 4.306154 3.254086
## 2      3 4.201949 3.360962
## 3      4 4.072662 3.248551
## 4      5 4.226607 3.410855

Berdasarkan rmse dan mae

best_tangga_jepang %>% slice_min(rmse)
##   breaks     rmse      mae
## 1      4 4.072662 3.248551
best_tangga_jepang %>% slice_min(mae)
##   breaks     rmse      mae
## 1      4 4.072662 3.248551

Plot CV

best_tangga_jepang$breaks <- as.factor(best_tangga_jepang$breaks)

ggplot(data = best_tangga_jepang, aes(x = breaks, group = 1)) +
  geom_line(aes(y = rmse), linetype = "dashed", color = "blue") +
  geom_point(aes(y = rmse), color = "blue") +
  geom_line(aes(y = mae), linetype = "dashed", color = "red") +
  geom_point(aes(y = mae), color = "red") +
  ggtitle("K-Fold Cross Validation") +
  xlab("Breaks") +
  ylab("Metric Value")

Plot

pc4<- ggplot(AutoJepang,aes(x=horsepower, y=mpg)) +
                 geom_point(alpha=0.55, color="green") +
  stat_smooth(method = "lm", 
              formula = y~cut(x,4), 
              lty = 1, col = "black",se = F)+
  theme_bw()+
  ggtitle("K=4") +
  xlab("Horse Power") + ylab("mile per gallon")

ggarrange(pc4, ncol = 1, nrow = 1)

Summary Model

tangga_jepang = lm(mpg ~ cut(horsepower,4))
summary(tangga_jepang)
## 
## Call:
## lm(formula = mpg ~ cut(horsepower, 4))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -14.0533  -2.7826  -0.2927   2.8593  17.5467 
## 
## Coefficients:
##                             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                  29.0533     0.3581   81.12   <2e-16 ***
## cut(horsepower, 4)(92,138]   -8.4506     0.5946  -14.21   <2e-16 ***
## cut(horsepower, 4)(138,184] -14.2707     0.7005  -20.37   <2e-16 ***
## cut(horsepower, 4)(184,230] -16.2004     1.2647  -12.81   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.001 on 388 degrees of freedom
## Multiple R-squared:  0.5926, Adjusted R-squared:  0.5894 
## F-statistic: 188.1 on 3 and 388 DF,  p-value: < 2.2e-16

Natural Cubic Splines

Cross Validation

set.seed(123)
cross_val_jepang <- vfold_cv(AutoJepang,v=10,strata = "mpg")
## Warning: The number of observations in each quantile is below the recommended threshold of 20.
## • Stratification will use 3 breaks instead.
df <- 2:15

best_spline3_jepang <- map_dfr(df, function(i) {
  metric_spline3_jepang <- map_dfr(cross_val_jepang$splits,
                            function(x) {
                              mod <- lm(mpg ~ ns(horsepower,df=i), data=AutoJepang[x$in_id,])
                              pred <- predict(mod, newdata=AutoJepang[-x$in_id,])
                              truth <- AutoJepang[-x$in_id,]$mpg
                              rmse <- mlr3measures::rmse(truth = truth, response = pred)
                              mae <- mlr3measures::mae(truth = truth, response = pred)
                              metric <- c(rmse,mae)
                              names(metric) <- c("rmse","mae")
                              return(metric)
                              }
                            )
  metric_spline3_jepang

# menghitung rata-rata untuk 10 folds
  mean_metric_spline3_jepang <- colMeans(metric_spline3_jepang)
  mean_metric_spline3_jepang
  }
)

best_spline3_jepang <- cbind(df=df,best_spline3_jepang)
best_spline3_jepang
##    df     rmse      mae
## 1   2 4.537131 3.647191
## 2   3 4.143445 3.215053
## 3   4 4.215103 3.264447
## 4   5 4.182602 3.158549
## 5   6 4.186622 3.186509
## 6   7 4.189215 3.225135
## 7   8 4.341229 3.347164
## 8   9 4.389358 3.393529
## 9  10 4.405940 3.369006
## 10 11 4.474687 3.458585
## 11 12 4.429249 3.448116
## 12 13 4.445900 3.402477
## 13 14 4.560086 3.583991
## 14 15 4.540602 3.527475

Berdasarkan rmse dan mae

best_spline3_jepang %>% slice_min(rmse)
##   df     rmse      mae
## 1  3 4.143445 3.215053
best_spline3_jepang %>% slice_min(mae)
##   df     rmse      mae
## 1  5 4.182602 3.158549

Plot CV

best_spline3_jepang$df <- as.factor(best_spline3_jepang$df)
ggplot(data=best_spline3_jepang, aes(x=df, y=rmse, group=1)) +
  geom_line(linetype = "dashed")+
  geom_point()+
  ggtitle("K-Fold Cross Validation") +
  xlab("DF") + ylab("Root Mean Square Error")

Knots

attr(ns(AutoJepang$horsepower, df=3),"knots")
## 33.33333% 66.66667% 
##        68        90

Plot

ncs <- ggplot(AutoJepang,aes(x=horsepower, y=mpg)) +
                 geom_point(alpha=0.55, color="green")+
    stat_smooth(method = "lm", 
               formula = y~ns(x, df=3), 
               lty = 1,col = "black", se=F)+
    theme_bw()+
    ggtitle("DF =3") +
  xlab("Horse Power") + 
  ylab("mile per gallon") +
  geom_vline(xintercept = c(68, 90), col="grey50", lty=2)
ncs

Summary Model

regspline_jepang = lm(mpg ~ ns(horsepower, df=3))
summary(regspline_jepang)
## 
## Call:
## lm(formula = mpg ~ ns(horsepower, df = 3))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -14.8308  -2.4068  -0.1672   2.2524  15.9184 
## 
## Coefficients:
##                         Estimate Std. Error t value Pr(>|t|)    
## (Intercept)               38.276      1.039   36.85   <2e-16 ***
## ns(horsepower, df = 3)1  -21.311      1.054  -20.22   <2e-16 ***
## ns(horsepower, df = 3)2  -35.274      2.313  -15.25   <2e-16 ***
## ns(horsepower, df = 3)3  -19.669      1.435  -13.70   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.362 on 388 degrees of freedom
## Multiple R-squared:  0.6901, Adjusted R-squared:  0.6877 
## F-statistic:   288 on 3 and 388 DF,  p-value: < 2.2e-16

Komparasi Model

Berdasarkan Nilai RMSE dan MAE

nilai_metric_jepang <- rbind(best_poly_jepang %>% select(-1) %>% slice_min(rmse),
                      best_tangga_jepang %>% select(-1) %>% slice_min(rmse),
                      best_spline3_jepang %>%  select(-1)%>% slice_min(rmse)
                      )

nama_model_jepang <- c("Polinomial",
                       "Piecewise Constant",
                       "Natural Cubic Splines")

komparasi_jepang <- data.frame(nama_model_jepang,nilai_metric_jepang)
komparasi_jepang
##       nama_model_jepang     rmse      mae
## 1            Polinomial 3.962162 3.024320
## 2    Piecewise Constant 4.072662 3.248551
## 3 Natural Cubic Splines 4.143445 3.215053

Berdasarkan Plot

rl_J <- ggplot(AutoJepang, aes(x = horsepower, y = mpg)) + 
  geom_point(alpha = 0.55, color = "green") +
  theme_bw() +
  labs(y = "mpg", x = "Horse Power")

poly3 <- ggplot(AutoJepang,aes(x=horsepower, y=mpg), add=TRUE) + 
  geom_point(alpha=0.55, color="green") +
  stat_smooth(method = "lm", 
               formula = y~poly(x,3,raw=T), 
               lty = 1, col = "black",se = F)+
  theme_bw()


pc4 <- ggplot(AutoJepang,aes(x=horsepower, y=mpg), add=TRUE) +
                 geom_point(alpha=0.55, color="green") +
  stat_smooth(method = "lm", 
               formula = y~cut(x,3), 
               lty = 1, col = "black",se = F)+
  theme_bw()

nc3 <- ggplot(AutoJepang, aes(x = horsepower, y = mpg), add=TRUE) + 
  geom_point(alpha = 0.55, color="green") + 
  stat_smooth(method = "lm", formula = y~ns(x, knots = c(68, 90)), col = "black", se = FALSE) +
  geom_vline(xintercept = c(68, 90), col="black", lty=2)+
  theme_bw()

plot_grid(rl_J, poly3, pc4, nc3, labels = c("rl_J", "poly3","pc4", "nc3"), label_size = 6)