SOAL

Lakukan cross validation untuk menghasilkan pemodelan mpg vs horsepower yang optimal menggunakan 3 metode berikut, kemudian bandingkan hasilnya.

Regesi Polinomial
Piecewise constant
Natural cubic splines

Lakukan hal ditas untuk masing-masing subset data berdasarkan asal negara produsennya (Amerika, Eropa, Jepang). Plot model terbaik dalam satu frame.

Berikan ulasan terhadap hasil anda.

Susun bahan presentasi (dengan durasi 5-7 menit), untuk dipresentasikan minggu depan. Presenter akan diacak, sehingga semua orang harus siap presentasi.

Library

library(ISLR)
library(rsample)
library(splines)
library(tidyverse)

## Warning: package 'tidyverse' was built under R version 4.1.2

## Warning: package 'tibble' was built under R version 4.1.2

## Warning: package 'tidyr' was built under R version 4.1.2

## Warning: package 'readr' was built under R version 4.1.2

## Warning: package 'purrr' was built under R version 4.1.2

## Warning: package 'dplyr' was built under R version 4.1.2

## Warning: package 'forcats' was built under R version 4.1.2

## Warning: package 'lubridate' was built under R version 4.1.2

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.2     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.4.4     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(ggpubr)

## Warning: package 'ggpubr' was built under R version 4.1.2

library(DT)
library(ggplot2)
library(cowplot)

## 
## Attaching package: 'cowplot'
## 
## The following object is masked from 'package:ggpubr':
## 
##     get_legend
## 
## The following object is masked from 'package:lubridate':
## 
##     stamp

attach(Auto)

## The following object is masked from package:lubridate:
## 
##     origin
## 
## The following object is masked from package:ggplot2:
## 
##     mpg

library(patchwork)

## 
## Attaching package: 'patchwork'
## 
## The following object is masked from 'package:cowplot':
## 
##     align_plots

Eksplorasi Data

Data Keseluruhan

Data yang digunakan adalah Auto dari package ISLR yang terdiri dari 392 observasi dan 9 variabel. Namun, variabel yang digunakan kali ini hanya 3, yaitu:

mpg : miles per gallon

horsepower: Engine horsepower

origin: origin of car (1. American, 2. European, 3. Japanese)

Auto = Auto %>%  select(mpg, horsepower, origin)
datatable(Auto)

Auto$origin <- factor(Auto$origin, levels = c('1', '2', '3'))
ggplot(Auto, aes(x = horsepower, y = mpg, color = origin)) +
  geom_point(alpha = 0.55) +
  theme_bw() +
  scale_color_manual(values = c("1" = "red", "2" = "blue", "3" = "green")) +
  labs(
    title = "Scatter Plot of `mpg` and `horsepower`",
    x = "Horse Power",
    y = "Mile per Gallon"
  )

Plot Pemodelan mpg vs horsepower

ggplot(Auto, aes(x = horsepower, y = mpg, color = factor(origin))) +
  geom_point(alpha = 0.55) +
  stat_smooth(method = "lm", formula = y ~ x, lty = 1, col = "black", se = FALSE) +
  theme_bw() +
  scale_color_manual(values = c("1" = "red", "2" = "blue", "3" = "green"))

Data Masing-Masing Negara

Amerika

AutoAmerika = Auto[Auto$origin==1,] %>%  select(mpg, horsepower, origin)
datatable(AutoAmerika)

ggplot(AutoAmerika,aes(x=horsepower, y=mpg)) + 
  geom_point(alpha=0.55, color="red") +
  theme_bw() +
   labs(y="Mile per Gallon", x="Horse Power", title = "Scatter Plot `mpg` dan `horsepower`",
        subtitle = "Amerika")

Eropa

AutoEropa = Auto[Auto$origin==2,] %>%  select(mpg, horsepower, origin)
datatable(AutoEropa)

ggplot(AutoEropa,aes(x=horsepower, y=mpg)) + 
  geom_point(alpha=0.55, color="blue") +
  theme_bw() +
   labs(y="Mile per Gallon", x="Horse Power", title = "Scatter Plot `mpg` dan `horsepower`",
        subtitle = "Eropa")

Jepang

AutoJepang = Auto[Auto$origin==3,] %>%  select(mpg, horsepower, origin)
datatable(AutoJepang)

ggplot(AutoJepang,aes(x=horsepower, y=mpg)) + 
  geom_point(alpha=0.55, color="green") +
  theme_bw() +
   labs(y="Mile per Gallon", x="Horse Power", title = "Scatter Plot `mpg` dan `horsepower`",
        subtitle = "Jepang")

plot1 <- ggplot(AutoAmerika, aes(x = horsepower, y = mpg)) + 
  geom_point(alpha = 0.55, color = "red") +
  theme_bw() +
  labs(y = "Mile per Gallon", x = "Horse Power", title = "Scatter Plot",
       subtitle = "Amerika")

plot2 <- ggplot(AutoEropa, aes(x = horsepower, y = mpg)) + 
  geom_point(alpha = 0.55, color = "blue") +
  theme_bw() +
  labs(y = "Mile per Gallon", x = "Horse Power", title = "Scatter Plot",
       subtitle = "Eropa")

plot3 <- ggplot(AutoJepang, aes(x = horsepower, y = mpg)) + 
  geom_point(alpha = 0.55, color = "green") +
  theme_bw() +
  labs(y = "Mile per Gallon", x = "Horse Power", title = "Scatter Plot",
       subtitle = "Jepang")

combined_plot <- plot1 + plot2 + plot3
combined_plot

Metode Analisis

Data Keseluruhan

Regresi Polinomial

Cross Validation

set.seed(123)
cross_val <- vfold_cv(Auto,v=10,strata = "mpg")

ordo <- 2:10

best_poly <- map_dfr(ordo, function(i) {
  metric_poly <- map_dfr(cross_val$splits,
                            function(x) {
                              mod <- lm(mpg ~ poly(horsepower,ordo=i), data=Auto[x$in_id,])
                              pred <- predict(mod, newdata=Auto[-x$in_id,])
                              truth <- Auto[-x$in_id,]$mpg
                              rmse <- mlr3measures::rmse(truth = truth, response = pred)
                              mae <- mlr3measures::mae(truth = truth, response = pred)
                              metric <- c(rmse,mae)
                              names(metric) <- c("rmse","mae")
                              return(metric)
                              }
                            )
  metric_poly

# menghitung rata-rata untuk 10 folds
  mean_metric_poly <- colMeans(metric_poly)
  mean_metric_poly
  }
)

best_poly <- cbind(ordo=ordo,best_poly)
best_poly

##   ordo     rmse      mae
## 1    2 4.351129 3.266460
## 2    3 4.355430 3.266945
## 3    4 4.367009 3.280452
## 4    5 4.318767 3.249003
## 5    6 4.317431 3.257272
## 6    7 4.293242 3.233376
## 7    8 4.309511 3.251758
## 8    9 4.323321 3.244919
## 9   10 4.371903 3.274887

Berdasarkan rmse dan mae

best_poly %>% slice_min(rmse)

##   ordo     rmse      mae
## 1    7 4.293242 3.233376

best_poly %>% slice_min(rmse)

##   ordo     rmse      mae
## 1    7 4.293242 3.233376

Plot CV

best_poly$ordo <- as.factor(best_poly$ordo)
ggplot(data=best_poly, aes(x=ordo, y=rmse, group=1)) +
  geom_line(linetype = "dashed")+
  geom_point()+
  ggtitle("K-Fold Cross Validation") +
  xlab("Degree of Polynomial") + 
  ylab("Root Mean Square Error")

Plot

Auto$origin <- factor(Auto$origin, levels = c('1', '2', '3'))

A <- ggplot(Auto, aes(x = horsepower, y = mpg, color = origin)) + 
  geom_point(alpha = 0.55) +
  stat_smooth(method = "lm", 
              formula = y ~ poly(x, 2, raw = TRUE), 
              lty = 1, col = "black", se = FALSE) +
  theme_bw() +
  scale_color_manual(values = c("1" = "red", "2" = "blue", "3" = "green")) +
  ggtitle("Regresi Polinomial Ordo 2") +
  xlab("Horse Power") + 
  ylab("Mile per Gallon")

B <- ggplot(Auto, aes(x = horsepower, y = mpg, color = origin)) + 
  geom_point(alpha = 0.55) +
  stat_smooth(method = "lm", 
              formula = y ~ poly(x, 7, raw = TRUE), 
              lty = 1, col = "black", se = FALSE) +
  theme_bw() +
  scale_color_manual(values = c("1" = "red", "2" = "blue", "3" = "green")) +
  ggtitle("Regresi Polinomial Ordo 7") +
  xlab("Horse Power") + 
  ylab("Mile per Gallon")

ggarrange(A, B, ncol = 2, nrow = 1)

Summary Model

polinom = lm(mpg ~ poly(horsepower,2))
summary(polinom)

## 
## Call:
## lm(formula = mpg ~ poly(horsepower, 2))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -14.7135  -2.5943  -0.0859   2.2868  15.8961 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)            23.4459     0.2209  106.13   <2e-16 ***
## poly(horsepower, 2)1 -120.1377     4.3739  -27.47   <2e-16 ***
## poly(horsepower, 2)2   44.0895     4.3739   10.08   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.374 on 389 degrees of freedom
## Multiple R-squared:  0.6876, Adjusted R-squared:  0.686 
## F-statistic:   428 on 2 and 389 DF,  p-value: < 2.2e-16

Piecewise Constant

Cross Validation

set.seed(123)
cross_val <- vfold_cv(Auto,v=10,strata = "mpg")

breaks <- 2:10

best_tangga <- map_dfr(breaks, function(i){
  metric_tangga <- map_dfr(cross_val$splits,
                           function(x){
                             training <- Auto[x$in_id,]
                             training$horsepower <- cut(training$horsepower,i)
                             mod <- lm(mpg ~ horsepower, data=training)
                             labs_x <- levels(mod$model[,2])
                             labs_x_breaks <- cbind(lower = as.numeric( sub("\\((.+),.*", "\\1", labs_x) ),
                                                    upper = as.numeric( sub("[^,]*,([^]]*)\\]", "\\1", labs_x)))
                             testing <- Auto[-x$in_id,]
                             horsepower_new <- cut(testing$horsepower,c(labs_x_breaks[1,1],labs_x_breaks[,2]))
                             pred <- predict(mod,   newdata=list(horsepower=horsepower_new))
                             truth <- testing$mpg
                             data_eval <- na.omit(data.frame(truth,pred))
                             rmse <- mlr3measures::rmse(truth = data_eval$truth,response = data_eval$pred)
                             mae <- mlr3measures::mae(truth = data_eval$truth,response = data_eval$pred)
                             metric <- c(rmse,mae)
                             names(metric) <- c("rmse","mae")
                             return(metric)
                             }
                           )

metric_tangga

# menghitung rata-rata untuk 10 folds
mean_metric_tangga <- colMeans(metric_tangga)

mean_metric_tangga
})
best_tangga <- cbind(breaks=breaks,best_tangga)
best_tangga

##   breaks     rmse      mae
## 1      2 6.147626 4.790234
## 2      3 5.775792 4.521829
## 3      4 4.985270 3.774549
## 4      5 4.712623 3.571614
## 5      6 4.644383 3.548375
## 6      7 4.554116 3.379980
## 7      8 4.437597 3.405433
## 8      9 4.549668 3.471999
## 9     10 4.589172 3.455531

Berdasarkan rmse dan mae

best_tangga %>% slice_min(rmse)

##   breaks     rmse      mae
## 1      8 4.437597 3.405433

best_tangga %>% slice_min(mae)

##   breaks     rmse     mae
## 1      7 4.554116 3.37998

Plot CV

# Assuming best_tangga$breaks is already converted to a factor
best_tangga$breaks <- as.factor(best_tangga$breaks)

# First plot
plot_rmse <- ggplot(data = best_tangga, aes(x = breaks, y = rmse, group = 1)) +
  geom_line(linetype = "dashed") +
  geom_point() +
  ggtitle("K-Fold Cross Validation - RMSE") +
  xlab("Breaks") + 
  ylab("Root Mean Square Error")

# Second plot
plot_mae <- ggplot(data = best_tangga, aes(x = breaks, y = mae, group = 1)) +
  geom_line(linetype = "dashed") +
  geom_point() +
  ggtitle("K-Fold Cross Validation - MAE") +
  xlab("Breaks") + 
  ylab("MAE")

# Combine the plots into one row
combined_plot <- plot_rmse + plot_mae

# Display the combined plot
combined_plot

Plot

pc7 <- ggplot(Auto, aes(x = horsepower, y = mpg, color = origin)) +
  geom_point(alpha = 0.55) +
  stat_smooth(method = "lm", 
              formula = y ~ cut(x, 7), 
              lty = 1, col = "black", se = FALSE) +
  theme_bw() +
  scale_color_manual(values = c("1" = "red", "2" = "blue", "3" = "green")) +
  ggtitle("K=7") +
  xlab("Horse Power") + 
  ylab("Mile per Gallon")

pc8 <- ggplot(Auto, aes(x = horsepower, y = mpg, color = origin)) +
  geom_point(alpha = 0.55) +
  stat_smooth(method = "lm", 
              formula = y ~ cut(x, 8), 
              lty = 1, col = "black", se = FALSE) +
  theme_bw() +
  scale_color_manual(values = c("1" = "red", "2" = "blue", "3" = "green")) +
  ggtitle("K=8") +
  xlab("Horse Power") + 
  ylab("Mile per Gallon")

ggarrange(pc7, pc8, ncol = 2, nrow = 1)

Summary Model

tangga = lm(mpg ~ cut(horsepower,7))
summary(tangga)

## 
## Call:
## lm(formula = mpg ~ cut(horsepower, 7))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -17.5280  -2.5530  -0.3758   2.2881  16.7482 
## 
## Coefficients:
##                               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                    32.5280     0.5069   64.17   <2e-16 ***
## cut(horsepower, 7)(72.3,98.6]  -6.8162     0.6358  -10.72   <2e-16 ***
## cut(horsepower, 7)(98.6,125]  -12.1523     0.7591  -16.01   <2e-16 ***
## cut(horsepower, 7)(125,151]   -16.5763     0.7957  -20.83   <2e-16 ***
## cut(horsepower, 7)(151,177]   -18.5020     1.0831  -17.08   <2e-16 ***
## cut(horsepower, 7)(177,204]   -19.4447     1.4187  -13.71   <2e-16 ***
## cut(horsepower, 7)(204,230]   -19.6280     1.5375  -12.77   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.59 on 385 degrees of freedom
## Multiple R-squared:  0.6594, Adjusted R-squared:  0.6541 
## F-statistic: 124.2 on 6 and 385 DF,  p-value: < 2.2e-16

Natural Cubic Splines

Cross Validation

set.seed(123)
cross_val <- vfold_cv(Auto,v=10,strata = "mpg")

df <- 2:15

best_spline3 <- map_dfr(df, function(i) {
  metric_spline3 <- map_dfr(cross_val$splits,
                            function(x) {
                              mod <- lm(mpg ~ ns(horsepower,df=i), data=Auto[x$in_id,])
                              pred <- predict(mod, newdata=Auto[-x$in_id,])
                              truth <- Auto[-x$in_id,]$mpg
                              rmse <- mlr3measures::rmse(truth = truth, response = pred)
                              mae <- mlr3measures::mae(truth = truth, response = pred)
                              metric <- c(rmse,mae)
                              names(metric) <- c("rmse","mae")
                              return(metric)
                              }
                            )
  metric_spline3

# menghitung rata-rata untuk 10 folds
  mean_metric_spline3 <- colMeans(metric_spline3)
  mean_metric_spline3
  }
)

best_spline3 <- cbind(df=df,best_spline3)
best_spline3

##    df     rmse      mae
## 1   2 4.330108 3.252908
## 2   3 4.348006 3.261538
## 3   4 4.329461 3.257926
## 4   5 4.313312 3.246412
## 5   6 4.310011 3.237990
## 6   7 4.294671 3.229174
## 7   8 4.293369 3.232221
## 8   9 4.281877 3.211860
## 9  10 4.260316 3.192010
## 10 11 4.288667 3.220958
## 11 12 4.285063 3.192363
## 12 13 4.302071 3.220998
## 13 14 4.322821 3.244848
## 14 15 4.305928 3.250204

Berdasarkan rmse dan mae

best_spline3 %>% slice_min(rmse)

##   df     rmse     mae
## 1 10 4.260316 3.19201

best_spline3 %>% slice_min(mae)

##   df     rmse     mae
## 1 10 4.260316 3.19201

Plot CV

best_spline3$df <- as.factor(best_spline3$df)
ggplot(data=best_spline3, aes(x=df, y=rmse, group=1)) +
  geom_line(linetype = "dashed")+
  geom_point()+
  ggtitle("K-Fold Cross Validation") +
  xlab("DF") + ylab("Root Mean Square Error")

Knots

attr(ns(Auto$horsepower, df=10),"knots")

##   10%   20%   30%   40%   50%   60%   70%   80%   90% 
##  67.0  72.0  80.0  88.0  93.5 100.0 110.0 140.0 157.7

Plot

Auto$origin <- factor(Auto$origin, levels = c('1', '2', '3'))

ncs <- ggplot(Auto, aes(x = horsepower, y = mpg, color = origin)) +
  geom_point(alpha = 0.55) +
  stat_smooth(method = "lm", 
              formula = y ~ ns(x, df = 10), 
              lty = 1, col = "black", se = FALSE) +
  theme_bw() +
  ggtitle("DF = 10") +
  xlab("Horse Power") + 
  ylab("Mile per Gallon") +
  geom_vline(xintercept = c(67, 72, 80, 88, 93.5, 100, 110, 140, 157.7), col = "black", lty = 2)

ncs

Summary Model

regspline = lm(mpg ~ ns(horsepower, df=10))
summary(regspline)

## 
## Call:
## lm(formula = mpg ~ ns(horsepower, df = 10))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -14.5003  -2.4956  -0.2086   2.2577  14.6400 
## 
## Coefficients:
##                           Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                 32.462      1.680  19.324  < 2e-16 ***
## ns(horsepower, df = 10)1    -4.600      1.789  -2.572 0.010495 *  
## ns(horsepower, df = 10)2    -3.007      2.555  -1.177 0.239996    
## ns(horsepower, df = 10)3    -8.663      2.043  -4.239 2.82e-05 ***
## ns(horsepower, df = 10)4    -7.170      2.188  -3.278 0.001142 ** 
## ns(horsepower, df = 10)5   -14.722      2.125  -6.927 1.84e-11 ***
## ns(horsepower, df = 10)6    -8.446      2.467  -3.424 0.000684 ***
## ns(horsepower, df = 10)7   -16.947      2.080  -8.148 5.38e-15 ***
## ns(horsepower, df = 10)8   -21.715      1.891 -11.483  < 2e-16 ***
## ns(horsepower, df = 10)9   -14.384      4.116  -3.494 0.000531 ***
## ns(horsepower, df = 10)10  -22.326      1.946 -11.476  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.24 on 381 degrees of freedom
## Multiple R-squared:  0.7124, Adjusted R-squared:  0.7049 
## F-statistic: 94.39 on 10 and 381 DF,  p-value: < 2.2e-16

Komparasi Model

Berdasarkan Nilai RMSE dan MAE

nilai_metric <- rbind(best_poly %>% select(-1) %>% slice_min(rmse),
                      best_tangga %>% select(-1) %>% slice_min(rmse),
                      best_spline3 %>%  select(-1)%>% slice_min(rmse)
                      )

nama_model <- c("Polinomial",
                "Piecewise Constant",
                "Natural Cubic Splines")

komparasi <- data.frame(nama_model,nilai_metric)
komparasi

##              nama_model     rmse      mae
## 1            Polinomial 4.293242 3.233376
## 2    Piecewise Constant 4.437597 3.405433
## 3 Natural Cubic Splines 4.260316 3.192010

Berdasarkan Plot

library(gridExtra)

## 
## Attaching package: 'gridExtra'

## The following object is masked from 'package:dplyr':
## 
##     combine

# Assuming 'origin' is a factor variable
Auto$origin <- factor(Auto$origin, levels = c('1', '2', '3'))

# Create the poly2 plot without origin legend
poly2 <- ggplot(Auto, aes(x = horsepower, y = mpg, color = origin)) +
  geom_point(alpha = 0.55) +
  stat_smooth(method = "lm", 
              formula = y ~ poly(x, 2, raw = TRUE), 
              lty = 1, col = "black", se = FALSE) +
  theme_bw() +
  ggtitle("poly2") +
  guides(color = FALSE)

## Warning: The `<scale>` argument of `guides()` cannot be `FALSE`. Use "none" instead as
## of ggplot2 3.3.4.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

# Create the pc7 plot without origin legend
pc7 <- ggplot(Auto, aes(x = horsepower, y = mpg, color = origin)) +
  geom_point(alpha = 0.55) +
  stat_smooth(method = "lm", 
              formula = y ~ cut(x, 7), 
              lty = 1, col = "black", se = FALSE) +
  theme_bw() +
  ggtitle("pc7") +
  guides(color = FALSE)

# Create the nc10 plot without origin legend
nc10 <- ggplot(Auto, aes(x = horsepower, y = mpg, color = origin)) +
  geom_point(alpha = 0.55) +
  stat_smooth(method = "lm", 
              formula = y ~ ns(x, knots = c(67, 72, 80, 88, 93.5, 100, 110, 140, 157.7)), 
              col = "black", se = FALSE) +
  theme_bw() +
  ggtitle("nc10") +
  guides(color = FALSE)

# Arrange the plots in a grid
grid.arrange(poly2, pc7, nc10, ncol = 3)

Berdasarkan rmse dan mae

Data Amerika

Regresi Polinomial

Cross Validation

set.seed(123)
cross_val_amerika <- vfold_cv(AutoAmerika,v=10,strata = "mpg")

ordo <- 2:10

best_poly_amerika <- map_dfr(ordo, function(i) {
  metric_poly_amerika <- map_dfr(cross_val_amerika$splits,
                            function(x) {
                              mod <- lm(mpg ~ poly(horsepower,ordo=i),
                                        data=AutoAmerika[x$in_id,])
                              pred <- predict(mod, newdata=AutoAmerika[-x$in_id,])
                              truth <- AutoAmerika[-x$in_id,]$mpg
                              rmse <- mlr3measures::rmse(truth = truth, response = pred)
                              mae <- mlr3measures::mae(truth = truth, response = pred)
                              metric <- c(rmse,mae)
                              names(metric) <- c("rmse","mae")
                              return(metric)
                              }
                            )
  metric_poly_amerika

# menghitung rata-rata untuk 10 folds
  mean_metric_poly_amerika <- colMeans(metric_poly_amerika)
  mean_metric_poly_amerika
  }
)

best_poly_amerika <- cbind(ordo=ordo,best_poly_amerika)
best_poly_amerika

##   ordo     rmse      mae
## 1    2 3.799433 2.887688
## 2    3 3.804542 2.901911
## 3    4 3.823584 2.900848
## 4    5 3.808271 2.892205
## 5    6 3.832598 2.912551
## 6    7 3.781019 2.853644
## 7    8 3.791754 2.866697
## 8    9 3.890987 2.898198
## 9   10 4.021537 2.953468

Berdasarkan rmse dan mae

best_poly_amerika %>% slice_min(rmse)

##   ordo     rmse      mae
## 1    7 3.781019 2.853644

best_poly_amerika %>% slice_min(rmse)

##   ordo     rmse      mae
## 1    7 3.781019 2.853644

Plot CV

best_poly_amerika$ordo <- as.factor(best_poly_amerika$ordo)
ggplot(data=best_poly_amerika, aes(x=ordo, y=rmse, group=1)) +
  geom_line(linetype = "dashed")+
  geom_point()+
  ggtitle("K-Fold Cross Validation") +
  xlab("Degree of Polynomial") + 
  ylab("Root Mean Square Error")

Plot

H <- ggplot(AutoAmerika,aes(x=horsepower, y=mpg)) + 
  geom_point(alpha=0.55, color="red") +
  stat_smooth(method = "lm", 
              formula = y~poly(x,2,raw=T), 
              lty = 1, col = "black",se = F)+
  theme_bw()+
  ggtitle("Regresi Polinomial Ordo 2") +
  xlab("Horse Power") + 
  ylab("mile per gallon")  

I <- ggplot(AutoAmerika,aes(x=horsepower, y=mpg)) + 
  geom_point(alpha=0.55, color="red") +
  stat_smooth(method = "lm", 
              formula = y~poly(x,7,raw=T), 
              lty = 1, col = "black",se = F)+
  theme_bw()+
  ggtitle("Regresi Polinomial Ordo 7") +
  xlab("Horse Power") + 
  ylab("mile per gallon")

ggarrange(H, I, ncol = 2, nrow = 1)

Summary Model

polinom_amerika = lm(mpg ~ poly(horsepower,2))
summary(polinom_amerika)

## 
## Call:
## lm(formula = mpg ~ poly(horsepower, 2))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -14.7135  -2.5943  -0.0859   2.2868  15.8961 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)            23.4459     0.2209  106.13   <2e-16 ***
## poly(horsepower, 2)1 -120.1377     4.3739  -27.47   <2e-16 ***
## poly(horsepower, 2)2   44.0895     4.3739   10.08   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.374 on 389 degrees of freedom
## Multiple R-squared:  0.6876, Adjusted R-squared:  0.686 
## F-statistic:   428 on 2 and 389 DF,  p-value: < 2.2e-16

Piecewise Constant

Cross Validation

set.seed(123)
cross_val_amerika <- vfold_cv(AutoAmerika,v=10,strata = "mpg")

breaks <- 2:10

best_tangga_amerika <- map_dfr(breaks, function(i){
  metric_tangga_amerika <- map_dfr(cross_val_amerika$splits,
                           function(x){
                             training <- AutoAmerika[x$in_id,]
                             training$horsepower <- cut(training$horsepower,i)
                             mod <- lm(mpg ~ horsepower, data=training)
                             labs_x <- levels(mod$model[,2])
                             labs_x_breaks <- cbind(lower = as.numeric( sub("\\((.+),.*", "\\1", labs_x) ),
                                                    upper = as.numeric( sub("[^,]*,([^]]*)\\]", "\\1", labs_x)))
                             testing <- AutoAmerika[-x$in_id,]
                             horsepower_new <- cut(testing$horsepower,c(labs_x_breaks[1,1],labs_x_breaks[,2]))
                             pred <- predict(mod,   newdata=list(horsepower=horsepower_new))
                             truth <- testing$mpg
                             data_eval <- na.omit(data.frame(truth,pred))
                             rmse <- mlr3measures::rmse(truth = data_eval$truth,response = data_eval$pred)
                             mae <- mlr3measures::mae(truth = data_eval$truth,response = data_eval$pred)
                             metric <- c(rmse,mae)
                             names(metric) <- c("rmse","mae")
                             return(metric)
                             }
                           )

metric_tangga_amerika

# menghitung rata-rata untuk 10 folds
mean_metric_tangga_amerika <- colMeans(metric_tangga_amerika)

mean_metric_tangga_amerika
})
best_tangga_amerika <- cbind(breaks=breaks,best_tangga_amerika)
best_tangga_amerika

##   breaks     rmse      mae
## 1      2 4.997476 3.818722
## 2      3 4.759908 3.710652
## 3      4 4.139362 3.076968
## 4      5 3.966889 3.067326
## 5      6 4.222039 3.206859
## 6      7 4.150320 3.162726
## 7      8 3.945534 2.949718
## 8      9 3.737522 2.856986
## 9     10 3.816464 2.963023

Berdasarkan rmse dan mae

best_tangga_amerika %>% slice_min(rmse)

##   breaks     rmse      mae
## 1      9 3.737522 2.856986

best_tangga_amerika %>% slice_min(mae)

##   breaks     rmse      mae
## 1      9 3.737522 2.856986

Plot CV

best_tangga_amerika$breaks <- as.factor(best_tangga_amerika$breaks)

ggplot(data = best_tangga_amerika, aes(x = breaks, group = 1)) +
  geom_line(aes(y = rmse), linetype = "dashed", color = "blue") +
  geom_point(aes(y = rmse), color = "blue") +
  geom_line(aes(y = mae), linetype = "dashed", color = "red") +
  geom_point(aes(y = mae), color = "red") +
  ggtitle("K-Fold Cross Validation") +
  xlab("Breaks") +
  ylab("Metric Value")

Plot

pc9<- ggplot(AutoAmerika,aes(x=horsepower, y=mpg)) +
                 geom_point(alpha=0.55, color="red") +
  stat_smooth(method = "lm", 
              formula = y~cut(x,9), 
              lty = 1, col = "black",se = F)+
  theme_bw()+
  ggtitle("K=9") +
  xlab("Horse Power") + ylab("mile per gallon")

ggarrange(pc9, ncol = 1, nrow = 1)

Summary Model

tangga_amerika = lm(mpg ~ cut(horsepower,9))
summary(tangga_amerika)

## 
## Call:
## lm(formula = mpg ~ cut(horsepower, 9))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -14.0593  -2.8829  -0.0593   2.4977  16.2133 
## 
## Coefficients:
##                               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                    34.3711     0.7379  46.577  < 2e-16 ***
## cut(horsepower, 9)(66.4,86.9]  -5.3118     0.8580  -6.191 1.54e-09 ***
## cut(horsepower, 9)(86.9,107]  -11.4882     0.8550 -13.437  < 2e-16 ***
## cut(horsepower, 9)(107,128]   -13.7332     1.0506 -13.071  < 2e-16 ***
## cut(horsepower, 9)(128,148]   -17.8844     1.1110 -16.097  < 2e-16 ***
## cut(horsepower, 9)(148,169]   -19.8544     1.0580 -18.766  < 2e-16 ***
## cut(horsepower, 9)(169,189]   -20.5711     1.3871 -14.830  < 2e-16 ***
## cut(horsepower, 9)(189,210]   -21.8086     1.7695 -12.324  < 2e-16 ***
## cut(horsepower, 9)(210,230]   -21.2599     1.6864 -12.607  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.549 on 383 degrees of freedom
## Multiple R-squared:  0.6673, Adjusted R-squared:  0.6603 
## F-statistic:    96 on 8 and 383 DF,  p-value: < 2.2e-16

Natural Cubic Splines

Cross Validation

set.seed(123)
cross_val_amerika <- vfold_cv(AutoAmerika,v=10,strata = "mpg")

df <- 2:15

best_spline3_amerika <- map_dfr(df, function(i) {
  metric_spline3_amerika <- map_dfr(cross_val_amerika$splits,
                            function(x) {
                              mod <- lm(mpg ~ ns(horsepower,df=i), data=AutoAmerika[x$in_id,])
                              pred <- predict(mod, newdata=AutoAmerika[-x$in_id,])
                              truth <- AutoAmerika[-x$in_id,]$mpg
                              rmse <- mlr3measures::rmse(truth = truth, response = pred)
                              mae <- mlr3measures::mae(truth = truth, response = pred)
                              metric <- c(rmse,mae)
                              names(metric) <- c("rmse","mae")
                              return(metric)
                              }
                            )
  metric_spline3_amerika

# menghitung rata-rata untuk 10 folds
  mean_metric_spline3_amerika <- colMeans(metric_spline3_amerika)
  mean_metric_spline3_amerika
  }
)

best_spline3_amerika <- cbind(df=df,best_spline3_amerika)
best_spline3_amerika

##    df     rmse      mae
## 1   2 3.786961 2.889484
## 2   3 3.797055 2.892392
## 3   4 3.815970 2.903793
## 4   5 3.796834 2.852705
## 5   6 3.800473 2.869119
## 6   7 3.769022 2.814088
## 7   8 3.774172 2.816419
## 8   9 3.804854 2.843316
## 9  10 3.814011 2.838197
## 10 11 3.906193 2.898857
## 11 12 3.919305 2.885175
## 12 13 3.930376 2.888267
## 13 14 3.946494 2.921426
## 14 15 3.858799 2.880199

Berdasarkan rmse dan mae

best_spline3_amerika %>% slice_min(rmse)

##   df     rmse      mae
## 1  7 3.769022 2.814088

best_spline3_amerika %>% slice_min(mae)

##   df     rmse      mae
## 1  7 3.769022 2.814088

Plot CV

best_spline3_amerika$df <- as.factor(best_spline3_amerika$df)
ggplot(data=best_spline3_amerika, aes(x=df, y=rmse, group=1)) +
  geom_line(linetype = "dashed")+
  geom_point()+
  ggtitle("K-Fold Cross Validation") +
  xlab("DF") + ylab("Root Mean Square Error")

Knots

attr(ns(AutoAmerika$horsepower, df=7),"knots")

## 14.28571% 28.57143% 42.85714% 57.14286% 71.42857% 85.71429% 
##  82.85714  90.00000 100.00000 110.00000 145.00000 165.00000

Plot

ncs <- ggplot(AutoAmerika,aes(x=horsepower, y=mpg)) +
                 geom_point(alpha=0.55, color="red")+
    stat_smooth(method = "lm", 
               formula = y~ns(x, df=7), 
               lty = 1,col = "black", se=F)+
    theme_bw()+
    ggtitle("DF =7") +
  xlab("Horse Power") + 
  ylab("mile per gallon") +
  geom_vline(xintercept = c(82.5, 90, 100, 110, 145, 165), col="grey50", lty=2)
ncs

Summary Model

regspline_amerika = lm(mpg ~ ns(horsepower, df=7))
summary(regspline_amerika)

## 
## Call:
## lm(formula = mpg ~ ns(horsepower, df = 7))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -15.6158  -2.5868  -0.1339   2.2474  14.4829 
## 
## Coefficients:
##                         Estimate Std. Error t value Pr(>|t|)    
## (Intercept)               33.748      1.563  21.592  < 2e-16 ***
## ns(horsepower, df = 7)1   -7.888      1.694  -4.657 4.43e-06 ***
## ns(horsepower, df = 7)2   -7.844      1.923  -4.078 5.52e-05 ***
## ns(horsepower, df = 7)3  -14.226      1.812  -7.853 4.11e-14 ***
## ns(horsepower, df = 7)4  -12.527      2.077  -6.032 3.81e-09 ***
## ns(horsepower, df = 7)5  -24.049      1.715 -14.021  < 2e-16 ***
## ns(horsepower, df = 7)6  -19.387      3.664  -5.291 2.05e-07 ***
## ns(horsepower, df = 7)7  -20.803      1.761 -11.816  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.272 on 384 degrees of freedom
## Multiple R-squared:  0.7057, Adjusted R-squared:  0.7004 
## F-statistic: 131.6 on 7 and 384 DF,  p-value: < 2.2e-16

Komparasi Model

Berdasarkan Nilai RMSE dan MAE

nilai_metric_amerika <- rbind(best_poly_amerika %>% select(-1) %>% slice_min(rmse),
                      best_tangga_amerika %>% select(-1) %>% slice_min(rmse),
                      best_spline3_amerika %>%  select(-1)%>% slice_min(rmse)
                      )

nama_model_amerika <- c("Polinomial",
                        "Piecewise Constant",
                        "Natural Cubic Splines")

komparasi_amerika <- data.frame(nama_model_amerika,nilai_metric_amerika)
komparasi_amerika

##      nama_model_amerika     rmse      mae
## 1            Polinomial 3.781019 2.853644
## 2    Piecewise Constant 3.737522 2.856986
## 3 Natural Cubic Splines 3.769022 2.814088

Berdasarkan Plot

rl_A <- ggplot(AutoAmerika, aes(x = horsepower, y = mpg)) + 
  geom_point(alpha = 0.55, color = "red") +
  theme_bw() +
  labs(y = "mpg", x = "Horse Power")

poly2 <- ggplot(AutoAmerika,aes(x=horsepower, y=mpg), add=TRUE) + 
  geom_point(alpha=0.55, color="red") +
  stat_smooth(method = "lm", 
               formula = y~poly(x,2,raw=T), 
               lty = 1, col = "black",se = F)+
  theme_bw()


pc9 <- ggplot(AutoAmerika,aes(x=horsepower, y=mpg), add=TRUE) +
                 geom_point(alpha=0.55, color="red") +
  stat_smooth(method = "lm", 
               formula = y~cut(x,9), 
               lty = 1, col = "black",se = F)+
  theme_bw()

nc7 <- ggplot(AutoAmerika, aes(x = horsepower, y = mpg), add=TRUE) + 
  geom_point(alpha = 0.55, color="red") + 
  stat_smooth(method = "lm", formula = y~ns(x, knots = c(82.5, 90, 100, 110, 145, 165)), col = "black", se = F)+
  theme_bw()

plot_grid(rl_A, poly2, pc9, nc7, labels = c("rl_A", "poly2","pc9", "nc7"), label_size = 6)

Data Eropa

Regresi Polinomial

Cross Validation

set.seed(123)
cross_val_eropa <- vfold_cv(AutoEropa,v=10,strata = "mpg")

## Warning: The number of observations in each quantile is below the recommended threshold of 20.
## • Stratification will use 3 breaks instead.

ordo <- 2:10

best_poly_eropa <- map_dfr(ordo, function(i) {
  metric_poly_eropa <- map_dfr(cross_val_eropa$splits,
                            function(x) {
                              mod <- lm(mpg ~ poly(horsepower,ordo=i),
                                        data=AutoEropa[x$in_id,])
                              pred <- predict(mod, newdata=AutoEropa[-x$in_id,])
                              truth <- AutoEropa[-x$in_id,]$mpg
                              rmse <- mlr3measures::rmse(truth = truth, response = pred)
                              mae <- mlr3measures::mae(truth = truth, response = pred)
                              metric <- c(rmse,mae)
                              names(metric) <- c("rmse","mae")
                              return(metric)
                              }
                            )
  metric_poly_eropa

# menghitung rata-rata untuk 10 folds
  mean_metric_poly_eropa <- colMeans(metric_poly_eropa)
  mean_metric_poly_eropa
  }
)

best_poly_eropa <- cbind(ordo=ordo,best_poly_eropa)
best_poly_eropa

##   ordo      rmse      mae
## 1    2  4.805676 3.906728
## 2    3  4.832309 3.944713
## 3    4  4.878846 3.984634
## 4    5  5.386692 4.428764
## 5    6  5.906675 4.692453
## 6    7  6.286173 4.754299
## 7    8  6.530337 4.785631
## 8    9 16.788402 8.940515
## 9   10 12.961372 7.304674

Berdasarkan rmse dan mae

best_poly_eropa %>% slice_min(rmse)

##   ordo     rmse      mae
## 1    2 4.805676 3.906728

best_poly_eropa %>% slice_min(rmse)

##   ordo     rmse      mae
## 1    2 4.805676 3.906728

Plot CV

best_poly_eropa$ordo <- as.factor(best_poly_eropa$ordo)
ggplot(data=best_poly_eropa, aes(x=ordo, y=rmse, group=1)) +
  geom_line(linetype = "dashed")+
  geom_point()+
  ggtitle("K-Fold Cross Validation") +
  xlab("Degree of Polynomial") + 
  ylab("Root Mean Square Error")

Plot

J <- ggplot(AutoEropa,aes(x=horsepower, y=mpg)) + 
  geom_point(alpha=0.55, color="blue") +
  stat_smooth(method = "lm", 
              formula = y~poly(x,2,raw=T), 
              lty = 1, col = "black",se = F)+
  theme_bw()+
  ggtitle("Regresi Polinomial Ordo 2") +
  xlab("Horse Power") + 
  ylab("mile per gallon")  

K <- ggplot(AutoEropa,aes(x=horsepower, y=mpg)) + 
  geom_point(alpha=0.55, color="blue") +
  stat_smooth(method = "lm", 
              formula = y~poly(x,3,raw=T), 
              lty = 1, col = "black",se = F)+
  theme_bw()+
  ggtitle("Regresi Polinomial Ordo 3") +
  xlab("Horse Power") + 
  ylab("mile per gallon")

ggarrange(J, K, ncol = 2, nrow = 1)

Summary Model

polinom_eropa = lm(mpg ~ poly(horsepower,2))
summary(polinom_eropa)

## 
## Call:
## lm(formula = mpg ~ poly(horsepower, 2))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -14.7135  -2.5943  -0.0859   2.2868  15.8961 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)            23.4459     0.2209  106.13   <2e-16 ***
## poly(horsepower, 2)1 -120.1377     4.3739  -27.47   <2e-16 ***
## poly(horsepower, 2)2   44.0895     4.3739   10.08   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.374 on 389 degrees of freedom
## Multiple R-squared:  0.6876, Adjusted R-squared:  0.686 
## F-statistic:   428 on 2 and 389 DF,  p-value: < 2.2e-16

Piecewise Constant

Cross Validation

set.seed(123)
cross_val_eropa <- vfold_cv(AutoEropa,v=10,strata = "mpg")

## Warning: The number of observations in each quantile is below the recommended threshold of 20.
## • Stratification will use 3 breaks instead.

breaks <- 2:9

best_tangga_eropa <- map_dfr(breaks, function(i){
  metric_tangga_eropa <- map_dfr(cross_val_eropa$splits,
                           function(x){
                             training <- AutoEropa[x$in_id,]
                             training$horsepower <- cut(training$horsepower,i)
                             mod <- lm(mpg ~ horsepower, data=training)
                             labs_x <- levels(mod$model[,2])
                             labs_x_breaks <- cbind(lower = as.numeric( sub("\\((.+),.*", "\\1", labs_x) ),
                                                    upper = as.numeric( sub("[^,]*,([^]]*)\\]", "\\1", labs_x)))
                             testing <- AutoEropa[-x$in_id,]
                             horsepower_new <- cut(testing$horsepower,c(labs_x_breaks[1,1],labs_x_breaks[,2]))
                             pred <- predict(mod,   newdata=list(horsepower=horsepower_new))
                             truth <- testing$mpg
                             data_eval <- na.omit(data.frame(truth,pred))
                             rmse <- mlr3measures::rmse(truth = data_eval$truth,response = data_eval$pred)
                             mae <- mlr3measures::mae(truth = data_eval$truth,response = data_eval$pred)
                             metric <- c(rmse,mae)
                             names(metric) <- c("rmse","mae")
                             return(metric)
                             }
                           )

metric_tangga_eropa

# menghitung rata-rata untuk 10 folds
mean_metric_tangga_eropa <- colMeans(metric_tangga_eropa)

mean_metric_tangga_eropa
})
best_tangga_eropa <- cbind(breaks=breaks,best_tangga_eropa)
best_tangga_eropa

##   breaks     rmse      mae
## 1      2 5.462277 4.313951
## 2      3 5.395409 4.482648
## 3      4 5.226264 4.308287
## 4      5 4.963259 3.982939
## 5      6 5.133934 4.101179
## 6      7 4.949469 3.951007
## 7      8 5.116465 4.154967
## 8      9 5.225155 4.293021

Berdasarkan rmse dan mae

best_tangga_eropa %>% slice_min(rmse)

##   breaks     rmse      mae
## 1      7 4.949469 3.951007

best_tangga_eropa %>% slice_min(mae)

##   breaks     rmse      mae
## 1      7 4.949469 3.951007

Plot CV

best_tangga_eropa$breaks <- as.factor(best_tangga_eropa$breaks)

ggplot(data = best_tangga_eropa, aes(x = breaks, group = 1)) +
  geom_line(aes(y = rmse), linetype = "dashed", color = "blue") +
  geom_point(aes(y = rmse), color = "blue") +
  geom_line(aes(y = mae), linetype = "dashed", color = "red") +
  geom_point(aes(y = mae), color = "red") +
  ggtitle("K-Fold Cross Validation") +
  xlab("Breaks") +
  ylab("Metric Value")

Plot

pc5<- ggplot(AutoEropa,aes(x=horsepower, y=mpg)) +
                 geom_point(alpha=0.55, color="blue") +
  stat_smooth(method = "lm", 
              formula = y~cut(x,5), 
              lty = 1, col = "black",se = F)+
  theme_bw()+
  ggtitle("K=5") +
  xlab("Horse Power") + ylab("mile per gallon")

pc7<- ggplot(AutoEropa,aes(x=horsepower, y=mpg)) +
                 geom_point(alpha=0.55, color="blue") +
  stat_smooth(method = "lm", 
              formula = y~cut(x,7), 
              lty = 1, col = "black",se = F)+
  theme_bw()+
  ggtitle("K=7") +
  xlab("Horse Power") + ylab("mile per gallon")

ggarrange(pc5, pc7, ncol = 2, nrow = 1)

Summary Model

tangga_eropa = lm(mpg ~ cut(horsepower,7))
summary(tangga_eropa)

## 
## Call:
## lm(formula = mpg ~ cut(horsepower, 7))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -17.5280  -2.5530  -0.3758   2.2881  16.7482 
## 
## Coefficients:
##                               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                    32.5280     0.5069   64.17   <2e-16 ***
## cut(horsepower, 7)(72.3,98.6]  -6.8162     0.6358  -10.72   <2e-16 ***
## cut(horsepower, 7)(98.6,125]  -12.1523     0.7591  -16.01   <2e-16 ***
## cut(horsepower, 7)(125,151]   -16.5763     0.7957  -20.83   <2e-16 ***
## cut(horsepower, 7)(151,177]   -18.5020     1.0831  -17.08   <2e-16 ***
## cut(horsepower, 7)(177,204]   -19.4447     1.4187  -13.71   <2e-16 ***
## cut(horsepower, 7)(204,230]   -19.6280     1.5375  -12.77   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.59 on 385 degrees of freedom
## Multiple R-squared:  0.6594, Adjusted R-squared:  0.6541 
## F-statistic: 124.2 on 6 and 385 DF,  p-value: < 2.2e-16

Natural Cubic Splines

Cross Validation

set.seed(123)
cross_val_eropa <- vfold_cv(AutoEropa,v=10,strata = "mpg")

## Warning: The number of observations in each quantile is below the recommended threshold of 20.
## • Stratification will use 3 breaks instead.

df <- 2:15

best_spline3_eropa <- map_dfr(df, function(i) {
  metric_spline3_eropa <- map_dfr(cross_val_eropa$splits,
                            function(x) {
                              mod <- lm(mpg ~ ns(horsepower,df=i), data=AutoEropa[x$in_id,])
                              pred <- predict(mod, newdata=AutoEropa[-x$in_id,])
                              truth <- AutoEropa[-x$in_id,]$mpg
                              rmse <- mlr3measures::rmse(truth = truth, response = pred)
                              mae <- mlr3measures::mae(truth = truth, response = pred)
                              metric <- c(rmse,mae)
                              names(metric) <- c("rmse","mae")
                              return(metric)
                              }
                            )
  metric_spline3_eropa

# menghitung rata-rata untuk 10 folds
  mean_metric_spline3_eropa <- colMeans(metric_spline3_eropa)
  mean_metric_spline3_eropa
  }
)

best_spline3_eropa <- cbind(df=df,best_spline3_eropa)
best_spline3_eropa

##    df     rmse      mae
## 1   2 4.807144 3.901102
## 2   3 4.830632 3.946643
## 3   4 4.921306 4.013528
## 4   5 5.000935 4.056378
## 5   6 5.065292 4.049801
## 6   7 5.126662 4.096608
## 7   8 5.217695 4.177479
## 8   9 5.382172 4.306452
## 9  10 5.465415 4.374886
## 10 11 5.564501 4.422779
## 11 12 5.537463 4.408483
## 12 13 5.513853 4.343441
## 13 14 5.588090 4.477874
## 14 15 5.659180 4.506912

Berdasarkan rmse dan mae

best_spline3_eropa %>% slice_min(rmse)

##   df     rmse      mae
## 1  2 4.807144 3.901102

best_spline3_eropa %>% slice_min(mae)

##   df     rmse      mae
## 1  2 4.807144 3.901102

Plot CV

best_spline3_eropa$df <- as.factor(best_spline3_eropa$df)
ggplot(data=best_spline3_eropa, aes(x=df, y=rmse, group=1)) +
  geom_line(linetype = "dashed")+
  geom_point()+
  ggtitle("K-Fold Cross Validation") +
  xlab("DF") + ylab("Root Mean Square Error")

Knots

attr(ns(AutoEropa$horsepower, df=2),"knots")

##  50% 
## 76.5

Plot

ncs <- ggplot(AutoEropa,aes(x=horsepower, y=mpg)) +
                 geom_point(alpha=0.55, color="blue")+
    stat_smooth(method = "lm", 
               formula = y~ns(x, df=2), 
               lty = 1,col = "black", se=F)+
    theme_bw()+
    ggtitle("DF =2") +
  xlab("Horse Power") + 
  ylab("mile per gallon") +
  geom_vline(xintercept = c(76.5), col="black", lty=2)
ncs

Summary Model

regspline_eropa = lm(mpg ~ ns(horsepower, df=2))
summary(regspline_eropa)

## 
## Call:
## lm(formula = mpg ~ ns(horsepower, df = 2))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -14.8454  -2.4171  -0.1837   2.2563  15.8921 
## 
## Coefficients:
##                         Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              38.3090     0.6779   56.52   <2e-16 ***
## ns(horsepower, df = 2)1 -41.4766     1.5831  -26.20   <2e-16 ***
## ns(horsepower, df = 2)2 -16.0003     1.2308  -13.00   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.354 on 389 degrees of freedom
## Multiple R-squared:  0.6904, Adjusted R-squared:  0.6888 
## F-statistic: 433.7 on 2 and 389 DF,  p-value: < 2.2e-16

Komparasi Model

Berdasarkan Nilai RMSE dan MAE

nilai_metric_eropa <- rbind(best_poly_eropa %>% select(-1) %>% slice_min(rmse),
                      best_tangga_eropa %>% select(-1) %>% slice_min(rmse),
                      best_spline3_eropa %>%  select(-1)%>% slice_min(rmse)
                      )

nama_model_eropa <- c("Polinomial",
                        "Piecewise Constant",
                        "Natural Cubic Splines")

komparasi_eropa <- data.frame(nama_model_eropa,nilai_metric_eropa)
komparasi_eropa

##        nama_model_eropa     rmse      mae
## 1            Polinomial 4.805676 3.906728
## 2    Piecewise Constant 4.949469 3.951007
## 3 Natural Cubic Splines 4.807144 3.901102

Berdasarkan Plot

rl_E <- ggplot(AutoEropa, aes(x = horsepower, y = mpg)) + 
  geom_point(alpha = 0.55, color = "blue") +
  theme_bw() +
  labs(y = "mpg", x = "Horse Power")

poly2 <- ggplot(AutoEropa,aes(x=horsepower, y=mpg), add=TRUE) + 
  geom_point(alpha=0.55, color="blue") +
  stat_smooth(method = "lm", 
               formula = y~poly(x,2,raw=T), 
               lty = 1, col = "black",se = F)+
  theme_bw()


pc7 <- ggplot(AutoEropa,aes(x=horsepower, y=mpg), add=TRUE) +
                 geom_point(alpha=0.55, color="blue") +
  stat_smooth(method = "lm", 
               formula = y~cut(x,7), 
               lty = 1, col = "black",se = F)+
  theme_bw()

nc2 <- ggplot(AutoEropa, aes(x = horsepower, y = mpg), add=TRUE) + 
  geom_point(alpha = 0.55, color="blue") + 
  stat_smooth(method = "lm", formula = y~ns(x, knots = c(76.5)), col = "black", se = F)+
  theme_bw()

plot_grid(rl_E, poly2, pc7, nc2, labels = c("rl_E", "poly2","pc7", "nc2"), label_size = 6)

Data Jepang

Regresi Polinomial

Cross Validation

set.seed(123)
cross_val_jepang <- vfold_cv(AutoJepang,v=10,strata = "mpg")

## Warning: The number of observations in each quantile is below the recommended threshold of 20.
## • Stratification will use 3 breaks instead.

ordo <- 2:10

best_poly_jepang <- map_dfr(ordo, function(i) {
  metric_poly_jepang <- map_dfr(cross_val_jepang$splits,
                            function(x) {
                              mod <- lm(mpg ~ poly(horsepower,ordo=i),
                                        data=AutoJepang[x$in_id,])
                              pred <- predict(mod, newdata=AutoJepang[-x$in_id,])
                              truth <- AutoJepang[-x$in_id,]$mpg
                              rmse <- mlr3measures::rmse(truth = truth, response = pred)
                              mae <- mlr3measures::mae(truth = truth, response = pred)
                              metric <- c(rmse,mae)
                              names(metric) <- c("rmse","mae")
                              return(metric)
                              }
                            )
  metric_poly_jepang

# menghitung rata-rata untuk 10 folds
  mean_metric_poly_jepang <- colMeans(metric_poly_jepang)
  mean_metric_poly_jepang
  }
)

best_poly_jepang <- cbind(ordo=ordo,best_poly_jepang)
best_poly_jepang

##   ordo      rmse       mae
## 1    2  4.522705  3.624496
## 2    3  3.962162  3.024320
## 3    4  4.380986  3.311081
## 4    5  3.963580  3.013920
## 5    6  5.441724  3.816592
## 6    7  8.538692  5.108307
## 7    8 12.147461  6.621785
## 8    9 18.314340  9.158679
## 9   10 33.718613 15.414116

Berdasarkan rmse dan mae

best_poly_jepang %>% slice_min(rmse)

##   ordo     rmse     mae
## 1    3 3.962162 3.02432

best_poly_jepang %>% slice_min(rmse)

##   ordo     rmse     mae
## 1    3 3.962162 3.02432

Plot CV

best_poly_jepang$ordo <- as.factor(best_poly_jepang$ordo)
ggplot(data=best_poly_jepang, aes(x=ordo, y=rmse, group=1)) +
  geom_line(linetype = "dashed")+
  geom_point()+
  ggtitle("K-Fold Cross Validation") +
  xlab("Degree of Polynomial") + 
  ylab("Root Mean Square Error")

Plot

M <- ggplot(AutoJepang,aes(x=horsepower, y=mpg)) + 
  geom_point(alpha=0.55, color="green") +
  stat_smooth(method = "lm", 
              formula = y~poly(x,3,raw=T), 
              lty = 1, col = "black",se = F)+
  theme_bw()+
  ggtitle("Regresi Polinomial Ordo 3") +
  xlab("Horse Power") + 
  ylab("mile per gallon")  

N <- ggplot(AutoJepang,aes(x=horsepower, y=mpg)) + 
  geom_point(alpha=0.55, color="green") +
  stat_smooth(method = "lm", 
              formula = y~poly(x,5,raw=T), 
              lty = 1, col = "black",se = F)+
  theme_bw()+
  ggtitle("Regresi Polinomial Ordo 5") +
  xlab("Horse Power") + 
  ylab("mile per gallon")

ggarrange(M, N, ncol = 2, nrow = 1)

Summary Model

polinom_jepang = lm(mpg ~ poly(horsepower,3))
summary(polinom_jepang)

## 
## Call:
## lm(formula = mpg ~ poly(horsepower, 3))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -14.7039  -2.4491  -0.1519   2.2035  15.8159 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)            23.446      0.221 106.105   <2e-16 ***
## poly(horsepower, 3)1 -120.138      4.375 -27.460   <2e-16 ***
## poly(horsepower, 3)2   44.090      4.375  10.078   <2e-16 ***
## poly(horsepower, 3)3   -3.949      4.375  -0.903    0.367    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.375 on 388 degrees of freedom
## Multiple R-squared:  0.6882, Adjusted R-squared:  0.6858 
## F-statistic: 285.5 on 3 and 388 DF,  p-value: < 2.2e-16

Piecewise Constant

Cross Validation

set.seed(123)
cross_val_jepang <- vfold_cv(AutoJepang,v=10,strata = "mpg")

## Warning: The number of observations in each quantile is below the recommended threshold of 20.
## • Stratification will use 3 breaks instead.

breaks <- 2:5

best_tangga_jepang <- map_dfr(breaks, function(i){
  metric_tangga_jepang <- map_dfr(cross_val_jepang$splits,
                           function(x){
                             training <- AutoJepang[x$in_id,]
                             training$horsepower <- cut(training$horsepower,i)
                             mod <- lm(mpg ~ horsepower, data=training)
                             labs_x <- levels(mod$model[,2])
                             labs_x_breaks <- cbind(lower = as.numeric( sub("\\((.+),.*", "\\1", labs_x) ),
                                                    upper = as.numeric( sub("[^,]*,([^]]*)\\]", "\\1", labs_x)))
                             testing <- AutoJepang[-x$in_id,]
                             horsepower_new <- cut(testing$horsepower,c(labs_x_breaks[1,1],labs_x_breaks[,2]))
                             pred <- predict(mod,   newdata=list(horsepower=horsepower_new))
                             truth <- testing$mpg
                             data_eval <- na.omit(data.frame(truth,pred))
                             rmse <- mlr3measures::rmse(truth = data_eval$truth,response = data_eval$pred)
                             mae <- mlr3measures::mae(truth = data_eval$truth,response = data_eval$pred)
                             metric <- c(rmse,mae)
                             names(metric) <- c("rmse","mae")
                             return(metric)
                             }
                           )

metric_tangga_jepang

# menghitung rata-rata untuk 10 folds
mean_metric_tangga_jepang <- colMeans(metric_tangga_jepang)

mean_metric_tangga_jepang
})
best_tangga_jepang <- cbind(breaks=breaks,best_tangga_jepang)
best_tangga_jepang

##   breaks     rmse      mae
## 1      2 4.306154 3.254086
## 2      3 4.201949 3.360962
## 3      4 4.072662 3.248551
## 4      5 4.226607 3.410855

Berdasarkan rmse dan mae

best_tangga_jepang %>% slice_min(rmse)

##   breaks     rmse      mae
## 1      4 4.072662 3.248551

best_tangga_jepang %>% slice_min(mae)

##   breaks     rmse      mae
## 1      4 4.072662 3.248551

Plot CV

best_tangga_jepang$breaks <- as.factor(best_tangga_jepang$breaks)

ggplot(data = best_tangga_jepang, aes(x = breaks, group = 1)) +
  geom_line(aes(y = rmse), linetype = "dashed", color = "blue") +
  geom_point(aes(y = rmse), color = "blue") +
  geom_line(aes(y = mae), linetype = "dashed", color = "red") +
  geom_point(aes(y = mae), color = "red") +
  ggtitle("K-Fold Cross Validation") +
  xlab("Breaks") +
  ylab("Metric Value")

Plot

pc4<- ggplot(AutoJepang,aes(x=horsepower, y=mpg)) +
                 geom_point(alpha=0.55, color="green") +
  stat_smooth(method = "lm", 
              formula = y~cut(x,4), 
              lty = 1, col = "black",se = F)+
  theme_bw()+
  ggtitle("K=4") +
  xlab("Horse Power") + ylab("mile per gallon")

ggarrange(pc4, ncol = 1, nrow = 1)

Summary Model

tangga_jepang = lm(mpg ~ cut(horsepower,4))
summary(tangga_jepang)

## 
## Call:
## lm(formula = mpg ~ cut(horsepower, 4))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -14.0533  -2.7826  -0.2927   2.8593  17.5467 
## 
## Coefficients:
##                             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                  29.0533     0.3581   81.12   <2e-16 ***
## cut(horsepower, 4)(92,138]   -8.4506     0.5946  -14.21   <2e-16 ***
## cut(horsepower, 4)(138,184] -14.2707     0.7005  -20.37   <2e-16 ***
## cut(horsepower, 4)(184,230] -16.2004     1.2647  -12.81   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.001 on 388 degrees of freedom
## Multiple R-squared:  0.5926, Adjusted R-squared:  0.5894 
## F-statistic: 188.1 on 3 and 388 DF,  p-value: < 2.2e-16

Natural Cubic Splines

Cross Validation

set.seed(123)
cross_val_jepang <- vfold_cv(AutoJepang,v=10,strata = "mpg")

## Warning: The number of observations in each quantile is below the recommended threshold of 20.
## • Stratification will use 3 breaks instead.

df <- 2:15

best_spline3_jepang <- map_dfr(df, function(i) {
  metric_spline3_jepang <- map_dfr(cross_val_jepang$splits,
                            function(x) {
                              mod <- lm(mpg ~ ns(horsepower,df=i), data=AutoJepang[x$in_id,])
                              pred <- predict(mod, newdata=AutoJepang[-x$in_id,])
                              truth <- AutoJepang[-x$in_id,]$mpg
                              rmse <- mlr3measures::rmse(truth = truth, response = pred)
                              mae <- mlr3measures::mae(truth = truth, response = pred)
                              metric <- c(rmse,mae)
                              names(metric) <- c("rmse","mae")
                              return(metric)
                              }
                            )
  metric_spline3_jepang

# menghitung rata-rata untuk 10 folds
  mean_metric_spline3_jepang <- colMeans(metric_spline3_jepang)
  mean_metric_spline3_jepang
  }
)

best_spline3_jepang <- cbind(df=df,best_spline3_jepang)
best_spline3_jepang

##    df     rmse      mae
## 1   2 4.537131 3.647191
## 2   3 4.143445 3.215053
## 3   4 4.215103 3.264447
## 4   5 4.182602 3.158549
## 5   6 4.186622 3.186509
## 6   7 4.189215 3.225135
## 7   8 4.341229 3.347164
## 8   9 4.389358 3.393529
## 9  10 4.405940 3.369006
## 10 11 4.474687 3.458585
## 11 12 4.429249 3.448116
## 12 13 4.445900 3.402477
## 13 14 4.560086 3.583991
## 14 15 4.540602 3.527475

Berdasarkan rmse dan mae

best_spline3_jepang %>% slice_min(rmse)

##   df     rmse      mae
## 1  3 4.143445 3.215053

best_spline3_jepang %>% slice_min(mae)

##   df     rmse      mae
## 1  5 4.182602 3.158549

Plot CV

best_spline3_jepang$df <- as.factor(best_spline3_jepang$df)
ggplot(data=best_spline3_jepang, aes(x=df, y=rmse, group=1)) +
  geom_line(linetype = "dashed")+
  geom_point()+
  ggtitle("K-Fold Cross Validation") +
  xlab("DF") + ylab("Root Mean Square Error")

Knots

attr(ns(AutoJepang$horsepower, df=3),"knots")

## 33.33333% 66.66667% 
##        68        90

Plot

ncs <- ggplot(AutoJepang,aes(x=horsepower, y=mpg)) +
                 geom_point(alpha=0.55, color="green")+
    stat_smooth(method = "lm", 
               formula = y~ns(x, df=3), 
               lty = 1,col = "black", se=F)+
    theme_bw()+
    ggtitle("DF =3") +
  xlab("Horse Power") + 
  ylab("mile per gallon") +
  geom_vline(xintercept = c(68, 90), col="grey50", lty=2)
ncs

Summary Model

regspline_jepang = lm(mpg ~ ns(horsepower, df=3))
summary(regspline_jepang)

## 
## Call:
## lm(formula = mpg ~ ns(horsepower, df = 3))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -14.8308  -2.4068  -0.1672   2.2524  15.9184 
## 
## Coefficients:
##                         Estimate Std. Error t value Pr(>|t|)    
## (Intercept)               38.276      1.039   36.85   <2e-16 ***
## ns(horsepower, df = 3)1  -21.311      1.054  -20.22   <2e-16 ***
## ns(horsepower, df = 3)2  -35.274      2.313  -15.25   <2e-16 ***
## ns(horsepower, df = 3)3  -19.669      1.435  -13.70   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.362 on 388 degrees of freedom
## Multiple R-squared:  0.6901, Adjusted R-squared:  0.6877 
## F-statistic:   288 on 3 and 388 DF,  p-value: < 2.2e-16

Komparasi Model

Berdasarkan Nilai RMSE dan MAE

nilai_metric_jepang <- rbind(best_poly_jepang %>% select(-1) %>% slice_min(rmse),
                      best_tangga_jepang %>% select(-1) %>% slice_min(rmse),
                      best_spline3_jepang %>%  select(-1)%>% slice_min(rmse)
                      )

nama_model_jepang <- c("Polinomial",
                       "Piecewise Constant",
                       "Natural Cubic Splines")

komparasi_jepang <- data.frame(nama_model_jepang,nilai_metric_jepang)
komparasi_jepang

##       nama_model_jepang     rmse      mae
## 1            Polinomial 3.962162 3.024320
## 2    Piecewise Constant 4.072662 3.248551
## 3 Natural Cubic Splines 4.143445 3.215053

Berdasarkan Plot

rl_J <- ggplot(AutoJepang, aes(x = horsepower, y = mpg)) + 
  geom_point(alpha = 0.55, color = "green") +
  theme_bw() +
  labs(y = "mpg", x = "Horse Power")

poly3 <- ggplot(AutoJepang,aes(x=horsepower, y=mpg), add=TRUE) + 
  geom_point(alpha=0.55, color="green") +
  stat_smooth(method = "lm", 
               formula = y~poly(x,3,raw=T), 
               lty = 1, col = "black",se = F)+
  theme_bw()


pc4 <- ggplot(AutoJepang,aes(x=horsepower, y=mpg), add=TRUE) +
                 geom_point(alpha=0.55, color="green") +
  stat_smooth(method = "lm", 
               formula = y~cut(x,3), 
               lty = 1, col = "black",se = F)+
  theme_bw()

nc3 <- ggplot(AutoJepang, aes(x = horsepower, y = mpg), add=TRUE) + 
  geom_point(alpha = 0.55, color="green") + 
  stat_smooth(method = "lm", formula = y~ns(x, knots = c(68, 90)), col = "black", se = FALSE) +
  geom_vline(xintercept = c(68, 90), col="black", lty=2)+
  theme_bw()

plot_grid(rl_J, poly3, pc4, nc3, labels = c("rl_J", "poly3","pc4", "nc3"), label_size = 6)

Tugas Sains Data_Baru

Yunia Hasnataeni

2023-11-23

SOAL

Library

Eksplorasi Data

Data Keseluruhan

Data Masing-Masing Negara

Metode Analisis

Data Keseluruhan

Regresi Polinomial

Piecewise Constant

Natural Cubic Splines

Komparasi Model

Data Amerika

Regresi Polinomial

Piecewise Constant

Natural Cubic Splines

Komparasi Model

Data Eropa

Regresi Polinomial

Piecewise Constant

Natural Cubic Splines

Komparasi Model

Data Jepang

Regresi Polinomial

Piecewise Constant

Natural Cubic Splines

Komparasi Model