Beyond Linearity

library(ISLR)
library(rsample)
library(splines)
library(tidyverse)
library(ggpubr)
library(DT)
attach(Auto)

Plot Awal

ggplot(Auto,aes(x=horsepower, y=mpg)) +
  geom_point(alpha=0.55, color="purple") +
  stat_smooth(method = "lm", 
              formula = y~x,lty = 1,
              col = "black",se = F)+
  theme_bw()

Regresi Polinomial

Cross Validation

set.seed(123)
cross_val <- vfold_cv(Auto,v=10,strata = "mpg")

ordo <- 2:10

best_poly <- map_dfr(ordo, function(i) {
  metric_poly <- map_dfr(cross_val$splits,
                            function(x) {
                              mod <- lm(mpg ~ poly(horsepower,ordo=i), data=Auto[x$in_id,])
                              pred <- predict(mod, newdata=Auto[-x$in_id,])
                              truth <- Auto[-x$in_id,]$mpg
                              rmse <- mlr3measures::rmse(truth = truth, response = pred)
                              mae <- mlr3measures::mae(truth = truth, response = pred)
                              metric <- c(rmse,mae)
                              names(metric) <- c("rmse","mae")
                              return(metric)
                              }
                            )
  metric_poly

# menghitung rata-rata untuk 10 folds
  mean_metric_poly <- colMeans(metric_poly)
  mean_metric_poly
  }
)

best_poly <- cbind(ordo=ordo,best_poly)

Plot CV

best_poly$ordo <- as.factor(best_poly$ordo)
ggplot(data=best_poly, aes(x=ordo, y=rmse, group=1)) +
  geom_line(linetype = "dashed")+
  geom_point()+
  ggtitle("K-Fold Cross Validation") +
  xlab("Degree of Polynomial") + 
  ylab("Root Mean Square Error")

Plot

A <- ggplot(Auto,aes(x=horsepower, y=mpg)) + 
  geom_point(alpha=0.55, color="purple") +
  stat_smooth(method = "lm", 
              formula = y~poly(x,2,raw=T), 
              lty = 1, col = "black",se = F)+
  theme_bw()+
  ggtitle("Regresi Polinomial Ordo 2") +
  xlab("Horse Power") + 
  ylab("mile per gallon")  

B <- ggplot(Auto,aes(x=horsepower, y=mpg)) + 
  geom_point(alpha=0.55, color="purple") +
  stat_smooth(method = "lm", 
              formula = y~poly(x,7,raw=T), 
              lty = 1, col = "black",se = F)+
  theme_bw()+
  ggtitle("Regresi Polinomial Ordo 7") +
  xlab("Horse Power") + 
  ylab("mile per gallon")

ggarrange(A, B, ncol = 2, nrow = 1)

Jika dilihat berdasarkan plot, meskipun nilai RMSE Regresi Polinomial ordo 7 lebih kecil daripada ordo 2, model terbaik yang dipilih adalah Regresi Polinomial ordo 2. Karena untuk ordo 7 pada bagian ujung ujung plot cenderung tidak stabil.

Summary Model

polinom = lm(mpg ~ poly(horsepower,2))
summary(polinom)

Call:
lm(formula = mpg ~ poly(horsepower, 2))

Residuals:
     Min       1Q   Median       3Q      Max 
-14.7135  -2.5943  -0.0859   2.2868  15.8961 

Coefficients:
                      Estimate Std. Error t value Pr(>|t|)    
(Intercept)            23.4459     0.2209  106.13   <2e-16 ***
poly(horsepower, 2)1 -120.1377     4.3739  -27.47   <2e-16 ***
poly(horsepower, 2)2   44.0895     4.3739   10.08   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 4.374 on 389 degrees of freedom
Multiple R-squared:  0.6876,    Adjusted R-squared:  0.686 
F-statistic:   428 on 2 and 389 DF,  p-value: < 2.2e-16

Piecewise Constant

Cross Validation

set.seed(123)
cross_val <- vfold_cv(Auto,v=10,strata = "mpg")

breaks <- 2:20

best_tangga <- map_dfr(breaks, function(i){
  metric_tangga <- map_dfr(cross_val$splits,
                           function(x){
                             training <- Auto[x$in_id,]
                             training$horsepower <- cut(training$horsepower,i)
                             mod <- lm(mpg ~ horsepower, data=training)
                             labs_x <- levels(mod$model[,2])
                             labs_x_breaks <- cbind(lower = as.numeric( sub("\\((.+),.*", "\\1", labs_x) ),
                                                    upper = as.numeric( sub("[^,]*,([^]]*)\\]", "\\1", labs_x)))
                             testing <- Auto[-x$in_id,]
                             horsepower_new <- cut(testing$horsepower,c(labs_x_breaks[1,1],labs_x_breaks[,2]))
                             pred <- predict(mod,   newdata=list(horsepower=horsepower_new))
                             truth <- testing$mpg
                             data_eval <- na.omit(data.frame(truth,pred))
                             rmse <- mlr3measures::rmse(truth = data_eval$truth,response = data_eval$pred)
                             mae <- mlr3measures::mae(truth = data_eval$truth,response = data_eval$pred)
                             metric <- c(rmse,mae)
                             names(metric) <- c("rmse","mae")
                             return(metric)
                             }
                           )

metric_tangga

# menghitung rata-rata untuk 10 folds
mean_metric_tangga <- colMeans(metric_tangga)

mean_metric_tangga
})
best_tangga <- cbind(breaks=breaks,best_tangga)

Plot CV

best_tangga$breaks <- as.factor(best_tangga$breaks)
ggplot(data=best_tangga, aes(x=breaks, y=rmse, group=1)) +
  geom_line(linetype = "dashed")+
  geom_point()+
  ggtitle("K-Fold Cross Validation") +
  xlab("Breaks") + 
  ylab("Root Mean Square Error")

best_tangga$breaks <- as.factor(best_tangga$breaks)
ggplot(data=best_tangga, aes(x=breaks, y=mae, group=1)) +
  geom_line(linetype = "dashed")+
  geom_point()+
  ggtitle("K-Fold Cross Validation") +
  xlab("Breaks") + 
  ylab("MAE")

Plot

pc8<- ggplot(Auto,aes(x=horsepower, y=mpg)) +
                 geom_point(alpha=0.55, color="purple") +
  stat_smooth(method = "lm", 
              formula = y~cut(x,8), 
              lty = 1, col = "black",se = F)+
  theme_bw()+
  ggtitle("K=8") +
  xlab("Horse Power") + ylab("mile per gallon")

pc15<- ggplot(Auto,aes(x=horsepower, y=mpg)) +
                 geom_point(alpha=0.55, color="purple") +
  stat_smooth(method = "lm", 
              formula = y~cut(x,15), 
              lty = 1, col = "black",se = F)+
  theme_bw()+
  ggtitle("K=15") +
  xlab("Horse Power") + ylab("mile per gallon")

ggarrange(pc8, pc15, ncol = 2, nrow = 1)

Summary Model

tangga = lm(mpg ~ cut(horsepower,15))
summary(tangga)

Call:
lm(formula = mpg ~ cut(horsepower, 15))

Residuals:
     Min       1Q   Median       3Q      Max 
-13.0137  -2.5309  -0.3381   2.2360  14.9800 

Coefficients:
                               Estimate Std. Error t value Pr(>|t|)    
(Intercept)                     34.1800     1.1077  30.857  < 2e-16 ***
cut(horsepower, 15)(58.3,70.5]  -0.6514     1.2472  -0.522    0.602    
cut(horsepower, 15)(70.5,82.8]  -6.1663     1.2601  -4.894 1.47e-06 ***
cut(horsepower, 15)(82.8,95.1]  -9.4160     1.1974  -7.864 3.97e-14 ***
cut(horsepower, 15)(95.1,107]  -13.2778     1.2756 -10.409  < 2e-16 ***
cut(horsepower, 15)(107,120]   -13.1421     1.3644  -9.632  < 2e-16 ***
cut(horsepower, 15)(120,132]   -16.8400     1.5665 -10.750  < 2e-16 ***
cut(horsepower, 15)(132,144]   -16.4600     1.5665 -10.507  < 2e-16 ***
cut(horsepower, 15)(144,156]   -19.3439     1.3184 -14.672  < 2e-16 ***
cut(horsepower, 15)(156,169]   -20.3425     1.8782 -10.831  < 2e-16 ***
cut(horsepower, 15)(169,181]   -20.3800     1.5665 -13.010  < 2e-16 ***
cut(horsepower, 15)(181,193]   -21.0550     2.4141  -8.722  < 2e-16 ***
cut(horsepower, 15)(193,205]   -21.8467     2.7133  -8.052 1.08e-14 ***
cut(horsepower, 15)(205,218]   -22.3800     2.2154 -10.102  < 2e-16 ***
cut(horsepower, 15)(218,230]   -20.1800     2.2154  -9.109  < 2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 4.29 on 377 degrees of freedom
Multiple R-squared:  0.7087,    Adjusted R-squared:  0.6979 
F-statistic: 65.51 on 14 and 377 DF,  p-value: < 2.2e-16

Natural Cubic Splines

Cross Validation

set.seed(123)
cross_val <- vfold_cv(Auto,v=10,strata = "mpg")

df <- 2:15

best_spline3 <- map_dfr(df, function(i) {
  metric_spline3 <- map_dfr(cross_val$splits,
                            function(x) {
                              mod <- lm(mpg ~ ns(horsepower,df=i), data=Auto[x$in_id,])
                              pred <- predict(mod, newdata=Auto[-x$in_id,])
                              truth <- Auto[-x$in_id,]$mpg
                              rmse <- mlr3measures::rmse(truth = truth, response = pred)
                              mae <- mlr3measures::mae(truth = truth, response = pred)
                              metric <- c(rmse,mae)
                              names(metric) <- c("rmse","mae")
                              return(metric)
                              }
                            )
  metric_spline3

# menghitung rata-rata untuk 10 folds
  mean_metric_spline3 <- colMeans(metric_spline3)
  mean_metric_spline3
  }
)

best_spline3 <- cbind(df=df,best_spline3)

Plot CV

best_spline3$df <- as.factor(best_spline3$df)
ggplot(data=best_spline3, aes(x=df, y=rmse, group=1)) +
  geom_line(linetype = "dashed")+
  geom_point()+
  ggtitle("K-Fold Cross Validation") +
  xlab("DF") + ylab("Root Mean Square Error")

Plot

ncs <- ggplot(Auto,aes(x=horsepower, y=mpg)) +
                 geom_point(alpha=0.55, color="purple")+
    stat_smooth(method = "lm", 
               formula = y~ns(x, df=10), 
               lty = 1,col = "black", se=F)+
    theme_bw()+
    ggtitle("DF =10") +
  xlab("Horse Power") + 
  ylab("mile per gallon") +
  geom_vline(xintercept = c(67, 72, 80, 88, 93.5, 100, 110, 140, 157.7), col="grey50", lty=2)
ncs

Knots

attr(ns(Auto$horsepower, df=10),"knots")
  10%   20%   30%   40%   50%   60%   70%   80%   90% 
 67.0  72.0  80.0  88.0  93.5 100.0 110.0 140.0 157.7 

Summary Model

regspline = lm(mpg ~ ns(horsepower, df=10))
summary(regspline)

Call:
lm(formula = mpg ~ ns(horsepower, df = 10))

Residuals:
     Min       1Q   Median       3Q      Max 
-14.5003  -2.4956  -0.2086   2.2577  14.6400 

Coefficients:
                          Estimate Std. Error t value Pr(>|t|)    
(Intercept)                 32.462      1.680  19.324  < 2e-16 ***
ns(horsepower, df = 10)1    -4.600      1.789  -2.572 0.010495 *  
ns(horsepower, df = 10)2    -3.007      2.555  -1.177 0.239996    
ns(horsepower, df = 10)3    -8.663      2.043  -4.239 2.82e-05 ***
ns(horsepower, df = 10)4    -7.170      2.188  -3.278 0.001142 ** 
ns(horsepower, df = 10)5   -14.722      2.125  -6.927 1.84e-11 ***
ns(horsepower, df = 10)6    -8.446      2.467  -3.424 0.000684 ***
ns(horsepower, df = 10)7   -16.947      2.080  -8.148 5.38e-15 ***
ns(horsepower, df = 10)8   -21.715      1.891 -11.483  < 2e-16 ***
ns(horsepower, df = 10)9   -14.384      4.116  -3.494 0.000531 ***
ns(horsepower, df = 10)10  -22.326      1.946 -11.476  < 2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 4.24 on 381 degrees of freedom
Multiple R-squared:  0.7124,    Adjusted R-squared:  0.7049 
F-statistic: 94.39 on 10 and 381 DF,  p-value: < 2.2e-16

Komparasi Model

Berdasarkan Nilai RMSE dan MAE

nilai_metric <- rbind(mean_metric_reg,
                      best_poly %>% select(-1) %>% slice_min(rmse),
                      best_tangga %>% select(-1) %>% slice_min(rmse),
                      best_spline3 %>%  select(-1)%>% slice_min(rmse)
                      )

nama_model <- c("Regresi",
                "Polinomial",
                "Piecewise Constant",
                "Natural Cubic Splines")

komparasi <- data.frame(nama_model,nilai_metric)

Berdasarkan Plot

Subset Data

Amerika

Plot Data

Komparasi Model Berdasarkan Nilai RMSE dan MAE

Komparasi Model Berdasarkan Plot

Eropa

Plot Data

Komparasi Model Berdasarkan Nilai RMSE dan MAE

Komparasi Model Berdasarkan Plot

Jepang

Plot Data

    rmse      mae 
4.454860 3.568692 

Komparasi Model Berdasarkan Nilai RMSE dan MAE

Komparasi Model Berdasarkan Plot

Referensi

Dito, G.A. (April 14, 2021). Regresi Polinomial, Regresi Fungsi Tangga dan Regresi Spline di R. Retrieved from https://gerrydito.github.io/Regresi-Polinomial,-Regresi-Fungsi-Tangga-dan-Regresi-Spline/

James, Gareth. et.al. (2013). An Introduction to Statistical Learning With Aplication in R. New York: Springer.

Sartono, B.. (November 1, 2020). Moving Beyond Linearity. Retrieved from https://rpubs.com/bagusco/taklinear