library(ISLR)
library(rsample)
library(splines)
library(tidyverse)
library(ggpubr)
library(caret)
library(DT)
library(corrplot)
library(gridExtra)
library(kableExtra)
library(magick)

Pendahuluan

Data yang digunakan merupakan data Auto dari package ISLR. Data Auto merupakan data yang fokusnya adalah untuk memprediksi konsumsi bahan bakar kendaraan dalam satuan mile per galon (mpg). Terdapat 392 observasi (sebelumnya 398 observasi, namun terdapat 6 data missing value yang telah didrop), dan 9 variabel. Berikut merupakan peubah dan tipe datanya :

  1. mpg : continuous
  2. cylinders: multi-valued discrete
  3. displacement: continuous
  4. horsepower: continuous
  5. weight: continuous
  6. acceleration: continuous
  7. model year: multi-valued discrete
  8. origin: multi-valued discrete
  9. car name: string (unique for each instance)

Berikut merupakan tampilan dari data Auto

df <- ISLR::Auto
datatable(df)

Visualisasi Pola Hubungan

Berikut ini merupakan visualisasi pola hubungan peubah yang terdapat pada data Auto

plot(df)

Berdasarkan visualisasi diatas khususnya pada peubah mpg, kita bisa melihat antara mpg dan peubah lainnya memiliki hubungan yang tidak linear.

g1 <- ggplot(df, aes(x = factor(cylinders), y = mpg, fill=factor(cylinders))) +
  geom_boxplot()

g2 <- ggplot(df, aes(x = displacement, y = mpg)) +
  geom_point(alpha = 0.5) +
  geom_smooth()

g3 <- ggplot(df, aes(x = horsepower, y = mpg)) +
  geom_point(alpha = 0.5) +
  geom_smooth()

g4 <- ggplot(df, aes(x = weight, y = mpg)) +
  geom_point(alpha = 0.5) +
  geom_smooth()

g5 <- ggplot(df, aes(x = acceleration, y = mpg)) +
  geom_point(alpha = 0.5) +
  geom_smooth()

g6 <- ggplot(df, aes(x = year, y = mpg)) +
  geom_point(alpha = 0.5) +
  geom_smooth()

g7 <- ggplot(Auto,aes(y = mpg, x = factor(origin), fill=factor(origin))) + 
    geom_boxplot() +
    ylab("mpg") + xlab("Origin") +
    scale_fill_discrete(name="Origin", labels=c("Amerika","Eropa","Jepang")) +
  theme_bw()

gridExtra::grid.arrange (g1, g2, g3, g4, g5, g6, g7, ncol = 2)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

Jika kita tampilkan hubungan mpg dengan peubah lainnya dengan menambahkan unsur geom_smooth(), kita bisa lebih jelas melihat bahwa pola hubungan yang terjadi antara mpg dan peubah prediktornya adalah hubungan yang tidak linear.

Korelasi Antar Peubah

correlations = cor(df[-9])
corrplot(correlations)

Apabila kita lihat nilai korelasinya, peubah yang memiliki korelasi yang kuat dengan mpg adalah peubah cylinders, displacements, horsepower dan weight. Keempat peubah prediktor tersebut memiliki korelasi yang negatif terhadap mpg.

Pada data yang memiliki pola hubungan yang tidak linear, jika pemodelan dengan regresi linear akan menjadi tidak efisien, karena tujuan dari meminimumkan error akan tidak tercapai. Sehingga dalam prakteknya diperlukan suatu metode pemodelan tak linear. Pendugaan model bertujuan sebagai explanatory analysis atau untuk melihat pola hubungan yang terjadi pada peubah respon dan prediktor.

Pada analisis ini, peubah yang digunakan sebagai respon adalah mpg dan peubah prediktor nya adalah weight, displacement, acceleration dan horsepower.

Agar lebih tergambar jelas, berikut disajikan plot pasangan antara mpg dengan dengan masing - masing peubah prediktornya.

#Plot Mpg dan Weight
a <- ggplot(df, aes(x = weight, y =mpg)) + 
  geom_point(alpha=0.55, color= "#00CC99") +
  geom_smooth()+
  ggtitle("Plot Mpg dan Weight")+
   theme_bw()+
 theme(plot.title = element_text(size=10))

#Plot Mpg dan displacement
b <- ggplot(df, aes(x = displacement, y =mpg)) + 
  geom_point(alpha=0.55, color= "#00CC99") +
  geom_smooth()+
  ggtitle("Plot Mpg dan displacement")+
   theme_bw()+
    theme(plot.title = element_text(size=10))

# Plot Mpg dan horsepower
c <- ggplot(df, aes(x = horsepower, y =mpg)) + 
   geom_point(alpha=0.55, color= "#00CC99") +
   geom_smooth()+
  ggtitle("Plot Mpg dan horsepower")+
    theme_bw()+
    theme(plot.title = element_text(size=10))


#Plot Mpg dan acceleration
d <- ggplot(df, aes(x = acceleration, y =mpg)) + 
  geom_point(alpha=0.55, color= "#00CC99") +
  geom_smooth()+
   ggtitle("Plot Mpg dan acceleration")+
   theme_bw()+
    theme(plot.title = element_text(size=10))


ggarrange(a, b, c, d, ncol = 2, nrow=2)

Visualisasi pola hubungan antara mpg dengan weight, displacement, acceleration dan horsepower Berdasarkan scatterplot diatas menunjukkan bahwa pola hubungan yang terbentuk tidak linear sehingga akan dilakukan pemodelan tak linear yang dapat merepresentasikan pola hubungan tersebut.

Pemodelan Tak Linear

Pemodelan tak linear yang akan dicobakan yaitu :

A. Polynomial Regression

Derajat polinomial yang dianalisis pada derajat (d) 1,2, 3 dan 4 saja, karena pada regresi polynomial, derajat yang digunakan biasanya maksimal sebanyak 4. Jika d > 4 maka kurva yang terbentuk akan kurang beraturan terutama pada bagian ujung-ujungnya.

B. Picewise Constant

C. Basic Spline

D. Natural Spline

Agar mendapatkan nilai turning parameter optimum pada setiap model, maka dilakukan Cross Validation (CV). Tipe CV yang digunakana yakni k -folds dengan k = 10 dan ulangan sebanyak 3 kali. Selanjutnya, pemilihan model terbaik pada setiap pasangan mpg dan prediktornya didasari oleh kriteria nilai RMSE dan/atau MAE terkecil.

A. MPG Vs Weight

Polynomial regression

Secara Visual

g1 <- ggplot(df, aes(x = weight, y = mpg)) + 
  geom_point(alpha = 0.55, color= "#00CC99") + 
  geom_smooth(method = "lm", formula = "y ~ poly(x, 1, raw = T)") + 
  labs(title = "Polynomial Degree = 1")+
  theme_bw()+ theme(plot.title = element_text(size=9))

g2 <- ggplot(df, aes(x = weight, y = mpg)) + 
  geom_point(alpha = 0.55, color= "#00CC99") + 
  geom_smooth(method = "lm", formula = "y ~ poly(x, 2, raw = T)") + 
  labs(title = "Polynomial Degree = 2")+
  theme_bw()+ theme(plot.title = element_text(size=9))

g3 <- ggplot(df, aes(x = weight, y = mpg)) + 
  geom_point(alpha = 0.55, color= "#00CC99") + 
  geom_smooth(method = "lm", formula = "y ~ poly(x, 3, raw = T)") + 
  labs(title = "Polynomial Degree = 3")+
  theme_bw()+ theme(plot.title = element_text(size=9))

g4 <- ggplot(df, aes(x = weight, y = mpg)) + 
  geom_point(alpha = 0.55, color= "#00CC99") + 
  geom_smooth(method = "lm", formula = "y ~ poly(x, 4, raw = T)") + 
  labs(title = "Polynomial Degree = 4")+
  theme_bw() +
  theme(plot.title = element_text(size=9))


ggarrange(g1, g2, g3, g4, ncol = 2, nrow=2)

Secara visual, model polynomial berderajat 2 dan 3 terlihat lebih dapat menggambarkan pola hubungan data antara mpg dan weight dibandingkan model polynomial lainnya.

Secara Empiris

#Fungsi Pemilihan Model dengan K-Folds CV

train_control <- trainControl(method = "repeatedcv",
                              number = 10, repeats = 3)

cv_polynom <- function(df,i){
  f <- bquote(mpg ~ poly(weight, degree = .(i)))
  set.seed(581)
  model_poly <-  train(as.formula(f),             
                       data = df,                        
                       trControl = train_control,         
                       method = "lm",                      
                       na.action = na.pass)  
  predict <- predict(model_poly, df)
  metric_model_poly <- model_poly$resample[,-4]
  poly <- apply(metric_model_poly,2,mean)
  return( metric_model_poly)
}

Plot CV Berdasarkan hasil Cross Validation, maka diperoleh derajat yang optimum adalah

mae <- c()
rmse <- c()
 for ( i in 1:4){
 temp_er <- cv_polynom(df,i)
 rmse[i] <- mean(temp_er$RMSE)
 mae [i] <- mean(temp_er$MAE)
 }

#RMSE
data.frame(degree = 1:4, rmse) %>%
  mutate(min_RMSE = as.numeric(min(rmse) == rmse)) %>%
  ggplot(aes(x = degree, y = rmse)) +
  geom_line(col = "grey55") +
  geom_point(size = 2, aes(col = factor(min_RMSE))) +
  scale_x_continuous(breaks = 1:4, minor_breaks = NULL) +
  scale_y_continuous(breaks = seq(0, 5, 0.05)) +
  scale_color_manual(values = c("deepskyblue3", "green")) +
  theme_bw()+
  theme(legend.position = "none") +
  labs(title = "Auto Dataset - Polynomial Regression",
       subtitle = "Memilih 'weight' polynomial degree dengan RMSE",
       x = "Degree",
       y = "CV RMSE")

#MAE
data.frame(degree = 1:4, mae) %>%
  mutate(min_MAE = as.numeric(min(mae) == mae)) %>%
  ggplot(aes(x = degree, y = mae)) +
  geom_line(col = "grey55") +
  geom_point(size = 2, aes(col = factor(min_MAE))) +
  scale_x_continuous(breaks = 1:4, minor_breaks = NULL) +
  scale_y_continuous(breaks = seq(0, 5, 0.05)) +
  scale_color_manual(values = c("deepskyblue3", "green")) +
  theme_bw()+
  theme(legend.position = "none") +
  labs(title = "Auto Dataset - Polynomial Regression",
       subtitle = "Memilih 'weight' polynomial degree dengan MAE",
       x = "Degree",
       y = "CV MAE")

Berdasarkan bukti empiris dengan repeated cross validation, dapat ditunjukkan bahwa model polynomial dengan derajat 2 memiliki nilai RMSE dan MAE terkecil.

Model Terbaik

ggplot(df,aes(x=weight, y=mpg)) + 
  geom_point(alpha=0.55, color="#00CC99") +
  stat_smooth(method = "lm", 
              formula = y~poly(x,2,raw=T), 
              lty = 1, col = "blue",se = F)+
  theme_bw()+
  ggtitle("Regresi Polinomial Derajat 2") +
  xlab("Weight") + 
  ylab("mile per gallon")

Picewise constant regression

Secara Visual

g2 <- ggplot(df, aes(x = weight, y = mpg)) + 
  geom_point(alpha = 0.55, color= "#00CC99") + 
  geom_smooth(method = "lm", formula = "y ~ cut(x, 2, raw = T)") + 
  labs(title = "Breaks= 2")+
   theme(plot.title = element_text(size=9))

g3 <- ggplot(df, aes(x = weight, y = mpg)) + 
  geom_point(alpha = 0.55, color= "#00CC99") + 
  geom_smooth(method = "lm", formula = "y ~ cut(x, 3, raw = T)") + 
  labs(title = "Breaks= 3")+
   theme(plot.title = element_text(size=9))

g4 <- ggplot(df, aes(x = weight, y = mpg)) + 
  geom_point(alpha = 0.55, color= "#00CC99") + 
  geom_smooth(method = "lm", formula = "y ~ cut(x, 4, raw = T)") + 
  labs(title = "Breaks= 4")+
   theme(plot.title = element_text(size=9))

g5 <- ggplot(df, aes(x = weight, y = mpg)) + 
  geom_point(alpha = 0.55, color= "#00CC99") + 
  geom_smooth(method = "lm", formula = "y ~ cut(x, 5, raw = T)") + 
  labs(title = "Breaks= 5")+
   theme(plot.title = element_text(size=9))

g6 <- ggplot(df, aes(x = weight, y = mpg)) + 
  geom_point(alpha = 0.55, color= "#00CC99") + 
  geom_smooth(method = "lm", formula = "y ~ cut(x, 6, raw = T)") + 
  labs(title = "Breaks= 6")+
   theme(plot.title = element_text(size=9))

g7 <- ggplot(df, aes(x = weight, y = mpg)) + 
  geom_point(alpha = 0.55, color= "#00CC99") + 
  geom_smooth(method = "lm", formula = "y ~ cut(x, 7, raw = T)") + 
  labs(title = "Breaks= 7")+
   theme(plot.title = element_text(size=9))

g8 <- ggplot(df, aes(x = weight, y = mpg)) + 
  geom_point(alpha = 0.55, color= "#00CC99") + 
  geom_smooth(method = "lm", formula = "y ~ cut(x, 8, raw = T)") + 
  labs(title = "Breaks= 8")+
   theme(plot.title = element_text(size=9))
   
g9 <- ggplot(df, aes(x = weight, y = mpg)) + 
  geom_point(alpha = 0.55, color= "#00CC99") + 
  geom_smooth(method = "lm", formula = "y ~ cut(x, 9, raw = T)") + 
  labs(title = "Breaks= 9")+
   theme(plot.title = element_text(size=9))

g10 <- ggplot(df, aes(x = weight, y = mpg)) + 
  geom_point(alpha = 0.55, color= "#00CC99") + 
  geom_smooth(method = "lm", formula = "y ~ cut(x, 10, raw = T)") + 
  labs(title = "Breaks= 10")+
   theme(plot.title = element_text(size=9))
  theme_bw()

grid.arrange(g2, g3, g4, g5, g6, g7, g8, g9, g10, ncol = 3)

Visualisasi piecewise contstant tersebut menunjukkan bahwa, model dengan k yang semakin besar memberikan kurva yang semakin mulus, namun diujung kurva tampak kurang beraturan. Dari 9 plot tersebut, terlihat plot 6 dan 7 yang lebih menggambarkan pola hubungan data antara mpg dan weight.

Secara Empiris

set.seed(581)

cross_val <- vfold_cv(df,v=10,repeats = 3, strata = "mpg")

breaks <- 3:10

best_tangga <- map_dfr(breaks, function(i){

metric_tangga <- map_dfr(cross_val$splits,
    function(x){

training <- df[x$in_id,]
training$weight <- cut(training$weight,i)

mod <- lm(mpg ~ weight,
         data=training)

labs_x <- levels(mod$model[,2])

labs_x_breaks <- cbind(lower = as.numeric( sub("\\((.+),.*", "\\1", labs_x) ),
                  upper = as.numeric( sub("[^,]*,([^]]*)\\]", "\\1", labs_x) ))


testing <- df[-x$in_id,]

weight_new <- cut(testing$weight,c(labs_x_breaks[1,1],labs_x_breaks[,2]))

pred <- predict(mod,newdata=list(weight=weight_new))
truth <- testing$mpg

data_eval <- na.omit(data.frame(truth,pred))

rmse <- mlr3measures::rmse(truth = data_eval$truth,
                           response = data_eval$pred
                           )
mae <- mlr3measures::mae(truth = data_eval$truth,
                           response = data_eval$pred
                           )


metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)

metric_tangga

# menghitung rata-rata untuk 10 folds
mean_metric_tangga <- colMeans(metric_tangga)

mean_metric_tangga
}
)
best_tangga <- cbind(breaks=breaks,best_tangga)
# menampilkan hasil all breaks
datatable(best_tangga)

Plot CV Piecewise

#Plot PC_Weight_Berdasarkan kriteria RMSE

pcw <- best_tangga$breaks <- as.factor(best_tangga$breaks)
ggplot(data=best_tangga, aes(x=breaks, y=rmse, group=1)) +
  geom_line(linetype = "dashed")+
  geom_point()+
  geom_point(data = best_tangga[which.min(best_tangga$rmse), ], color="#00CC99", 
               size=3) +
  labs(title = "MPG vs Weight - Piecewise Regression",
       subtitle = "Memilih 'weight' piecewise dengan RMSE",
       x = "Breaks",
       y = "CV RMSE")+
  theme_bw ()

#Plot PC_Weight_Berdasarkan kriteria MAE 

pcw2 <- best_tangga$breaks <- as.factor(best_tangga$breaks)
ggplot(data=best_tangga, aes(x=breaks, y=mae, group=1)) +
  geom_line(linetype = "dashed")+
  geom_point()+
  geom_point(data = best_tangga[which.min(best_tangga$mae), ], color="#00CC99", 
               size=3) +
   labs(title = "MPG vs Weight - Piecewise Regression",
       subtitle = "Memilih 'weight' piecewise dengan MAE",
       x = "Breaks",
       y = "CV MAE")+ 
  theme_bw ()

#ggarrange(pcw, pc2, ncol=2, nrow=1)

Berdasarkan k fold cv, didapatkan Model Piecewise dengan knot = 6 yang memiliki RMSE dan MAE terkecil.

Model Terbaik Piecewise Constant

ggplot(df,aes(x=weight, y=mpg)) + 
  geom_point(alpha=0.55, color="#00CC99") +
  stat_smooth(method = "lm", 
              formula = y~cut(x,6,raw=T), 
              lty = 1, col = "darkblue",se = F)+
  theme_bw()+
 labs(title = "MPG vs Weight - Piecewise Regression K = 6",
       x = "Weight",
       y = "mile per gallon")

Basic Spline

Secara Visual

g1 <- ggplot(df, aes(x = weight, y = mpg)) + 
  geom_point(alpha = 0.55, color= "#00CC99") + 
  geom_smooth(method = "lm", formula = "y ~ bs(x, df = 3)") + 
  labs(title = "df = 4 (1 knots)")+ 
  theme_bw()

g2 <- ggplot(df, aes(x = weight, y = mpg)) + 
  geom_point(alpha = 0.55, color= "#00CC99") + 
  geom_smooth(method = "lm", formula = "y ~ bs(x, df = 4)") + 
  labs(title = "df = 5 (2 knot)")+  theme_bw()

g3 <- ggplot(df, aes(x = weight, y = mpg)) + 
  geom_point(alpha = 0.55, color= "#00CC99") + 
  geom_smooth(method = "lm", formula = "y ~ bs(x, df = 5)") + 
  labs(title = "df = 6 (3 knots)")+  theme_bw()

g4 <- ggplot(df, aes(x = weight, y = mpg)) + 
  geom_point(alpha = 0.55, color= "#00CC99") + 
  geom_smooth(method = "lm", formula = "y ~ bs(x, df = 6)") + 
  labs(title = "df = 7 (4 knots)") +  theme_bw()

g5 <- ggplot(df, aes(x = weight, y = mpg)) + 
  geom_point(alpha = 0.55, color= "#00CC99") + 
  geom_smooth(method = "lm", formula = "y ~ bs(x, df = 7)") + 
  labs(title = "df = 8 (5 knots)") +  theme_bw()

g6 <- ggplot(df, aes(x = weight, y = mpg)) + 
  geom_point(alpha = 0.55, color= "#00CC99") + 
  geom_smooth(method = "lm", formula = "y ~ bs(x, df = 8)") + 
  labs(title = "df = 9 (6 knots)") +  theme_bw()

g7 <- ggplot(df, aes(x = weight, y = mpg)) + 
  geom_point(alpha = 0.55, color= "#00CC99") + 
  geom_smooth(method = "lm", formula = "y ~ bs(x, df = 9)") + 
  labs(title = "df = 10 (7 knots)") +  theme_bw()

g8 <- ggplot(df, aes(x = weight, y = mpg)) + 
  geom_point(alpha = 0.55, color= "#00CC99") + 
  geom_smooth(method = "lm", formula = "y ~ bs(x, df = 10)") + 
  labs(title = "df = 11 (8 knots)")  +  theme_bw()

g9 <- ggplot(df, aes(x = weight, y = mpg)) + 
  geom_point(alpha = 0.55, color= "#00CC99") + 
  geom_smooth(method = "lm", formula = "y ~ bs(x, df = 11)") + 
  labs(title = "df = 12 (9 knots)") +  theme_bw()

g10 <- ggplot(df, aes(x = weight, y = mpg)) + 
  geom_point(alpha = 0.55, color= "#00CC99") + 
  geom_smooth(method = "lm", formula = "y ~ bs(x, df = 12)") + 
  labs(title = "df = 13 (10 knots)") +  theme_bw()

grid.arrange(g1, g2, g3, g4, g5, g6, g7, g8, g9, g10, nrow = 3)

Secara visual, model basic spline dengan df = 5 dan 6 lebih nampak mulus, dan diujung ujung kurva tampak lebih teratur dan tidak liar.

Secara Empiris

set.seed(581)

cross_val <- vfold_cv(df,v=10,  repeats=5, strata = "mpg")

dg <- 2:12

best_bspline <- map_dfr(dg, function(i){
metric_bspline <- map_dfr(cross_val$splits,
    function(x){
                mod <- lm(mpg ~ bs(weight,df=i),
                         data=df[x$in_id,])
                
                pred <- predict(mod,
                               newdata=df[-x$in_id,])
                
                truth <- df[-x$in_id,]$mpg
                
                rmse <- mlr3measures::rmse(truth = truth,
                                           response = pred
                                           )
                mae <- mlr3measures::mae(truth = truth,
                                           response = pred
                                           )
                metric <- c(rmse,mae)
                names(metric) <- c("rmse","mae")
                return(metric)
}
)

metric_bspline

# menghitung rata-rata untuk 10 folds
mean_metric_bspline <- colMeans(metric_bspline)

mean_metric_bspline
}
)

best_bspline <- cbind(df=dg,best_bspline)
# menampilkan hasil all breaks
datatable(best_bspline)

Plot CV BS

Berdasarkan RMSE

best_bspline$df <- as.factor(best_bspline$df)
ggplot(data=best_bspline, aes(x=df, y=rmse, group=1)) +
  geom_line(linetype = "dashed")+
  geom_point()+
  geom_point(data = best_bspline[which.min(best_bspline$rmse), ], color="#00CC99", 
               size=3) +
  theme_bw()+
  ggtitle("K-Fold Cross Validation") +
  xlab("DF") + ylab("Root Mean Square Error") +
  theme(plot.title = element_text(hjust=0.5, size=11))

Berdasarkan MAE

best_bspline$df <- as.factor(best_bspline$df)
ggplot(data=best_bspline, aes(x=df, y=mae, group=1)) +
  geom_line(linetype = "dashed")+
  geom_point()+
  geom_point(data = best_bspline[which.min(best_bspline$mae), ], color="#00CC99", 
               size=3) +
  theme_bw() +
  ggtitle("K-Fold Cross Validation") +
  xlab("DF") + ylab("Mean Absolute Error") +
  theme(plot.title = element_text(hjust=0.5, size=11))

Berdasarkan bukti empiris dengan repeated cross validation, dapat ditunjukkan bahwa model basic spline dengan knot = 9 memiliki nilai RMSE dan MAE terkecil.

Plot Basic Spline*

Menentukan titik knots

df_9<-attr(bs(df$weight, df=9),"knots")
df_9
## 14.28571% 28.57143% 42.85714% 57.14286% 71.42857% 85.71429% 
##  2074.857  2285.429  2635.000  2986.571  3446.143  4096.286

Membuat Plot

ggplot(Auto,aes(x=weight, y=mpg)) +
                 geom_point(alpha=0.55, color="#00CC99") +
  stat_smooth(method = "lm", 
               formula = y~bs(x, df=9), 
               lty = 1, se = F) +
  ggtitle("B-Spline (df=9)") +
  xlab("Weight") + ylab("mile per gallon")+
  theme_bw() +
  theme(panel.background = element_rect(fill = "white", colour = "white"), strip.background = element_blank(), plot.title = element_text(hjust=0.5, size=11)) +
  geom_vline(xintercept = c(2074.9, 2285.5, 2635, 2896.6, 3446.1, 4096.3), col="grey50", lty=2)

Natural Spline

Secara Empiris

set.seed(581)

cross_val <- vfold_cv(df,v=10, repeats=3, strata = "mpg")

dg <- 2:20

best_spline3 <- map_dfr(dg, function(i){
metric_spline3 <- map_dfr(cross_val$splits,
    function(x){
mod <- lm(mpg ~ ns(weight,df=i),
         data=df[x$in_id,])
pred <- predict(mod,
               newdata=df[-x$in_id,])
truth <- df[-x$in_id,]$mpg

rmse <- mlr3measures::rmse(truth = truth,
                           response = pred
                           )
mae <- mlr3measures::mae(truth = truth,
                           response = pred
                           )
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)

metric_spline3

# menghitung rata-rata untuk 10 folds
mean_metric_spline3 <- colMeans(metric_spline3)

mean_metric_spline3
}
)

best_spline3 <- cbind(df=dg,best_spline3)
# menampilkan hasil all breaks
datatable(best_spline3)

Plot CV NS

Berdasarkan RMSE

best_spline3$df <- as.factor(best_spline3$df)
ggplot(data=best_spline3, aes(x=df, y=rmse, group=1)) +
  geom_line(linetype = "dashed")+
  geom_point()+
  geom_point(data = best_spline3[which.min(best_spline3$rmse), ], color="#00CC99", 
               size=3) +
  theme_bw()+
  ggtitle("K-Fold Cross Validation") +
  xlab("DF") + ylab("Root Mean Square Error") +
  theme(plot.title = element_text(hjust=0.5, size=11))

Berdasarkan MAE

best_spline3$df <- as.factor(best_spline3$df)
ggplot(data=best_spline3, aes(x=df, y=rmse, group=1)) +
  geom_line(linetype = "dashed")+
  geom_point()+
  geom_point(data = best_spline3[which.min(best_spline3$mae), ], color="#00CC99", 
               size=3) +
  theme_bw() +
  ggtitle("K-Fold Cross Validation") +
  xlab("DF") + ylab("Mean Absolute Error") +
  theme(plot.title = element_text(hjust=0.5, size=11))

Plot Natural Spline*

Menentukan titik knots

df_7<-attr(ns(df$weight, df=7),"knots")
df_7
## 14.28571% 28.57143% 42.85714% 57.14286% 71.42857% 85.71429% 
##  2074.857  2285.429  2635.000  2986.571  3446.143  4096.286

Membuat Plot

ggplot(Auto,aes(x=weight, y=mpg)) +
                 geom_point(alpha=0.55, color="#00CC99") +
  stat_smooth(method = "lm", 
               formula = y~ns(x, df=7), 
               lty = 1, se = F) +
  ggtitle("Natural Splines (df=7)") +
  xlab("Weight") + ylab("mile per gallon")+
  theme_bw() +
  theme(panel.background = element_rect(fill = "white", colour = "white"), strip.background = element_blank(), plot.title = element_text(hjust=0.5, size=11)) +
  geom_vline(xintercept = c(2074.9, 2285.5, 2635, 2896.6, 3446.1, 4096.3), col="grey50", lty=2)

Berdasarkan bukti empiris dengan repeated cross validation, dapat ditunjukkan bahwa model natural spline dengan knot = 7 memiliki nilai RMSE dan MAE terkecil.

Model Terbaik

Komparasi model terbaik dari pola hubungan antara mpg dan weight dengan nilai RMSE dan MAE terkecil adalah sebagai berikut :

Model Basic Spline memiliki nilai RMSE dan MAE terkecil dibandingkan model tak linear lainya, sehingga dalam kasus ini model Basic Spline dengan df 9 merupakan model yang lebih menggambarkan pola hubungan data antara mpg dan weight dibandingkan dengan model lainnya.

B. MPG Vs Displacement

Secara Visual

g1 <- ggplot(df, aes(x = displacement, y = mpg)) + 
  geom_point(alpha = 0.55, color= "#00CC99") + 
  geom_smooth(method = "lm", formula = "y ~ poly(x, 1, raw = T)") + 
  labs(title = "Polynomial Derajat = 1")+
  theme_bw() +
  theme(plot.title = element_text(size=9))

g2 <- ggplot(df, aes(x = displacement, y = mpg)) + 
  geom_point(alpha = 0.55, color= "#00CC99") + 
  geom_smooth(method = "lm", formula = "y ~ poly(x, 2, raw = T)") + 
  labs(title = "Polynomial Derajat = 2")+
  theme_bw() +
  theme(plot.title = element_text(size=9))

g3 <- ggplot(df, aes(x = displacement, y = mpg)) + 
  geom_point(alpha = 0.55, color= "#00CC99") + 
  geom_smooth(method = "lm", formula = "y ~ poly(x, 3, raw = T)") + 
  labs(title = "Polynomial Derajat = 3")+
  theme_bw() +
  theme(plot.title = element_text(size=9))

g4 <- ggplot(df, aes(x = displacement, y = mpg)) + 
  geom_point(alpha = 0.55, color= "#00CC99") + 
  geom_smooth(method = "lm", formula = "y ~ poly(x, 4, raw = T)") + 
  labs(title = "Polynomial Derajat = 4")+
  theme_bw() +
  theme(plot.title = element_text(size=9))


ggarrange(g1, g2, g3, g4, ncol = 2, nrow = 2, widths = c(10,10) ,heights = c(2,2))

Secara visual, model polynomial berderajat 2 dan 3 terlihat lebih dapat menggambarkan pola hubungan data antara mpg dan displacement dibandingkan model polynomial lainnya.

Secara Empiris

Polynomial Regression

Pemilihan Model dengan K-Folds CV

train_control <- trainControl(method = "repeatedcv",
                              number = 10, repeats = 3)

cv_polynomd <- function(df,i){
  f <- bquote(mpg ~ poly(displacement, degree = .(i)))
  set.seed(581)
  model_poly <-  train(as.formula(f),             
                       data = df,                        
                       trControl = train_control,         
                       method = "lm",                      
                       na.action = na.pass)  
  predict <- predict(model_poly, df)
  metric_model_poly <- model_poly$resample[,-4]
  #poly <- apply(metric_model_poly,2,mean)
  return( metric_model_poly)
}

Plot CV

mae <- c()
rmse <- c()
 for ( i in 1:4){
 temp_er <- cv_polynomd(df,i)
 rmse[i] <- mean(temp_er$RMSE)
 mae [i] <- mean(temp_er$MAE)
 }

#RMSE
data.frame(degree = 1:4, rmse) %>%
  mutate(min_RMSE = as.numeric(min(rmse) == rmse)) %>%
  ggplot(aes(x = degree, y = rmse)) +
  geom_line(col = "grey55") +
  geom_point(size = 2, aes(col = factor(min_RMSE))) +
  scale_x_continuous(breaks = 1:4, minor_breaks = NULL) +
  scale_y_continuous(breaks = seq(0, 5, 0.05)) +
  scale_color_manual(values = c("deepskyblue3", "green")) +
  theme_bw()+
  theme(legend.position = "none") +
  labs(title = "Auto Dataset - Polynomial Regression",
       subtitle = "Memilih 'displacement' polynomial degree dengan RMSE",
       x = "Degree",
       y = "CV RMSE")

#MAE
data.frame(degree = 1:4, mae) %>%
  mutate(min_MAE = as.numeric(min(mae) == mae)) %>%
  ggplot(aes(x = degree, y = mae)) +
  geom_line(col = "grey55") +
  geom_point(size = 2, aes(col = factor(min_MAE))) +
  scale_x_continuous(breaks = 1:4, minor_breaks = NULL) +
  scale_y_continuous(breaks = seq(0, 5, 0.05)) +
  scale_color_manual(values = c("deepskyblue3", "green")) +
  theme_bw()+
  theme(legend.position = "none") +
  labs(title = "Auto Dataset - Polynomial Regression",
       subtitle = "Memilih 'displacement' polynomial degree dengan MAE",
       x = "Degree",
       y = "CV MAE")

Model Terbaik Polynomial

Secara empiris, model polynomial 2 memiliki nilai RMSE dan MAE terkecil

ggplot(df,aes(x=displacement, y=mpg)) + 
  geom_point(alpha=0.55, color="#00CC99") +
  stat_smooth(method = "lm", 
              formula = y~poly(x,2,raw=T), 
              lty = 1, col = "blue",se = F)+
  theme_bw()+
  ggtitle("Regresi Polinomial Derajat 2") +
  xlab("displacement") + 
  ylab("mile per gallon")

Secara empiris, berdasarkan k fold cv, didapatkan Model Polynomial derajat 2 yang memiliki RMSE dan MAE terkecil.

Picewise constant

Secara Visual

g2 <- ggplot(df, aes(x = displacement, y = mpg)) + 
  geom_point(alpha = 0.55, color= "#00CC99") + 
  geom_smooth(method = "lm", formula = "y ~ cut(x, 2, raw = T)") + 
  labs(title = "Polynomial Degree = 2")+
   theme(plot.title = element_text(size=9))

g3 <- ggplot(df, aes(x = displacement, y = mpg)) + 
  geom_point(alpha = 0.55, color= "#00CC99") + 
  geom_smooth(method = "lm", formula = "y ~ cut(x, 3, raw = T)") + 
  labs(title = "Polynomial Degree = 3")+
   theme(plot.title = element_text(size=9))

g4 <- ggplot(df, aes(x = displacement, y = mpg)) + 
  geom_point(alpha = 0.55, color= "#00CC99") + 
  geom_smooth(method = "lm", formula = "y ~ cut(x, 4, raw = T)") + 
  labs(title = "Polynomial Degree = 4")+
   theme(plot.title = element_text(size=9))

g5 <- ggplot(df, aes(x = displacement, y = mpg)) + 
  geom_point(alpha = 0.55, color= "#00CC99") + 
  geom_smooth(method = "lm", formula = "y ~ cut(x, 5, raw = T)") + 
  labs(title = "Polynomial Degree = 5")+
   theme(plot.title = element_text(size=9))

g6 <- ggplot(df, aes(x = displacement, y = mpg)) + 
  geom_point(alpha = 0.55, color= "#00CC99") + 
  geom_smooth(method = "lm", formula = "y ~ cut(x, 6, raw = T)") + 
  labs(title = "Polynomial Degree = 6")+
   theme(plot.title = element_text(size=9))

g7 <- ggplot(df, aes(x = displacement, y = mpg)) + 
  geom_point(alpha = 0.55, color= "#00CC99") + 
  geom_smooth(method = "lm", formula = "y ~ cut(x, 7, raw = T)") + 
  labs(title = "Polynomial Degree = 7")+
   theme(plot.title = element_text(size=9))

g8 <- ggplot(df, aes(x = displacement, y = mpg)) + 
  geom_point(alpha = 0.55, color= "#00CC99") + 
  geom_smooth(method = "lm", formula = "y ~ cut(x, 8, raw = T)") + 
  labs(title = "Polynomial Degree = 8")+
   theme(plot.title = element_text(size=9))
   
g9 <- ggplot(df, aes(x = displacement, y = mpg)) + 
  geom_point(alpha = 0.55, color= "#00CC99") + 
  geom_smooth(method = "lm", formula = "y ~ cut(x, 9, raw = T)") + 
  labs(title = "Polynomial Degree = 9")+
   theme(plot.title = element_text(size=9))

g10 <- ggplot(df, aes(x = displacement, y = mpg)) + 
  geom_point(alpha = 0.55, color= "#00CC99") + 
  geom_smooth(method = "lm", formula = "y ~ cut(x, 10, raw = T)") + 
  labs(title = "Polynomial Degree = 10")+
   theme(plot.title = element_text(size=9))

grid.arrange(g2, g3, g4, g5, g6, g7, g8, g9, g10, ncol = 3)

Visualisasi piecewise contstant tersebut menunjukkan bahwa, model dengan k yang semakin besar memberikan kurva yang semakin mulus, namun di tengah dan diujung kurva tampak kurang beraturan. Dari 9 plot tersebut, terlihat plot 6 dan 7 yang lebih menggambarkan pola hubungan data antara mpg dan displacement.

Secara Empiris

set.seed(581)

cross_val <- vfold_cv(df,v=10,strata = "mpg")

breaks <- 2:9

best_tangga_dis <- map_dfr(breaks, function(i){

metric_tangga_dis <- map_dfr(cross_val$splits,
    function(x){

training <- df[x$in_id,]
training$displacement <- cut(training$displacement,i)

mod <- lm(mpg ~ displacement,
         data=training)

labs_x <- levels(mod$model[,2])

labs_x_breaks <- cbind(lower = as.numeric( sub("\\((.+),.*", "\\1", labs_x) ),
                  upper = as.numeric( sub("[^,]*,([^]]*)\\]", "\\1", labs_x) ))


testing <- df[-x$in_id,]

displacement_new <- cut(testing$displacement,c(labs_x_breaks[1,1],labs_x_breaks[,2]))

pred <- predict(mod,newdata=list(displacement=displacement_new))
truth <- testing$mpg

data_eval <- na.omit(data.frame(truth,pred))

rmse <- mlr3measures::rmse(truth = data_eval$truth,
                           response = data_eval$pred
                           )
mae <- mlr3measures::mae(truth = data_eval$truth,
                           response = data_eval$pred
                           )


metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)

metric_tangga_dis



# menghitung rata-rata untuk 10 folds
mean_metric_tangga_dis <- colMeans(metric_tangga_dis)

mean_metric_tangga_dis
}
)
best_tangga_dis <- cbind(breaks=breaks,best_tangga_dis)
# menampilkan hasil all breaks
datatable(best_tangga_dis)

Plot CV

#Plot PC_dis_Berdasarkan kriteria RMSE 
best_tangga_dis
##   breaks     rmse      mae
## 1      2 5.927553 4.645127
## 2      3 4.912809 3.680623
## 3      4 4.825994 3.635968
## 4      5 4.681469 3.500329
## 5      6 4.603800 3.415141
## 6      7 4.571606 3.376004
## 7      8 4.402890 3.251622
## 8      9 4.233686 3.112410
best_tangga_dis$breaks <- as.factor(best_tangga_dis$breaks)
ggplot(data=best_tangga_dis, aes(x=breaks, y=rmse, group=1)) +
  geom_line(linetype = "dashed")+
  geom_point()+
  geom_point(data = best_tangga_dis[which.min(best_tangga_dis$rmse), ], color="#00CC99", 
               size=3) +
  ggtitle("K-Fold Cross Validation- RMSE Criteria") +
  labs(caption= "Data dis") +
  xlab("Breaks") + 
  ylab("Root Mean Square Error") +
  theme_bw ()

#Plot PC_dis_Berdasarkan kriteria MAE 

best_tangga_dis$breaks <- as.factor(best_tangga_dis$breaks)
ggplot(data=best_tangga_dis, aes(x=breaks, y=rmse, group=1)) +
  geom_line(linetype = "dashed")+
  geom_point()+
  geom_point(data = best_tangga_dis[which.min(best_tangga_dis$mae), ], color="#00CC99", 
               size=3) +
  ggtitle("K-Fold Cross Validation- MAE Criteria") +
    labs(caption= "Data dis")+
  xlab("Breaks") + 
  ylab(" Mean Absolute Error") +
  theme_bw ()

Model Terbaik Piecewise

 ggplot(df,aes(x=displacement, y=mpg)) + 
  geom_point(alpha=0.55, color="#00CC99") +
  stat_smooth(method = "lm", 
              formula = y~cut(x,9,raw=T), 
              lty = 1, col = "darkblue",se = F)+
  theme_bw()+
  ggtitle("Regresi Piecewise Constant K= 9") +
  xlab("Displacement") + 
  ylab("mile per gallon")

Berdasarkan k fold cv, didapatkan Model Piecewise dengan knot = 9 yang memiliki RMSE dan MAE terkecil.

Basic Spline

Secara Visual

g1 <- ggplot(df, aes(x = displacement, y = mpg)) + 
  geom_point(alpha = 0.55, color= "#00CC99") + 
  geom_smooth(method = "lm", formula = "y ~ bs(x, df = 3)") + 
  labs(title = "df = 4 (0 knots)")+ 
  theme_bw()

g2 <- ggplot(df, aes(x = displacement, y = mpg)) + 
  geom_point(alpha = 0.55, color= "#00CC99") + 
  geom_smooth(method = "lm", formula = "y ~ bs(x, df = 4)") + 
  labs(title = "df = 5 (1 knot)")+  theme_bw()

g3 <- ggplot(df, aes(x = displacement, y = mpg)) + 
  geom_point(alpha = 0.55, color= "#00CC99") + 
  geom_smooth(method = "lm", formula = "y ~ bs(x, df = 5)") + 
  labs(title = "df = 6 (2 knots)")+  theme_bw()

g4 <- ggplot(df, aes(x = displacement, y = mpg)) + 
  geom_point(alpha = 0.55, color= "#00CC99") + 
  geom_smooth(method = "lm", formula = "y ~ bs(x, df = 6)") + 
  labs(title = "df = 7 (3 knots)") +  theme_bw()

g5 <- ggplot(df, aes(x = displacement, y = mpg)) + 
  geom_point(alpha = 0.55, color= "#00CC99") + 
  geom_smooth(method = "lm", formula = "y ~ bs(x, df = 7)") + 
  labs(title = "df = 8 (4 knots)") +  theme_bw()

g6 <- ggplot(df, aes(x = displacement, y = mpg)) + 
  geom_point(alpha = 0.55, color= "#00CC99") + 
  geom_smooth(method = "lm", formula = "y ~ bs(x, df = 8)") + 
  labs(title = "df = 9 (5 knots)") +  theme_bw()

g7 <- ggplot(df, aes(x = displacement, y = mpg)) + 
  geom_point(alpha = 0.55, color= "#00CC99") + 
  geom_smooth(method = "lm", formula = "y ~ bs(x, df = 9)") + 
  labs(title = "df = 10 (6 knots)") +  theme_bw()

g8 <- ggplot(df, aes(x = displacement, y = mpg)) + 
  geom_point(alpha = 0.55, color= "#00CC99") + 
  geom_smooth(method = "lm", formula = "y ~ bs(x, df = 10)") + 
  labs(title = "df = 11 (7 knots)")  +  theme_bw()

g9 <- ggplot(df, aes(x = displacement, y = mpg)) + 
  geom_point(alpha = 0.55, color= "#00CC99") + 
  geom_smooth(method = "lm", formula = "y ~ bs(x, df = 11)") + 
  labs(title = "df = 12 (8 knots)") +  theme_bw()

g10 <- ggplot(df, aes(x = displacement, y = mpg)) + 
  geom_point(alpha = 0.55, color= "#00CC99") + 
  geom_smooth(method = "lm", formula = "y ~ bs(x, df = 12)") + 
  labs(title = "df = 13 (9 knots)") +  theme_bw()

grid.arrange(g1, g2, g3, g4, g5, g6, g7, g8, g9, g10, ncol = 4)

Secara visual, model basic spline dengan df = 5 lebih nampak mulus, dan diujung ujung kurva tampak lebih teratur dan tidak liar. Pada model dengan df > 6, tampak kurva pada titik boundary knots sebelah kiri yang cukup liar.

Secara Empiris

set.seed(581)

cross_val <- vfold_cv(df,v=10,  repeats=5, strata = "mpg")

dg <- 2:10

best_bspline <- map_dfr(dg, function(i){
metric_bspline <- map_dfr(cross_val$splits,
    function(x){
                mod <- lm(mpg ~ bs(displacement,df=i),
                         data=df[x$in_id,])
                
                pred <- predict(mod,
                               newdata=df[-x$in_id,])
                
                truth <- df[-x$in_id,]$mpg
                
                rmse <- mlr3measures::rmse(truth = truth,
                                           response = pred
                                           )
                mae <- mlr3measures::mae(truth = truth,
                                           response = pred
                                           )
                metric <- c(rmse,mae)
                names(metric) <- c("rmse","mae")
                return(metric)
}
)

metric_bspline

# menghitung rata-rata untuk 10 folds
mean_metric_bspline <- colMeans(metric_bspline)

mean_metric_bspline
}
)

best_bspline <- cbind(df=dg,best_bspline)
# menampilkan hasil all breaks
datatable(best_bspline)

Plot CV BS

Berdasarkan RMSE

best_bspline$df <- as.factor(best_bspline$df)
ggplot(data=best_bspline, aes(x=df, y=rmse, group=1)) +
  geom_line(linetype = "dashed")+
  geom_point()+
  geom_point(data = best_bspline[which.min(best_bspline$rmse), ], color="#00CC99", 
               size=3) +
  theme_bw()+
  ggtitle("K-Fold Cross Validation") +
  xlab("DF") + ylab("Root Mean Square Error") +
  theme(plot.title = element_text(hjust=0.5, size=11))

Berdasarkan MAE

best_bspline$df <- as.factor(best_bspline$df)
ggplot(data=best_bspline, aes(x=df, y=mae, group=1)) +
  geom_line(linetype = "dashed")+
  geom_point()+
  geom_point(data = best_bspline[which.min(best_bspline$mae), ], color="#00CC99", 
               size=3) +
  theme_bw() +
  ggtitle("K-Fold Cross Validation") +
  xlab("DF") + ylab("Mean Absolute Error") +
  theme(plot.title = element_text(hjust=0.5, size=11))

Plot Basic Spline*

Menentukan titik knots

df_6<-attr(bs(df$displacement, df=6),"knots")
df_6
##    25%    50%    75% 
## 105.00 151.00 275.75

Membuat Plot

ggplot(Auto,aes(x=displacement, y=mpg)) +
                 geom_point(alpha=0.55, color="#00CC99") +
  stat_smooth(method = "lm", 
               formula = y~bs(x, df=6), 
               lty = 1, se = F) +
  ggtitle("B-Spline (df=6)") +
  xlab("Displacement") + ylab("mile per gallon")+
  theme_bw() +
  theme(panel.background = element_rect(fill = "white", colour = "white"), strip.background = element_blank(), plot.title = element_text(hjust=0.5, size=11)) +
  geom_vline(xintercept = c(105, 151, 275.75), col="grey50", lty=2)

Berdasarkan k fold cv, model bspline yang memiliki RMSE dan MAE terkecil terdapat pada knot = 6

Natural Spline

Secara Empiris

set.seed(581)

cross_val <- vfold_cv(df,v=10, repeats=3, strata = "mpg")

dg <- 2:20

best_spline3 <- map_dfr(dg, function(i){
metric_spline3 <- map_dfr(cross_val$splits,
    function(x){
mod <- lm(mpg ~ ns(displacement,df=i),
         data=df[x$in_id,])
pred <- predict(mod,
               newdata=df[-x$in_id,])
truth <- df[-x$in_id,]$mpg

rmse <- mlr3measures::rmse(truth = truth,
                           response = pred
                           )
mae <- mlr3measures::mae(truth = truth,
                           response = pred
                           )
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)

metric_spline3

# menghitung rata-rata untuk 10 folds
mean_metric_spline3 <- colMeans(metric_spline3)

mean_metric_spline3
}
)

best_spline3 <- cbind(df=dg,best_spline3)
# menampilkan hasil all breaks
datatable(best_spline3)

Plot CV NS

Berdasarkan RMSE

best_spline3$df <- as.factor(best_spline3$df)
ggplot(data=best_spline3, aes(x=df, y=rmse, group=1)) +
  geom_line(linetype = "dashed")+
  geom_point()+
  geom_point(data = best_spline3[which.min(best_spline3$rmse), ], color="#00CC99", 
               size=3) +
  theme_bw()+
  ggtitle("K-Fold Cross Validation") +
  xlab("DF") + ylab("Root Mean Square Error") +
  theme(plot.title = element_text(hjust=0.5, size=11))

Berdasarkan MAE

best_spline3$df <- as.factor(best_spline3$df)
ggplot(data=best_spline3, aes(x=df, y=rmse, group=1)) +
  geom_line(linetype = "dashed")+
  geom_point()+
  geom_point(data = best_spline3[which.min(best_spline3$mae), ], color="#00CC99", 
               size=3) +
  theme_bw() +
  ggtitle("K-Fold Cross Validation") +
  xlab("DF") + ylab("Mean Absolute Error") +
  theme(plot.title = element_text(hjust=0.5, size=11))

Plot Natural Spline

Menentukan titik knots

df_12d <-attr(ns(df$displacement, df=12),"knots")
df_12d
## 8.333333% 16.66667%       25% 33.33333% 41.66667%       50% 58.33333% 66.66667% 
##   89.0000   97.0000  105.0000  119.0000  130.9167  151.0000  200.0000  232.0000 
##       75% 83.33333% 91.66667% 
##  275.7500  318.0000  351.0000

Membuat Plot

ggplot(Auto,aes(x=displacement, y=mpg)) +
                 geom_point(alpha=0.55, color="#00CC99") +
  stat_smooth(method = "lm", 
               formula = y~ns(x, df=12), 
               lty = 1, se = F) +
  ggtitle("Natural Splines (df=12)") +
  xlab("Displacement") + ylab("mile per gallon")+
  theme_bw() +
  theme(panel.background = element_rect(fill = "white", colour = "white"), strip.background = element_blank(), plot.title = element_text(hjust=0.5, size=11)) +
  geom_vline(xintercept = df_12d, col="grey50", lty=2)

Model Terbaik

Komparasi model terbaik dari pola hubungan antara mpg dan displalacement dengan nilai RMSE dan MAE terkecil adalah sebagai berikut :

Model Natural Spline memiliki nilai RMSE dan MAE terkecil dibandingkan model tak linear lainya, sehingga dalam kasus ini model natural spline dengan df 12 merupakan model yang lebih menggambarkan pola hubungan data antara mpg dan displacement dibandingkan dengan model lainnya.

C. MPG Vs Acceleration

Polynomial Regression

Secara Visual

g1 <- ggplot(df, aes(x = acceleration, y = mpg)) + 
  geom_point(alpha = 0.55, color= "#00CC99") + 
  geom_smooth(method = "lm", formula = "y ~ poly(x, 1, raw = T)") + 
  labs(title = "Polynomial Derajat = 1")+
  theme_bw() +
  theme(plot.title = element_text(size=9))

g2 <- ggplot(df, aes(x = acceleration, y = mpg)) + 
  geom_point(alpha = 0.55, color= "#00CC99") + 
  geom_smooth(method = "lm", formula = "y ~ poly(x, 2, raw = T)") + 
  labs(title = "Polynomial Derajat = 2")+
  theme_bw() +
  theme(plot.title = element_text(size=9))

g3 <- ggplot(df, aes(x = acceleration, y = mpg)) + 
  geom_point(alpha = 0.55, color= "#00CC99") + 
  geom_smooth(method = "lm", formula = "y ~ poly(x, 3, raw = T)") + 
  labs(title = "Polynomial Derajat = 3")+
  theme_bw() +
  theme(plot.title = element_text(size=9))

g4 <- ggplot(df, aes(x = acceleration, y = mpg)) + 
  geom_point(alpha = 0.55, color= "#00CC99") + 
  geom_smooth(method = "lm", formula = "y ~ poly(x, 4, raw = T)") + 
  labs(title = "Polynomial Derajat = 4")+
  theme_bw() +
  theme(plot.title = element_text(size=9))


ggarrange(g1, g2, g3, g4, ncol = 2, nrow = 2, widths = c(10,10) ,heights = c(2,2))

Secara visual, model polinomial derajat 3 dan 4 tampak dapat menggambarkan pola hubungan data mpg dan acceleration lebih baik dibandingkan dengan polinomial derajat lainnya.

Secara Empiris

train_control <- trainControl(method = "repeatedcv",
                              number = 10, repeats = 3)

cv_polynomda <- function(df,i){
  f <- bquote(mpg ~ poly(acceleration, degree = .(i)))
  set.seed(581)
  model_poly <-  train(as.formula(f),             
                       data = df,                        
                       trControl = train_control,         
                       method = "lm",                      
                       na.action = na.pass)  
  predict <- predict(model_poly, df)
  metric_model_poly <- model_poly$resample[,-4]
  #poly <- apply(metric_model_poly,2,mean)
  return( metric_model_poly)
}

Plot CV

mae <- c()
rmse <- c()
 for ( i in 1:4){
 temp_er <- cv_polynomda(df,i)
 rmse[i] <- mean(temp_er$RMSE)
 mae [i] <- mean(temp_er$MAE)
 }

#RMSE
pa <- data.frame(degree = 1:4, rmse) %>%
  mutate(min_RMSE = as.numeric(min(rmse) == rmse)) %>%
  ggplot(aes(x = degree, y = rmse)) +
  geom_line(col = "grey55") +
  geom_point(size = 2, aes(col = factor(min_RMSE))) +
  scale_x_continuous(breaks = 1:4, minor_breaks = NULL) +
  scale_y_continuous(breaks = seq(0, 12, 0.02)) +
  scale_color_manual(values = c("deepskyblue3", "green")) +
  theme_bw()+
  theme(legend.position = "none",plot.title = element_text(size=10), plot.subtitle = element_text(size=8)) +
  labs(title = "Auto Dataset - Polynomial Regression",
       subtitle = "Memilih 'acceleration' polynomial degree dengan RMSE",
       x = "Degree",
       y = "CV RMSE")

#MAE
pb <- data.frame(degree = 1:4, mae) %>%
  mutate(min_MAE = as.numeric(min(mae) == mae)) %>%
  ggplot(aes(x = degree, y = mae)) +
  geom_line(col = "grey55") +
  geom_point(size = 2, aes(col = factor(min_MAE))) +
  scale_x_continuous(breaks = 1:4, minor_breaks = NULL) +
  scale_y_continuous(breaks = seq(0, 10, 0.02)) +
  scale_color_manual(values = c("deepskyblue3", "green")) +
  theme_bw()+
  theme(legend.position = "none",plot.title = element_text(size=9),plot.subtitle = element_text(size=8) ) +
  labs(title = "Auto Dataset - Polynomial Regression",
       subtitle = "Memilih 'acceleration' polynomial degree dengan MAE",
       x = "Degree",
       y = "CV MAE")

ggarrange(pa, pb, ncol = 2, nrow =1)

Berdasarkan k fold cv, didapatkan Model Polynomial dengan derajat = 4 yang memiliki RMSE dan MAE terkecil.

Model Terbaik Polynomal

ggplot(df,aes(x=acceleration, y=mpg)) + 
  geom_point(alpha=0.55, color="#00CC99") +
  stat_smooth(method = "lm", 
              formula = y~poly(x,2,raw=T), 
              lty = 1, col = "blue",se = F)+
  theme_bw()+
  ggtitle("Regresi Polinomial Derajat 2") +
  xlab("displacement") + 
  ylab("mile per gallon")

Piecewise constant regression

Secara Visual

g2 <- ggplot(df, aes(x = acceleration, y = mpg)) + 
  geom_point(alpha = 0.55, color= "#00CC99") + 
  geom_smooth(method = "lm", formula = "y ~ cut(x, 2, raw = T)") + 
  labs(title = "Breaks= 2")+
   theme(plot.title = element_text(size=9))

g3 <- ggplot(df, aes(x = acceleration, y = mpg)) + 
  geom_point(alpha = 0.55, color= "#00CC99") + 
  geom_smooth(method = "lm", formula = "y ~ cut(x, 3, raw = T)") + 
  labs(title = "Breaks= 3")+
   theme(plot.title = element_text(size=9))

g4 <- ggplot(df, aes(x = acceleration, y = mpg)) + 
  geom_point(alpha = 0.55, color= "#00CC99") + 
  geom_smooth(method = "lm", formula = "y ~ cut(x, 4, raw = T)") + 
  labs(title = "Breaks= 4")+
   theme(plot.title = element_text(size=9))

g5 <- ggplot(df, aes(x = acceleration, y = mpg)) + 
  geom_point(alpha = 0.55, color= "#00CC99") + 
  geom_smooth(method = "lm", formula = "y ~ cut(x, 5, raw = T)") + 
  labs(title = "Breaks= 5")+
   theme(plot.title = element_text(size=9))

g6 <- ggplot(df, aes(x = acceleration, y = mpg)) + 
  geom_point(alpha = 0.55, color= "#00CC99") + 
  geom_smooth(method = "lm", formula = "y ~ cut(x, 6, raw = T)") + 
  labs(title = "Breaks= 6")+
   theme(plot.title = element_text(size=9))

g7 <- ggplot(df, aes(x = acceleration, y = mpg)) + 
  geom_point(alpha = 0.55, color= "#00CC99") + 
  geom_smooth(method = "lm", formula = "y ~ cut(x, 7, raw = T)") + 
  labs(title = "Breaks= 7")+
   theme(plot.title = element_text(size=9))

g8 <- ggplot(df, aes(x = acceleration, y = mpg)) + 
  geom_point(alpha = 0.55, color= "#00CC99") + 
  geom_smooth(method = "lm", formula = "y ~ cut(x, 8, raw = T)") + 
  labs(title = "Breaks= 8")+
   theme(plot.title = element_text(size=9))
   
g9 <- ggplot(df, aes(x = acceleration, y = mpg)) + 
  geom_point(alpha = 0.55, color= "#00CC99") + 
  geom_smooth(method = "lm", formula = "y ~ cut(x, 9, raw = T)") + 
  labs(title = "Breaks= 9")+
   theme(plot.title = element_text(size=9))

g10 <- ggplot(df, aes(x = acceleration, y = mpg)) + 
  geom_point(alpha = 0.55, color= "#00CC99") + 
  geom_smooth(method = "lm", formula = "y ~ cut(x, 10, raw = T)") + 
  labs(title = "Breaks= 10")+
   theme(plot.title = element_text(size=9))

grid.arrange(g2, g3, g4, g5, g6, g7, g8, g9, g10, ncol = 3)

Visualisasi piecewise contstant tersebut menunjukkan bahwa, model dengan k yang semakin besar memberikan kurva yang semakin mulus, namun diujung kurva tampak kurang beraturan. Dari 9 plot tersebut, terlihat plot 4 dan 5 yang lebih menggambarkan pola hubungan data antara mpg dan acceleration.

Secara Empiris

set.seed(581)

cross_val <- vfold_cv(df,v=10,strata = "mpg")

breaks <- 2:10

best_tangga_ac <- map_dfr(breaks, function(i){

metric_tangga_ac <- map_dfr(cross_val$splits,
    function(x){

training <- df[x$in_id,]
training$acceleration <- cut(training$acceleration,i)

mod <- lm(mpg ~ acceleration,
         data=training)

labs_x <- levels(mod$model[,2])

labs_x_breaks <- cbind(lower = as.numeric( sub("\\((.+),.*", "\\1", labs_x) ),
                  upper = as.numeric( sub("[^,]*,([^]]*)\\]", "\\1", labs_x) ))


testing <- df[-x$in_id,]

acceleration_new <- cut(testing$acceleration,c(labs_x_breaks[1,1],labs_x_breaks[,2]))

pred <- predict(mod,newdata=list(acceleration=acceleration_new))
truth <- testing$mpg

data_eval <- na.omit(data.frame(truth,pred))

rmse <- mlr3measures::rmse(truth = data_eval$truth,
                           response = data_eval$pred
                           )
mae <- mlr3measures::mae(truth = data_eval$truth,
                           response = data_eval$pred
                           )


metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)

metric_tangga_ac



# menghitung rata-rata untuk 10 folds
mean_metric_tangga_ac <- colMeans(metric_tangga_ac)

mean_metric_tangga_ac
}
)
best_tangga_ac <- cbind(breaks=breaks,best_tangga_ac)
# menampilkan hasil all breaks
datatable(best_tangga_ac)

Plot CV_Acceleration

#Plot PC_acceleration_Berdasarkan kriteria RMSE

 best_tangga_ac$breaks <- as.factor(best_tangga_ac$breaks)
ggplot(data=best_tangga_ac, aes(x=breaks, y=rmse, group=1)) +
  geom_line(linetype = "dashed")+
  geom_point()+
  geom_point(data = best_tangga_ac[which.min(best_tangga$rmse), ], color="#00CC99", 
               size=3) +
  ggtitle("K-Fold Cross Validation- RMSE Criteria") +
  xlab("Breaks") + 
  ylab("Root Mean Square Error") +
  theme_bw ()

#Plot PC_Weight_Berdasarkan kriteria MAE 

 best_tangga_ac$breaks <- as.factor(best_tangga_ac$breaks)
ggplot(data=best_tangga_ac, aes(x=breaks, y=mae, group=1)) +
  geom_line(linetype = "dashed")+
  geom_point()+
  geom_point(data = best_tangga_ac[which.min(best_tangga$mae), ], color="#00CC99", 
               size=3) +
  ggtitle("K-Fold Cross Validation- MAE Criteria") +
  xlab("Breaks") + 
  ylab(" Mean Absolute Error") +
  theme_bw ()

Berdasarkan k fold cv, didapatkan Model Piecewise Constant dengan knot = 5 yang memiliki RMSE dan MAE terkecil.

Model Terbaik Piecewise Constant

ggplot(df,aes(x=acceleration, y=mpg)) + 
  geom_point(alpha=0.55, color="#00CC99") +
  stat_smooth(method = "lm", 
              formula = y~cut(x,5,raw=T), 
              lty = 1, col = "darkblue",se = F)+
  theme_bw()+
  ggtitle("Regresi Piecewise Constant K= 5") +
  xlab("Acceleration") + 
  ylab("mile per gallon")

Basic Spline

Secar Visual

g1 <- ggplot(df, aes(x = acceleration, y = mpg)) + 
  geom_point(alpha = 0.55, color= "#00CC99") + 
  geom_smooth(method = "lm", formula = "y ~ bs(x, df = 3)") + 
  labs(title = "df = 4 (1 knots)")+ 
  theme_bw()

g2 <- ggplot(df, aes(x = acceleration, y = mpg)) + 
  geom_point(alpha = 0.55, color= "#00CC99") + 
  geom_smooth(method = "lm", formula = "y ~ bs(x, df = 4)") + 
  labs(title = "df = 5 (2 knot)")+  theme_bw()

g3 <- ggplot(df, aes(x = acceleration, y = mpg)) + 
  geom_point(alpha = 0.55, color= "#00CC99") + 
  geom_smooth(method = "lm", formula = "y ~ bs(x, df = 5)") + 
  labs(title = "df = 6 (3 knots)")+  theme_bw()

g4 <- ggplot(df, aes(x = acceleration, y = mpg)) + 
  geom_point(alpha = 0.55, color= "#00CC99") + 
  geom_smooth(method = "lm", formula = "y ~ bs(x, df = 6)") + 
  labs(title = "df = 7 (4 knots)") +  theme_bw()

g5 <- ggplot(df, aes(x = acceleration, y = mpg)) + 
  geom_point(alpha = 0.55, color= "#00CC99") + 
  geom_smooth(method = "lm", formula = "y ~ bs(x, df = 7)") + 
  labs(title = "df = 8 (5 knots)") +  theme_bw()

g6 <- ggplot(df, aes(x = acceleration, y = mpg)) + 
  geom_point(alpha = 0.55, color= "#00CC99") + 
  geom_smooth(method = "lm", formula = "y ~ bs(x, df = 8)") + 
  labs(title = "df = 9 (6 knots)") +  theme_bw()

g7 <- ggplot(df, aes(x = acceleration, y = mpg)) + 
  geom_point(alpha = 0.55, color= "#00CC99") + 
  geom_smooth(method = "lm", formula = "y ~ bs(x, df = 9)") + 
  labs(title = "df = 10 (7 knots)") +  theme_bw()

g8 <- ggplot(df, aes(x = acceleration, y = mpg)) + 
  geom_point(alpha = 0.55, color= "#00CC99") + 
  geom_smooth(method = "lm", formula = "y ~ bs(x, df = 10)") + 
  labs(title = "df = 11 (8 knots)")  +  theme_bw()

g9 <- ggplot(df, aes(x = acceleration, y = mpg)) + 
  geom_point(alpha = 0.55, color= "#00CC99") + 
  geom_smooth(method = "lm", formula = "y ~ bs(x, df = 11)") + 
  labs(title = "df = 12 (9 knots)") +  theme_bw()

g10 <- ggplot(df, aes(x = acceleration, y = mpg)) + 
  geom_point(alpha = 0.55, color= "#00CC99") + 
  geom_smooth(method = "lm", formula = "y ~ bs(x, df = 12)") + 
  labs(title = "df = 13 (10 knots)") +  theme_bw()

grid.arrange(g1, g2, g3, g4, g5, g6, g7, g8, g9, g10, ncol = 4)

Secara visual, model basic spline dengan df = 7 lebih nampak mulus, dan ditengah maupun diujung ujung kurva tampak lebih teratur dan tidak liar. Model ini secara visual cukup baik menggambarkan pola hubungan data antara mpg dan acceleration.

Secara Empiris

set.seed(123)

cross_val <- vfold_cv(df,v=10,  repeats=5, strata = "mpg")

dg <- 2:10

best_bspline <- map_dfr(dg, function(i){
metric_bspline <- map_dfr(cross_val$splits,
    function(x){
                mod <- lm(mpg ~ bs(acceleration,df=i),
                         data=df[x$in_id,])
                
                pred <- predict(mod,
                               newdata=df[-x$in_id,])
                
                truth <- df[-x$in_id,]$mpg
                
                rmse <- mlr3measures::rmse(truth = truth,
                                           response = pred
                                           )
                mae <- mlr3measures::mae(truth = truth,
                                           response = pred
                                           )
                metric <- c(rmse,mae)
                names(metric) <- c("rmse","mae")
                return(metric)
}
)

metric_bspline

# menghitung rata-rata untuk 10 folds
mean_metric_bspline <- colMeans(metric_bspline)

mean_metric_bspline
}
)

best_bspline <- cbind(df=dg,best_bspline)
# menampilkan hasil all breaks
datatable(best_bspline)

Plot CV BS

Berdasarkan RMSE

best_bspline$df <- as.factor(best_bspline$df)
ggplot(data=best_bspline, aes(x=df, y=rmse, group=1)) +
  geom_line(linetype = "dashed")+
  geom_point()+
  geom_point(data = best_bspline[which.min(best_bspline$rmse), ], color="#00CC99", 
               size=3) +
  theme_bw()+
  ggtitle("K-Fold Cross Validation") +
  xlab("DF") + ylab("Root Mean Square Error") +
  theme(plot.title = element_text(hjust=0.5, size=11))

Berdasarkan MAE

best_bspline$df <- as.factor(best_bspline$df)
ggplot(data=best_bspline, aes(x=df, y=mae, group=1)) +
  geom_line(linetype = "dashed")+
  geom_point()+
  geom_point(data = best_bspline[which.min(best_bspline$mae), ], color="#00CC99", 
               size=3) +
  theme_bw() +
  ggtitle("K-Fold Cross Validation") +
  xlab("DF") + ylab("Mean Absolute Error") +
  theme(plot.title = element_text(hjust=0.5, size=11))

Berdasarkan bukti empiris dengan repeated cross validation, model basic spline dengan RMSE terkecil dengan knot dengan df = 4 dan pada nilai MAE terkecil terletak pada model dengan df= 7.

Plot Basic Spline

Menentukan titik knots

df_4<-attr(bs(df$acceleration, df=4),"knots")
df_4
##  50% 
## 15.5
df_7<-attr(bs(df$acceleration, df=7),"knots")
df_7
##   20%   40%   60%   80% 
## 13.42 14.80 16.00 17.70

Membuat Plot

#Membuat Plot B spline df=4

pa_bs4 <- ggplot(Auto,aes(x=acceleration, y=mpg)) +
                 geom_point(alpha=0.55, color="#00CC99") +
  stat_smooth(method = "lm", 
               formula = y~bs(x, df=4), 
               lty = 1, se = F) +
  ggtitle("B-Spline (df=4)") +
  xlab("acceleration") + ylab("mile per gallon")+
  theme_bw() +
  theme(panel.background = element_rect(fill = "white", colour = "white"), strip.background = element_blank(), plot.title = element_text(hjust=0.5, size=11)) +
  geom_vline(xintercept = c(15.1), col="grey50", lty=2)

#Membuat Plot B spline df=7


pa_bs7 <- ggplot(Auto,aes(x=acceleration, y=mpg)) +
                 geom_point(alpha=0.55, color="#00CC99") +
  stat_smooth(method = "lm", 
               formula = y~bs(x, df=7), 
               lty = 1, se = F) +
  ggtitle("B-Spline (df=7)") +
  xlab("acceleration") + ylab("mile per gallon")+
  theme_bw() +
  theme(panel.background = element_rect(fill = "white", colour = "white"), strip.background = element_blank(), plot.title = element_text(hjust=0.5, size=11)) +
  geom_vline(xintercept = c(13.42, 14.80, 16.00, 17.70 ), col="grey50", lty=2)

ggarrange( pa_bs4, pa_bs7, ncol=2, nrow = 1)

Natural Spline

Secara Empiris

set.seed(581)

cross_val <- vfold_cv(df,v=10, repeats=3, strata = "mpg")

dg <- 2:15

best_spline3 <- map_dfr(dg, function(i){
metric_spline3 <- map_dfr(cross_val$splits,
    function(x){
            mod <- lm(mpg ~ ns(acceleration,df=i),
                     data=df[x$in_id,])
            pred <- predict(mod,
                           newdata=df[-x$in_id,])
            truth <- df[-x$in_id,]$mpg
            
            rmse <- mlr3measures::rmse(truth = truth,
                                       response = pred
                                       )
            mae <- mlr3measures::mae(truth = truth,
                                       response = pred
                                       )
            metric <- c(rmse,mae)
            names(metric) <- c("rmse","mae")
            return(metric)
}
)

metric_spline3

# menghitung rata-rata untuk 10 folds
mean_metric_spline3 <- colMeans(metric_spline3)

mean_metric_spline3
}
)

best_spline3 <- cbind(df=dg,best_spline3)
# menampilkan hasil all breaks
datatable(best_spline3)

Plot CV NS

Berdasarkan RMSE

best_spline3$df <- as.factor(best_spline3$df)
ggplot(data=best_spline3, aes(x=df, y=rmse, group=1)) +
  geom_line(linetype = "dashed")+
  geom_point()+
  geom_point(data = best_spline3[which.min(best_spline3$rmse), ], color="#00CC99", 
               size=3) +
  theme_bw()+
  ggtitle("K-Fold Cross Validation") +
  xlab("DF") + ylab("Root Mean Square Error") +
  theme(plot.title = element_text(hjust=0.5, size=11))

Berdasarkan MAE

best_spline3$df <- as.factor(best_spline3$df)
ggplot(data=best_spline3, aes(x=df, y=rmse, group=1)) +
  geom_line(linetype = "dashed")+
  geom_point()+
  geom_point(data = best_spline3[which.min(best_spline3$mae), ], color="#00CC99", 
               size=3) +
  theme_bw() +
  ggtitle("K-Fold Cross Validation") +
  xlab("DF") + ylab("Mean Absolute Error") +
  theme(plot.title = element_text(hjust=0.5, size=11))

Plot Natural Spline

Menentukan titik knots

df_8a<-attr(ns(df$acceleration, df=8),"knots")
df_8a
##   12.5%     25%   37.5%     50%   62.5%     75%   87.5% 
## 12.5000 13.7750 14.5000 15.5000 16.2000 17.0250 18.7125

Membuat Plot

ggplot(Auto,aes(x=acceleration, y=mpg)) +
                 geom_point(alpha=0.55, color="#00CC99") +
  stat_smooth(method = "lm", 
               formula = y~ns(x, df=8), 
               lty = 1, se = F) +
  ggtitle("Natural Splines (df=8)") +
  xlab("Acceleration") + ylab("mile per gallon")+
  theme_bw() +
  theme(panel.background = element_rect(fill = "white", colour = "white"), strip.background = element_blank(), plot.title = element_text(hjust=0.5, size=11)) +
  geom_vline(xintercept = df_8a, col="grey50", lty=2)

Model Terbaik

Komparasi model terbaik dari pola hubungan antara mpg dan acceleration dengan nilai RMSE dan MAE terkecil adalah sebagai berikut :

Model Polynomial memiliki nilai RMSE dan MAE terkecil dibandingkan model tak linear lainya, sehingga dalam kasus ini model polynomial dengan derajat 4 merupakan model yang lebih menggambarkan pola hubungan data antara mpg dan acceleration dibandingkan dengan model lainnya.

D. MPG Vs Horsepower

Polynomial Regression

Secara Visual

g1 <- ggplot(df, aes(x = horsepower, y = mpg)) + 
  geom_point(alpha = 0.55, color= "#00CC99") + 
  geom_smooth(method = "lm", formula = "y ~ poly(x, 1, raw = T)") + 
  labs(title = "Polynomial Derajat = 1")+
  theme_bw() +
  theme(plot.title = element_text(size=9))

g2 <- ggplot(df, aes(x = horsepower, y = mpg)) + 
  geom_point(alpha = 0.55, color= "#00CC99") + 
  geom_smooth(method = "lm", formula = "y ~ poly(x, 2, raw = T)") + 
  labs(title = "Polynomial Derajat = 2")+
  theme_bw() +
  theme(plot.title = element_text(size=9))

g3 <- ggplot(df, aes(x = horsepower, y = mpg)) + 
  geom_point(alpha = 0.55, color= "#00CC99") + 
  geom_smooth(method = "lm", formula = "y ~ poly(x, 3, raw = T)") + 
  labs(title = "Polynomial Derajat = 3")+
  theme_bw() +
  theme(plot.title = element_text(size=9))

g4 <- ggplot(df, aes(x = horsepower, y = mpg)) + 
  geom_point(alpha = 0.55, color= "#00CC99") + 
  geom_smooth(method = "lm", formula = "y ~ poly(x, 4, raw = T)") + 
  labs(title = "Polynomial Derajat = 4")+
  theme_bw() +
  theme(plot.title = element_text(size=9))


ggarrange(g1, g2, g3, g4, ncol = 2, nrow = 2, widths = c(10,10) ,heights = c(2,2))

Secara visual, model polinomial derajat 3 dan 4 tampak dapat menggambarkan pola hubungan data mpg dan horsepower lebih baik dibandingkan dengan polinomial derajat lainnya.

Secara Empiris

train_control <- trainControl(method = "repeatedcv",
                              number = 10, repeats = 3)

cv_polynomdh <- function(df,i){
  f <- bquote(mpg ~ poly(horsepower, degree = .(i)))
  set.seed(581)
  model_poly <-  train(as.formula(f),             
                       data = df,                        
                       trControl = train_control,         
                       method = "lm",                      
                       na.action = na.pass)  
  predict <- predict(model_poly, df)
  metric_model_poly <- model_poly$resample[,-4]
  #poly <- apply(metric_model_poly,2,mean)
  return( metric_model_poly)
}

Plot CV

mae <- c()
rmse <- c()
 for ( i in 1:4){
 temp_er <- cv_polynomdh(df,i)
 rmse[i] <- mean(temp_er$RMSE)
 mae [i] <- mean(temp_er$MAE)
 }

#RMSE
ph <- data.frame(degree = 1:4, rmse) %>%
  mutate(min_RMSE = as.numeric(min(rmse) == rmse)) %>%
  ggplot(aes(x = degree, y = rmse)) +
  geom_line(col = "grey55") +
  geom_point(size = 2, aes(col = factor(min_RMSE))) +
  scale_x_continuous(breaks = 1:4, minor_breaks = NULL) +
  scale_y_continuous(breaks = seq(0, 12, 0.2)) +
  scale_color_manual(values = c("deepskyblue3", "green")) +
  theme_bw()+
  theme(legend.position = "none",plot.title = element_text(size=10), plot.subtitle = element_text(size=8)) +
  labs(title = "Auto Dataset - Polynomial Regression",
       subtitle = "Memilih 'horsepower' polynomial degree dengan RMSE",
       x = "Degree",
       y = "CV RMSE")

#MAE
pi <- data.frame(degree = 1:4, mae) %>%
  mutate(min_MAE = as.numeric(min(mae) == mae)) %>%
  ggplot(aes(x = degree, y = mae)) +
  geom_line(col = "grey55") +
  geom_point(size = 2, aes(col = factor(min_MAE))) +
  scale_x_continuous(breaks = 1:4, minor_breaks = NULL) +
  scale_y_continuous(breaks = seq(0, 10, 0.2)) +
  scale_color_manual(values = c("deepskyblue3", "green")) +
  theme_bw()+
  theme(legend.position = "none",plot.title = element_text(size=9),plot.subtitle = element_text(size=8) ) +
  labs(title = "Auto Dataset - Polynomial Regression",
       subtitle = "Memilih 'horsepower' polynomial degree dengan MAE",
       x = "Degree",
       y = "CV MAE")

ggarrange(ph, pi, ncol = 2, nrow =1)

Berdasarkan k fold cv, didapatkan Model Polynomial dengan derajat = 2 yang memiliki RMSE dan MAE terkecil.

Model Terbaik Polynomal

ggplot(df,aes(x=horsepower, y=mpg)) + 
  geom_point(alpha=0.55, color="#00CC99") +
  stat_smooth(method = "lm", 
              formula = y~poly(x,2,raw=T), 
              lty = 1, col = "blue",se = F)+
  theme_bw()+
  ggtitle("Regresi Polinomial Derajat 2") +
  xlab("horsepower") + 
  ylab("mile per gallon")

Picewise constant regression

Secar Visual

g2 <- ggplot(df, aes(x = horsepower, y = mpg)) + 
  geom_point(alpha = 0.55, color= "#00CC99") + 
  geom_smooth(method = "lm", formula = "y ~ cut(x, 2, raw = T)") + 
  labs(title = "Breaks= 2")+
   theme(plot.title = element_text(size=9))

g3 <- ggplot(df, aes(x = horsepower, y = mpg)) + 
  geom_point(alpha = 0.55, color= "#00CC99") + 
  geom_smooth(method = "lm", formula = "y ~ cut(x, 3, raw = T)") + 
  labs(title = "Breaks= 3")+
   theme(plot.title = element_text(size=9))

g4 <- ggplot(df, aes(x = horsepower, y = mpg)) + 
  geom_point(alpha = 0.55, color= "#00CC99") + 
  geom_smooth(method = "lm", formula = "y ~ cut(x, 4, raw = T)") + 
  labs(title = "Breaks= 4")+
   theme(plot.title = element_text(size=9))

g5 <- ggplot(df, aes(x = horsepower, y = mpg)) + 
  geom_point(alpha = 0.55, color= "#00CC99") + 
  geom_smooth(method = "lm", formula = "y ~ cut(x, 5, raw = T)") + 
  labs(title = "Breaks= 5")+
   theme(plot.title = element_text(size=9))

g6 <- ggplot(df, aes(x = horsepower, y = mpg)) + 
  geom_point(alpha = 0.55, color= "#00CC99") + 
  geom_smooth(method = "lm", formula = "y ~ cut(x, 6, raw = T)") + 
  labs(title = "Breaks= 6")+
   theme(plot.title = element_text(size=9))

g7 <- ggplot(df, aes(x = horsepower, y = mpg)) + 
  geom_point(alpha = 0.55, color= "#00CC99") + 
  geom_smooth(method = "lm", formula = "y ~ cut(x, 7, raw = T)") + 
  labs(title = "Breaks= 7")+
   theme(plot.title = element_text(size=9))

g8 <- ggplot(df, aes(x = horsepower, y = mpg)) + 
  geom_point(alpha = 0.55, color= "#00CC99") + 
  geom_smooth(method = "lm", formula = "y ~ cut(x, 8, raw = T)") + 
  labs(title = "Breaks= 8")+
   theme(plot.title = element_text(size=9))
   
g9 <- ggplot(df, aes(x = horsepower, y = mpg)) + 
  geom_point(alpha = 0.55, color= "#00CC99") + 
  geom_smooth(method = "lm", formula = "y ~ cut(x, 9, raw = T)") + 
  labs(title = "Breaks= 9")+
   theme(plot.title = element_text(size=9))

g10 <- ggplot(df, aes(x = horsepower, y = mpg)) + 
  geom_point(alpha = 0.55, color= "#00CC99") + 
  geom_smooth(method = "lm", formula = "y ~ cut(x, 10, raw = T)") + 
  labs(title = "Breaks= 10")+
   theme(plot.title = element_text(size=9))

grid.arrange(g2, g3, g4, g5, g6, g7, g8, g9, g10, ncol = 3)

Visualisasi piecewise contstant tersebut menunjukkan bahwa, model dengan k yang semakin besar memberikan kurva yang semakin mulus, namun diujung kurva tampak kurang beraturan. Dari 9 plot tersebut, terlihat plot 7 dan 8 yang lebih menggambarkan pola hubungan data antara mpg dan acceleration.

Secara Empiris

set.seed(581)

cross_val <- vfold_cv(Auto,v=10,strata = "mpg")

breaks <- 2:10

best_tangga <- map_dfr(breaks, function(i){

metric_tangga <- map_dfr(cross_val$splits,
    function(x){

training <- Auto[x$in_id,]
training$horsepower <- cut(training$horsepower,i)

mod <- lm(mpg ~ horsepower,
         data=training)

labs_x <- levels(mod$model[,2])

labs_x_breaks <- cbind(lower = as.numeric( sub("\\((.+),.*", "\\1", labs_x) ),
                  upper = as.numeric( sub("[^,]*,([^]]*)\\]", "\\1", labs_x) ))


testing <- Auto[-x$in_id,]

horsepower_new <- cut(testing$horsepower,c(labs_x_breaks[1,1],labs_x_breaks[,2]))

pred <- predict(mod,newdata=list(horsepower=horsepower_new))
truth <- testing$mpg

data_eval <- na.omit(data.frame(truth,pred))

rmse <- mlr3measures::rmse(truth = data_eval$truth,
                           response = data_eval$pred
                           )
mae <- mlr3measures::mae(truth = data_eval$truth,
                           response = data_eval$pred
                           )


metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)

metric_tangga



# menghitung rata-rata untuk 10 folds
mean_metric_tangga <- colMeans(metric_tangga)

mean_metric_tangga
}
)
best_tangga <- cbind(breaks=breaks,best_tangga)

# menampilkan hasil all breaks
datatable(best_tangga)

Plot CV_Horsepower

#Plot PC_horsepower_Berdasarkan kriteria RMSE

pch2 <-best_tangga_ac$breaks <- as.factor(best_tangga_ac$breaks)
ggplot(data=best_tangga_ac, aes(x=breaks, y=rmse, group=1)) +
  geom_line(linetype = "dashed")+
  geom_point()+
  geom_point(data = best_tangga_ac[which.min(best_tangga$rmse), ], color="#00CC99", 
               size=3) +
  ggtitle("K-Fold Cross Validation- RMSE Criteria") +
  xlab("Breaks") + 
  ylab("Root Mean Square Error") +
  theme_bw ()

#Plot PC_Weight_Berdasarkan kriteria MAE 

 pch3 <- best_tangga_ac$breaks <- as.factor(best_tangga_ac$breaks)
ggplot(data=best_tangga_ac, aes(x=breaks, y=mae, group=1)) +
  geom_line(linetype = "dashed")+
  geom_point()+
  geom_point(data = best_tangga_ac[which.min(best_tangga$mae), ], color="#00CC99", 
               size=3) +
  ggtitle("K-Fold Cross Validation- MAE Criteria") +
  xlab("Breaks") + 
  ylab(" Mean Absolute Error") +
  theme_bw ()

Berdasarkan k fold cv, didapatkan Model Piecewise Constant dengan knot = 8 yang memiliki RMSE dan MAE terkecil.

Model Terbaik Piecewise Constant

ggplot(df,aes(x=horsepower, y=mpg)) + 
  geom_point(alpha=0.55, color="#00CC99") +
  stat_smooth(method = "lm", 
              formula = y~cut(x,8,raw=T), 
              lty = 1, col = "darkblue",se = F)+
  theme_bw()+
  ggtitle("Regresi Piecewise Constant K= 8") +
  xlab("Acceleration") + 
  ylab("mile per gallon")

Basic Spline

Secara Visual

g1 <- ggplot(df, aes(x = horsepower, y = mpg)) + 
  geom_point(alpha = 0.55, color= "#00CC99") + 
  geom_smooth(method = "lm", formula = "y ~ bs(x, df = 3)") + 
  labs(title = "df = 4 (1 knots)")+ 
  theme_bw()

g2 <- ggplot(df, aes(x = horsepower, y = mpg)) + 
  geom_point(alpha = 0.55, color= "#00CC99") + 
  geom_smooth(method = "lm", formula = "y ~ bs(x, df = 4)") + 
  labs(title = "df = 5 (2 knot)")+  theme_bw()

g3 <- ggplot(df, aes(x = horsepower, y = mpg)) + 
  geom_point(alpha = 0.55, color= "#00CC99") + 
  geom_smooth(method = "lm", formula = "y ~ bs(x, df = 5)") + 
  labs(title = "df = 6 (3 knots)")+  theme_bw()

g4 <- ggplot(df, aes(x = horsepower, y = mpg)) + 
  geom_point(alpha = 0.55, color= "#00CC99") + 
  geom_smooth(method = "lm", formula = "y ~ bs(x, df = 6)") + 
  labs(title = "df = 7 (4 knots)") +  theme_bw()

g5 <- ggplot(df, aes(x = horsepower, y = mpg)) + 
  geom_point(alpha = 0.55, color= "#00CC99") + 
  geom_smooth(method = "lm", formula = "y ~ bs(x, df = 7)") + 
  labs(title = "df = 8 (5 knots)") +  theme_bw()

g6 <- ggplot(df, aes(x = horsepower, y = mpg)) + 
  geom_point(alpha = 0.55, color= "#00CC99") + 
  geom_smooth(method = "lm", formula = "y ~ bs(x, df = 8)") + 
  labs(title = "df = 9 (6 knots)") +  theme_bw()

g7 <- ggplot(df, aes(x = horsepower, y = mpg)) + 
  geom_point(alpha = 0.55, color= "#00CC99") + 
  geom_smooth(method = "lm", formula = "y ~ bs(x, df = 9)") + 
  labs(title = "df = 10 (7 knots)") +  theme_bw()

g8 <- ggplot(df, aes(x = horsepower, y = mpg)) + 
  geom_point(alpha = 0.55, color= "#00CC99") + 
  geom_smooth(method = "lm", formula = "y ~ bs(x, df = 10)") + 
  labs(title = "df = 11 (8 knots)")  +  theme_bw()

g9 <- ggplot(df, aes(x = horsepower, y = mpg)) + 
  geom_point(alpha = 0.55, color= "#00CC99") + 
  geom_smooth(method = "lm", formula = "y ~ bs(x, df = 11)") + 
  labs(title = "df = 12 (9 knots)") +  theme_bw()

g10 <- ggplot(df, aes(x = horsepower, y = mpg)) + 
  geom_point(alpha = 0.55, color= "#00CC99") + 
  geom_smooth(method = "lm", formula = "y ~ bs(x, df = 12)") + 
  labs(title = "df = 13 (10 knots)") +  theme_bw()

grid.arrange(g1, g2, g3, g4, g5, g6, g7, g8, g9, g10, ncol = 4)

Secara visual, model basic spline dengan df = 5 lebih nampak mulus, dan ditengah maupun diujung ujung kurva tampak lebih teratur dan tidak liar. Model ini secara visual cukup baik menggambarkan pola hubungan data antara mpg dan acceleration.

Secara Empiris

set.seed(581)

cross_val <- vfold_cv(df,v=10,  repeats=5, strata = "mpg")

dg <- 2:10

best_bspline <- map_dfr(dg, function(i){
metric_bspline <- map_dfr(cross_val$splits,
    function(x){
                mod <- lm(mpg ~ bs(horsepower,df=i),
                         data=df[x$in_id,])
                
                pred <- predict(mod,
                               newdata=df[-x$in_id,])
                
                truth <- df[-x$in_id,]$mpg
                
                rmse <- mlr3measures::rmse(truth = truth,
                                           response = pred
                                           )
                mae <- mlr3measures::mae(truth = truth,
                                           response = pred
                                           )
                metric <- c(rmse,mae)
                names(metric) <- c("rmse","mae")
                return(metric)
}
)

metric_bspline

# menghitung rata-rata untuk 10 folds
mean_metric_bspline <- colMeans(metric_bspline)

mean_metric_bspline
}
)

best_bspline <- cbind(df=dg,best_bspline)
# menampilkan hasil all breaks
datatable(best_bspline)

Plot CV BS

Berdasarkan RMSE

best_bspline$df <- as.factor(best_bspline$df)
ggplot(data=best_bspline, aes(x=df, y=rmse, group=1)) +
  geom_line(linetype = "dashed")+
  geom_point()+
  geom_point(data = best_bspline[which.min(best_bspline$rmse), ], color="#00CC99", 
               size=3) +
  theme_bw()+
  ggtitle("K-Fold Cross Validation") +
  xlab("DF") + ylab("Root Mean Square Error") +
  theme(plot.title = element_text(hjust=0.5, size=11))

Berdasarkan MAE

best_bspline$df <- as.factor(best_bspline$df)
ggplot(data=best_bspline, aes(x=df, y=mae, group=1)) +
  geom_line(linetype = "dashed")+
  geom_point()+
  geom_point(data = best_bspline[which.min(best_bspline$mae), ], color="#00CC99", 
               size=3) +
  theme_bw() +
  ggtitle("K-Fold Cross Validation") +
  xlab("DF") + ylab("Mean Absolute Error") +
  theme(plot.title = element_text(hjust=0.5, size=11))

Plot Basic Spline

Menentukan titik knots

df_5h<-attr(bs(df$horsepower, df=5),"knots")
df_5h
## 33.33333% 66.66667% 
##        84       110

Membuat Plot

#Membuat Plot B spline df=5

 ggplot(Auto,aes(x=horsepower, y=mpg)) +
                 geom_point(alpha=0.55, color="#00CC99") +
  stat_smooth(method = "lm", 
               formula = y~bs(x, df=5), 
               lty = 1, se = F) +
  ggtitle("B-Spline (df=5)") +
  xlab("horsepower") + ylab("mile per gallon")+
  theme_bw() +
  theme(panel.background = element_rect(fill = "white", colour = "white"), strip.background = element_blank(), plot.title = element_text(hjust=0.5, size=11)) +
  geom_vline(xintercept = df_5h, col="grey50", lty=2)

Natural Spline

Secara Empiris

set.seed(123)

cross_val <- vfold_cv(df,v=10, repeats=3, strata = "mpg")

dg <- 2:20

best_spline3 <- map_dfr(dg, function(i){
metric_spline3 <- map_dfr(cross_val$splits,
    function(x){
mod <- lm(mpg ~ ns(horsepower,df=i),
         data=df[x$in_id,])
pred <- predict(mod,
               newdata=df[-x$in_id,])
truth <- df[-x$in_id,]$mpg

rmse <- mlr3measures::rmse(truth = truth,
                           response = pred
                           )
mae <- mlr3measures::mae(truth = truth,
                           response = pred
                           )
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)

metric_spline3

# menghitung rata-rata untuk 10 folds
mean_metric_spline3 <- colMeans(metric_spline3)

mean_metric_spline3
}
)

best_spline3 <- cbind(df=dg,best_spline3)
# menampilkan hasil all breaks
datatable(best_spline3)

Plot CV NS

Berdasarkan RMSE

best_spline3$df <- as.factor(best_spline3$df)
ggplot(data=best_spline3, aes(x=df, y=rmse, group=1)) +
  geom_line(linetype = "dashed")+
  geom_point()+
  geom_point(data = best_spline3[which.min(best_spline3$rmse), ], color="#00CC99", 
               size=3) +
  theme_bw()+
  ggtitle("K-Fold Cross Validation") +
  xlab("DF") + ylab("Root Mean Square Error") +
  theme(plot.title = element_text(hjust=0.5, size=11))

Berdasarkan MAE

best_spline3$df <- as.factor(best_spline3$df)
ggplot(data=best_spline3, aes(x=df, y=rmse, group=1)) +
  geom_line(linetype = "dashed")+
  geom_point()+
  geom_point(data = best_spline3[which.min(best_spline3$mae), ], color="#00CC99", 
               size=3) +
  theme_bw() +
  ggtitle("K-Fold Cross Validation") +
  xlab("DF") + ylab("Mean Absolute Error") +
  theme(plot.title = element_text(hjust=0.5, size=11))

Plot Natural Spline

Menentukan titik knots

df_10<-attr(ns(Auto$horsepower, df=10),"knots")
df_10
##   10%   20%   30%   40%   50%   60%   70%   80%   90% 
##  67.0  72.0  80.0  88.0  93.5 100.0 110.0 140.0 157.7

Membuat Plot

ggplot(Auto,aes(x=horsepower, y=mpg)) +
                 geom_point(alpha=0.55, color="#00CC99") +
  stat_smooth(method = "lm", 
               formula = y~ns(x, df=10), 
               lty = 1, se = F) +
  ggtitle("Natural Spline (df=10)") +
  xlab("Horse Power") + ylab("mile per gallon")+
  theme_bw() +
  theme(panel.background = element_rect(fill = "white", colour = "white"), strip.background = element_blank(), plot.title = element_text(hjust=0.5, size=11)) +
  geom_vline(xintercept = df_10, col="grey50", lty=2)

Model Terbaik

Komparasi model terbaik dari pola hubungan antara mpg dan horsepower dengan nilai RMSE dan MAE terkecil adalah sebagai berikut :

Model Polynomial memiliki nilai RMSE dan MAE terkecil dibandingkan model tak linear lainya, sehingga dalam kasus ini model polynomial dengan derajat 2 merupakan model yang lebih menggambarkan pola hubungan data antara mpg dan horsepower dibandingkan dengan model lainnya.