Soal

Lakukan cross validation untuk menghasilkan pemodelan mpg vs horsepower optimal menggunakan 3 metode berikut, kemudian bandingkan hasilnya:

  • Regrsi Polinomial

  • Piecewise Constant

  • Natural Cubic Splines

Lakukan hal di atas untuk masing-masing sub-set data berdasarkan asal negara produsennya (Amerika, Eropa, Jepang).

  • Plot model terbaik dalam satu frame.

  • Berikan ulasan terhadap hasil Anda.

library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5     v purrr   0.3.4
## v tibble  3.1.3     v dplyr   1.0.7
## v tidyr   1.1.4     v stringr 1.4.0
## v readr   2.0.2     v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(splines)
library(rsample)
library(mlr3measures)
## In order to avoid name clashes, do not attach 'mlr3measures'. Instead, only load the namespace with `requireNamespace("mlrmeasures")` and access the measures directly via `::`, e.g. `mlr3measures::auc()`.
library(ISLR)
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following objects are masked from 'package:mlr3measures':
## 
##     precision, recall, sensitivity, specificity
## The following object is masked from 'package:purrr':
## 
##     lift
library(DT)
library(cowplot)
library(ggpmisc)
## Loading required package: ggpp
## 
## Attaching package: 'ggpp'
## The following object is masked from 'package:ggplot2':
## 
##     annotate
library(tibble)

Eksplorasi Data

attach(Auto)
## The following object is masked from package:ggplot2:
## 
##     mpg
head(Auto)
##   mpg cylinders displacement horsepower weight acceleration year origin
## 1  18         8          307        130   3504         12.0   70      1
## 2  15         8          350        165   3693         11.5   70      1
## 3  18         8          318        150   3436         11.0   70      1
## 4  16         8          304        150   3433         12.0   70      1
## 5  17         8          302        140   3449         10.5   70      1
## 6  15         8          429        198   4341         10.0   70      1
##                        name
## 1 chevrolet chevelle malibu
## 2         buick skylark 320
## 3        plymouth satellite
## 4             amc rebel sst
## 5               ford torino
## 6          ford galaxie 500
data= Auto

A <- ggplot(Auto,aes(x=horsepower,y=mpg),group = origin) +
  geom_point(aes(color = factor(origin)))+
  stat_smooth(aes(x=horsepower,y=mpg),method = "lm", 
              formula = y~x,lty = 1,
              col = "red",se = F)+
  theme_bw()+
  ggtitle("Scatter Plot Mpg dan Horse Power")+
  xlab("Horse Power")+
  ylab("Mpg") 

B <- ggplot(Auto, aes(x = horsepower, y = mpg, colour = factor(origin))) +
  geom_point() + stat_smooth(method = "lm", formula = y ~ x, se = FALSE)

plot_grid(A,B)

x1 = Auto$origin == "1"
data1 <- Auto[x1,]

x2 <- Auto$origin =="2"
data2 <- Auto[x2,]

x3 <- Auto$origin == "3"
data3 <- Auto[x3,]

a <-ggplot(data1, aes(x = horsepower, y = mpg),color="orange") +
  geom_point(color="orange") + 
  stat_smooth(method = "lm", formula = y ~ x, se = FALSE)

b <- ggplot(data2, aes(x = horsepower, y = mpg),color="deeppink4") +
  geom_point(color="deeppink4")+
  stat_smooth(method = "lm", formula = y ~ x, se = FALSE)

c <- ggplot(data3, aes(x = horsepower, y = mpg),color="green4")+
  geom_point(color="green4")+
  stat_smooth(method = "lm", formula = y ~ x, se = FALSE)

plot_grid(a,b,c,labels = c( "Amerika", "Eropa","Jepang"))

Data Seluruhnya

Regresi Polinomial

set.seed(039)
cv.poly <- function(data){
cv <- vfold_cv(data,v=5)
ordo <- 2:10
best_poly <- map_dfr(ordo, function(i) {
  metric_poly <- map_dfr(cv$splits,
                            function(x) {
                              train <- data[x$in_id,]
                              test <- data[-x$in_id,]
                              mod <- lm(mpg ~ poly(horsepower,ordo=i),data=train)
                              pred <- predict(mod, newdata=test)
                              rmse <- RMSE(pred,test$mpg)
                              mae <- MAE(pred,test$mpg)
                              R2 <- R2(pred,test$mpg)
                              metric <- c(rmse,mae,R2)
                              names(metric) <- c("rmse","mae","R2")
                              return(metric)
                              }
                            )
  metric_poly
  mean_metric_poly <- colMeans(metric_poly)
  mean_metric_poly
  }
)
best_poly <- cbind(ordo=ordo,best_poly)
}

best_poly <- cv.poly(Auto)
datatable(best_poly)
poly <- ggplot(data=best_poly, aes(x=ordo, y=rmse, group=1)) +
  geom_line(linetype = "dashed")+
  geom_point(color = "blue")+
  theme_bw()+
  ggtitle("K-Fold Cross Validation") +
  xlab("Degree of Polynomial") + 
  ylab("Root Mean Square Error")
  
poly.2 <- ggplot(Auto,aes(x=horsepower, y=mpg),group=origin) + 
  geom_point(aes(color=factor(origin)),alpha=0.55) +
  stat_smooth(method = "lm", 
              formula = y~poly(x,2,raw=T), 
              lty = 1, col = "black",se = F)+
  theme_bw()+
  ggtitle("Regresi Polinomial Ordo 2") +
  xlab("Horse Power") + 
  ylab("mile per gallon") 

poly.7 <- ggplot(Auto,aes(x=horsepower, y=mpg),group=origin) + 
  geom_point(aes(color=factor(origin)),alpha=0.55) +
  stat_smooth(method = "lm", 
              formula = y~poly(x,7,raw=T), 
              lty = 1, col = "black",se = F)+
  theme_bw()+
  ggtitle("Regresi Polinomial Ordo 7") +
  xlab("Horse Power") + 
  ylab("mile per gallon") 

poly

plot_grid(poly.2,poly.7, label_size = 6)

Fungsi Tangga(Picewise Constant)

set.seed(123)
cv.pc <- function(data){
cv <- vfold_cv(data,v=5)
breaks <- 2:10
cv_tangga <- map_dfr(breaks, function(i){
  metric_tangga <- map_dfr(cv$splits,
                           function(x){
                             training <- data[x$in_id,]
                             training$horsepower <- cut(training$horsepower,i)
                             mod <- lm(mpg ~ horsepower, data=training)
                             labs_x <- levels(mod$model[,2])
                             labs_x_breaks <- cbind(lower = as.numeric( sub("\\((.+),.*", "\\1", labs_x) ),
                                                    upper = as.numeric( sub("[^,]*,([^]]*)\\]", "\\1", labs_x)))
                             testing <- data[-x$in_id,]
                             horsepower_new <- cut(testing$horsepower,c(labs_x_breaks[1,1],labs_x_breaks[,2]))
                             pred <- predict(mod,newdata=list(horsepower=horsepower_new))
                             truth <- testing$mpg
                             data_eval <- na.omit(data.frame(truth,pred))
                             rmse <- rmse(truth = data_eval$truth,response = data_eval$pred)
                             mae <- mae(truth = data_eval$truth,response = data_eval$pred)
                             R2 <- rsq(truth = data_eval$truth,response = data_eval$pred)
                             metric <- c(rmse,mae,R2)
                             names(metric) <- c("rmse","mae","R2")
                             return(metric)
                             }
                           )

metric_tangga
mean_metric_tangga <- colMeans(metric_tangga)
mean_metric_tangga
})
cv_tangga <- cbind(breaks=breaks,cv_tangga)
}
cv_tangga <- cv.pc(Auto)
datatable(cv_tangga)
cv_pice_rmse <- ggplot(data=cv_tangga, aes(x=breaks, y=rmse,group=1)) +
  geom_line(linetype = "dashed")+
  geom_point(color = "blue")+
  theme_bw()+
  ggtitle("K-Fold Cross Validation") +
  xlab("Breaks of Picewise Constant") + 
  ylab("Root Mean Square Error")

pice8 <- ggplot(Auto,aes(x=horsepower, y=mpg),group=origin) + 
  geom_point(aes(color=factor(origin)),alpha=0.55) +
  stat_smooth(method = "lm", 
              formula = y~cut(x,8,raw=T), 
              lty = 1, col = "black",se = F)+
  theme_bw()+
  ggtitle("Regresi PC Breaks 8") +
  xlab("Horse Power") + 
  ylab("mile per gallon")  

plot_grid(cv_pice_rmse,pice8)

Natural Cubic Spline

set.seed(123)
cv.sp <- function(data){
cv <- vfold_cv(data,v=5)
df <- 2:10
cv_spline <- map_dfr(df, function(i){
metric.spline3 <- map_dfr(cv$splits,
                  function(x){
                    train <- data[x$in_id,]
                    test <- data[-x$in_id,]
                    mod <- lm(mpg ~ ns(horsepower,df=i),data=train)
                    pred <- predict(mod, newdata=test)
                    rmse <- RMSE(pred,test$mpg)
                    mae <- MAE(pred,test$mpg)
                    R2 <- R2(pred,test$mpg)
                    metric <- c(rmse,mae,R2)
                    names(metric) <- c("rmse","mae","R2")
                    return(metric)
                }
      )
    metric.spline3
    mean.metric.spline3 <- colMeans(metric.spline3) #rata-rata untuk 5 folds
    mean.metric.spline3
  }
)

cv_spline <- cbind(df=df,cv_spline)
}
cv_spline <- cv.sp(Auto)
datatable(cv_spline)
attr(ns(Auto$horsepower, df=7),"knots")
## 14.28571% 28.57143% 42.85714% 57.14286% 71.42857% 85.71429% 
##  68.85714  79.71429  89.57143  98.00000 113.57143 150.00000
set.seed(123)
cross.val <- vfold_cv(Auto,v=5)

knot10<- map_dfr(cross.val$splits,
    function(x){
      mod <- lm(mpg ~ ns(horsepower, knots = c(67,72,80,88,93,100,110,140,157)),
      data=Auto[x$in_id,])
      pred <- predict(mod,newdata=Auto[-x$in_id,])
      test <- Auto[-x$in_id,]
      rmse <- RMSE(pred,test$mpg)
      mae <- MAE(pred,test$mpg)
      R2 <- R2(pred,test$mpg)
      metric <- c(rmse,mae,R2)
      names(metric) <- c("rmse","mae","R2")
      return(metric)
}
)

mat_knot10 <- colMeans(knot10)
mat_knot10
##      rmse       mae        R2 
## 4.3030765 3.1982022 0.6991418
cv_spline_rmse <- ggplot(data=cv_spline, aes(x=df, y=rmse,group=1)) +
  geom_line(linetype = "dashed")+
  geom_point(color = "blue")+
  theme_bw()+
  ggtitle("K-Fold Cross Validation") +
  xlab("Df of Natural Cubic Spline ") + 
  ylab("Root Mean Square Error")

pspline <- ggplot(Auto,aes(x=horsepower, y=mpg),group=origin) + 
  geom_point(aes(color=factor(origin)),alpha=0.55) +
  stat_smooth(method = "lm", 
              formula = y~ns(x,knots = c(67,72,80,88,93,100,110,140,157)), 
              lty = 1, col = "black",se = F)+
  theme_bw()+
  ggtitle("Regresi NCS") +
  xlab("Horse Power") + 
  ylab("mile per gallon")+
  geom_vline(xintercept = c(67,72,80,88,93,100,110,140,157), col="grey50", lty=2)


plot_grid(cv_spline_rmse,pspline)

Perbandingan Model

plot_grid(poly.2,pice8,pspline)

ggplot(Auto,aes(x = horsepower, y = mpg)) +
  geom_point(color ="hotpink")+
  stat_smooth(method = "lm", 
              formula = y~poly(x,2,raw=T),lty = 1,
              col = "red",se = F)+
  stat_smooth(method = "lm", 
              formula = y~cut(x,8,raw=T),lty = 1,
              col = "green",se = F)+
  stat_smooth(method = "lm",
              formula =y ~ ns(x,10),
              lty = 1,col = "black",se = F)+
  theme_bw()+
  ggtitle("Scatter Plot Mpg dan Horse Power")+
  xlab("Horse Power")+
  ylab("Mpg") 

nilai_metric <- rbind(best_poly %>% select(-1) %>% slice_min(rmse),
                      cv_tangga %>% select(-1) %>% slice_min(rmse),
                      cv_spline %>% select(-1) %>% slice_min(rmse))
nama_model <- c("Poly 2","Tangga 8","NSC(10)")
best_model <- data.frame(nama_model,nilai_metric)
datatable(best_model)

Data Amerika

Regresi Polinomial

set.seed(355)
best_poly1 <- cv.poly(data1)
datatable(best_poly)
cv_poly1 <- ggplot(data=best_poly1, aes(x=ordo, y=rmse, group=1)) +
  geom_line(linetype = "dashed")+
  geom_point(color = "blue")+
  theme_bw()+
  ggtitle("K-Fold Cross Validation") +
  xlab("Degree of Polynomial") + 
  ylab("Root Mean Square Error")

poly_data1<- ggplot(data1,aes(x=horsepower, y=mpg)) + 
  geom_point(color="orange") +
  stat_smooth(method = "lm", 
              formula = y~poly(x,2,raw=T), 
              lty = 1, col = "black",se = F)+
  theme_bw()+
  ggtitle("Regresi Polinomial Ordo 2") +
  xlab("Horse Power") + 
  ylab("mile per gallon")  
plot_grid(cv_poly1, poly_data1, labels = c( "cv_poly1", "poly_data1"), label_size = 4)

Fungsi Tangga(Picewise Constant)

set.seed(0366)
cv_tangga1 <- cv.pc(data1)
datatable(cv_tangga1)
cv_pice1 <- ggplot(data=cv_tangga1, aes(x=breaks, y=rmse,group=1)) +
  geom_line(linetype = "dashed")+
  geom_point(color = "orchid3")+
  theme_bw()+
  ggtitle("K-Fold Cross Validation") +
  xlab("Breaks of Picewise Constant") + 
  ylab("Root Mean Square Error")

pice1 <- ggplot(data1,aes(x=horsepower, y=mpg)) + 
  geom_point(color="orchid2") +
  stat_smooth(method = "lm", 
              formula = y~cut(x,9,raw=T), 
              lty = 1, col = "black",se = F)+
  theme_bw()+
  ggtitle("Picewise Constant Breaks 9") +
  xlab("Horsepower") + 
  ylab("mile per gallon") 

plot_grid(cv_pice1, pice1, labels = c("cv_pice1","pice1"), label_size = 4)

Natural Cubic Spline

set.seed(035)
cv_spline1 <- cv.sp(data1)
datatable(cv_spline)
cv_splin1 <- ggplot(data=cv_spline1, aes(x=df, y=rmse,group=1)) +
  geom_line(linetype = "dashed")+
  geom_point(color = "deepskyblue1")+
  theme_bw()+
  ggtitle("K-Fold Cross Validation") +
  xlab("Df of Natural Cubic Spline ") + 
  ylab("Root Mean Square Error")

attr(ns(data1$horsepower, df=5),"knots")
##   20%   40%   60%   80% 
##  85.0  99.2 122.0 150.0
set.seed(123)
cross.val <- vfold_cv(data1,v=5)

knot5<- map_dfr(cross.val$splits,
    function(x){
      mod <- lm(mpg ~ ns(horsepower, knots = c(85,99.2,122,150)),
      data=Auto[x$in_id,])
      pred <- predict(mod,newdata=Auto[-x$in_id,])
      test <- Auto[-x$in_id,]
      rmse <- RMSE(pred,test$mpg)
      mae <- MAE(pred,test$mpg)
      R2 <- R2(pred,test$mpg)
      metric <- c(rmse,mae,R2)
      names(metric) <- c("rmse","mae","R2")
      return(metric)
}
)

pspline1 <- colMeans(knot5)

pspline1 <- ggplot(data1,aes(x=horsepower, y=mpg)) + 
  geom_point(color="deepskyblue1") +
  stat_smooth(method = "lm", 
              formula = y~ns(x,knots = c(85,99.2,122,150)), 
              lty = 1, col = "black",se = F)+
  theme_bw()+
  ggtitle("Regresi NCS Df 5") +
  xlab("Horse Power") + 
  ylab("mile per gallon")+ 
  geom_vline(xintercept = c(85,99.2,122,150), col="grey50", lty=2)

plot_grid(cv_splin1, pspline1, labels = c("CV spline","Plot Spline"), label_size = 4)

Perbandingan Model

plot_grid(poly_data1,pice1,pspline1)

ggplot(data1,aes(x = horsepower, y = mpg)) +
  geom_point(color ="hotpink")+
  stat_smooth(method = "lm", 
              formula = y~poly(x,2,raw=T),lty = 1,
              col = "red",se = F)+
  stat_smooth(method = "lm", 
              formula = y~cut(x,9,raw=T),lty = 1,
              col = "green",se = F)+
  stat_smooth(method = "lm",
              formula =y ~ ns(x,5),
              lty = 1,col = "black",se = F)+
  theme_bw()+
  ggtitle("Scatter Plot Mpg dan Horse Power")+
  xlab("Horse Power")+
  ylab("Mpg") 

nilai_metric <- rbind(best_poly1 %>% select(-1) %>% slice_min(rmse),
                      cv_tangga1 %>% select(-1) %>% slice_min(rmse),
                      cv_spline1 %>% select(-1) %>% slice_min(rmse))
nama_model <- c("Poly 2","Tangga 9","NSC(5)")
best_model <- data.frame(nama_model,nilai_metric)
datatable(best_model)

Data Eropa

Regresi Polinomial

set.seed(031)
best_poly2 <- cv.poly(data2)
datatable(best_poly2)
cv_poly2 <- ggplot(data=best_poly2, aes(x=ordo, y=rmse, group=1)) +
  geom_line(linetype = "dashed")+
  geom_point(color = "blue")+
  theme_bw()+
  ggtitle("K-Fold Cross Validation") +
  xlab("Degree of Polynomial") + 
  ylab("Root Mean Square Error")

poly_data2<- ggplot(data2,aes(x=horsepower, y=mpg)) + 
  geom_point(color="orange") +
  stat_smooth(method = "lm", 
              formula = y~poly(x,2,raw=T), 
              lty = 1, col = "black",se = F)+
  theme_bw()+
  ggtitle("Regresi Polinomial Ordo 2") +
  xlab("Horse Power") + 
  ylab("mile per gallon")  

plot_grid(cv_poly2, poly_data2, labels = c( "cv_poly2", "poly_data2"), label_size = 4)

Fungsi Tangga(Picewise Constant)

Natural Cubic Spline

cv_splined <- cv.sp(data2)
datatable(cv_splined)
splin2 <- ggplot(data=cv_splined, aes(x=df, y=rmse,group=1)) +
  geom_line(linetype = "dashed")+
  geom_point(color = "deepskyblue2")+
  theme_bw()+
  ggtitle("K-Fold Cross Validation") +
  xlab("Knot of Natural Cubic Spline ") + 
  ylab("Root Mean Square Error")

attr(ns(data2$horsepower, df=3),"knots")
## 33.33333% 66.66667% 
##        71        87
set.seed(123)
cross.val <- vfold_cv(data2,v=5)

knot3<- map_dfr(cross.val$splits,
    function(x){
      mod <- lm(mpg ~ ns(horsepower, knots = c(71,87)),
      data=Auto[x$in_id,])
      pred <- predict(mod,newdata=data2[-x$in_id,])
      test <- data2[-x$in_id,]
      rmse <- RMSE(pred,test$mpg)
      mae <- MAE(pred,test$mpg)
      R2 <- R2(pred,test$mpg)
      metric <- c(rmse,mae,R2)
      names(metric) <- c("rmse","mae","R2")
      return(metric)
}
)

pspline2 <- colMeans(knot3)

p_sline2 <- ggplot(data2,aes(x=horsepower, y=mpg)) + 
  geom_point(color="deepskyblue2") +
   stat_smooth(method = "lm", 
              formula = y~ns(x,knots = c(71,87)), 
              lty = 1, col = "black",se = F)+
  theme_bw()+
  ggtitle("Regresi NCS Df 3") +
  xlab("Horse Power") + 
  ylab("mile per gallon")+ 
  geom_vline(xintercept = c(90,140), col="grey50", lty=2)

plot_grid(splin2,p_sline2)

Perbandingan Model

plot_grid(poly_data2,pice2,p_sline2)

ggplot(data2,aes(x = horsepower, y = mpg)) +
  geom_point(color ="hotpink")+
  stat_smooth(method = "lm", 
              formula = y~poly(x,2,raw=T),lty = 1,
              col = "red",se = F)+
  stat_smooth(method = "lm", 
              formula = y~cut(x,4,raw=T),lty = 1,
              col = "green",se = F)+
  stat_smooth(method = "lm",
              formula =y ~ ns(x,3),
              lty = 1,col = "black",se = F)+
  theme_bw()+
  ggtitle("Scatter Plot Mpg dan Horse Power")+
  xlab("Horse Power")+
  ylab("Mpg") 

nilai_metric <- rbind(best_poly2 %>% select(-1) %>% slice_min(rmse),
                      cv_tangga2 %>% select(-1) %>% slice_min(rmse),
                      cv_splined %>% select(-1) %>% slice_min(rmse))
nama_model <- c("Poly 2","Tangga 4","NSC(3)")
best_model <- data.frame(nama_model,nilai_metric)
datatable(best_model)

Data Jepang

Regresi Polinomial

set.seed(031)
best_poly3 <- cv.poly(data3)
datatable(best_poly3)
cv_poly3 <- ggplot(data=best_poly3, aes(x=ordo, y=rmse, group=1)) +
  geom_line(linetype = "dashed")+
  geom_point(color = "blue")+
  theme_bw()+
  ggtitle("K-Fold Cross Validation") +
  xlab("Degree of Polynomial") + 
  ylab("Root Mean Square Error")

poly_data3<- ggplot(data3,aes(x=horsepower, y=mpg)) + 
  geom_point(color="orange") +
  stat_smooth(method = "lm", 
              formula = y~poly(x,3,raw=T), 
              lty = 1, col = "black",se = F)+
  theme_bw()+
  ggtitle("Regresi Polinomial Ordo 3 ") +
  xlab("Horse Power") + 
  ylab("mile per gallon")  

plot_grid(cv_poly3, poly_data3, labels = c( "cv_poly3", "poly_data2"), label_size = 4)

Fungsi Tangga(Picewise Constant)

set.seed(039)
cv_tangga3 <- cv.pce(data3)
datatable(cv_tangga3)
cv_pice3 <- ggplot(data=cv_tangga3, aes(x=breaks, y=rmse,group=1)) +
  geom_line(linetype = "dashed")+
  geom_point(color = "orchid4")+
  theme_bw()+
  ggtitle("K-Fold Cross Validation") +
  xlab("Breaks of Picewise Constant") + 
  ylab("Root Mean Square Error")

pice3 <- ggplot(data3,aes(x=horsepower, y=mpg)) + 
  geom_point(color="orchid4") +
  stat_smooth(method = "lm", 
              formula = y~cut(x,5,raw=T), 
              lty = 1, col = "black",se = F)+
  theme_bw()+
  ggtitle("Picewise Constant Breaks 5") +
  xlab("Horse Power") + 
  ylab("mile per gallon") 

plot_grid(cv_pice3, pice3, labels = c("cv_pice2","pice2"), label_size = 4)

Natural Cubic Spline

set.seed(039)
cv_spline3 <- cv.sp(data3)

cv_splin3 <- ggplot(data=cv_spline3, aes(x=df, y=rmse,group=1)) +
  geom_line(linetype = "dashed")+
  geom_point(color = "deepskyblue1")+
  theme_bw()+
  ggtitle("K-Fold Cross Validation") +
  xlab("Df of Natural Cubic Spline ") + 
  ylab("Root Mean Square Error")

attr(ns(data3$horsepower, df=3),"knots")
## 33.33333% 66.66667% 
##        68        90
knot3.3<- map_dfr(cross.val$splits,
    function(x){
      mod <- lm(mpg ~ ns(horsepower, knots = c(68,90)),
      data=Auto[x$in_id,])
      pred <- predict(mod,newdata=data2[-x$in_id,])
      test <- data2[-x$in_id,]
      rmse <- RMSE(pred,test$mpg)
      mae <- MAE(pred,test$mpg)
      R2 <- R2(pred,test$mpg)
      metric <- c(rmse,mae,R2)
      names(metric) <- c("rmse","mae","R2")
      return(metric)
}
)

pspline2 <- colMeans(knot3.3)

psline3 <- ggplot(data3,aes(x=horsepower, y=mpg)) + 
  geom_point(color="deepskyblue1") +
  stat_smooth(method = "lm", 
              formula = y~ns(x,3), 
              lty = 1, col = "black",se = F)+
  theme_bw()+
  ggtitle("Regresi NCS Knot 3") +
  xlab("Horse Power 3") + 
  ylab("mile per gallon")+ 
  geom_vline(xintercept = c(68,90), col="grey50", lty=2)

plot_grid(cv_splin3, psline3, labels = c("CV spline","Plot Spline"), label_size = 4)

Perbandingan Model

plot_grid(poly_data3,pice3,psline3)

ggplot(data3,aes(x = horsepower, y = mpg)) +
  geom_point(color ="hotpink")+
  stat_smooth(method = "lm", 
              formula = y~poly(x,3,raw=T),lty = 1,
              col = "red",se = F)+
  stat_smooth(method = "lm", 
              formula = y~cut(x,5,raw=T),lty = 1,
              col = "green",se = F)+
  stat_smooth(method = "lm",
              formula =y ~ ns(x,3),
              lty = 1,col = "black",se = F)+
  theme_bw()+
  ggtitle("Scatter Plot Mpg dan Horse Power")+
  xlab("Horse Power")+
  ylab("Mpg") 

nilai_metric <- rbind(best_poly3 %>% select(-1) %>% slice_min(rmse),
                      cv_tangga3 %>% select(-1) %>% slice_min(rmse),
                      cv_spline3 %>% select(-1) %>% slice_min(rmse))
nama_model <- c("Poly 3","Tangga 5","NCS(3)")
best_model <- data.frame(nama_model,nilai_metric)
datatable(best_model)

Model Terbaik

nilai_metric <- rbind(cv_spline%>% select(-1) %>% slice_min(rmse),
                      cv_spline1%>% select(-1) %>% slice_min(rmse),
                      best_poly2 %>% select(-1) %>% slice_min(rmse),
                      best_poly3 %>% select(-1) %>% slice_min(rmse))
nama_model <- c("Seluruh dataset (NCS 10)","Amerika (NCS 5)","Eropa (Poly 2)","Jepang (Poly 3)")
best_model <- data.frame(nama_model,nilai_metric)
datatable(best_model)
plot_grid(pspline,pspline1,poly_data2,poly_data3)