library(tidyverse)

## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --

## v ggplot2 3.3.5     v purrr   0.3.4
## v tibble  3.1.3     v dplyr   1.0.7
## v tidyr   1.1.3     v stringr 1.4.0
## v readr   2.0.1     v forcats 0.5.1

## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

library(splines)
library(rsample)
library(ISLR)
library(ggplot2)
attach(Auto)

## The following object is masked from package:ggplot2:
## 
##     mpg

head(Auto)

##   mpg cylinders displacement horsepower weight acceleration year origin
## 1  18         8          307        130   3504         12.0   70      1
## 2  15         8          350        165   3693         11.5   70      1
## 3  18         8          318        150   3436         11.0   70      1
## 4  16         8          304        150   3433         12.0   70      1
## 5  17         8          302        140   3449         10.5   70      1
## 6  15         8          429        198   4341         10.0   70      1
##                        name
## 1 chevrolet chevelle malibu
## 2         buick skylark 320
## 3        plymouth satellite
## 4             amc rebel sst
## 5               ford torino
## 6          ford galaxie 500

MPG vs Weight

Auto1 <- data.frame(cbind(mpg, weight))
ggplot(Auto1,aes(x=weight, y=mpg)) + 
  geom_point(alpha=0.55, color="black") +
  stat_smooth(method = "lm", 
               formula = y~poly(x,1,raw=T), 
               lty = 1, col = "blue",se = F)+
  theme_bw()

mod_linear = lm(mpg~weight,data=Auto1)
summary(mod_linear)

## 
## Call:
## lm(formula = mpg ~ weight, data = Auto1)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -11.9736  -2.7556  -0.3358   2.1379  16.5194 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 46.216524   0.798673   57.87   <2e-16 ***
## weight      -0.007647   0.000258  -29.64   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.333 on 390 degrees of freedom
## Multiple R-squared:  0.6926, Adjusted R-squared:  0.6918 
## F-statistic: 878.8 on 1 and 390 DF,  p-value: < 2.2e-16

Regresi Polinomial

set.seed(123)
cross_val <- vfold_cv(Auto1,v=10,strata = "mpg")

breaks <- 2:10

best_poly <- map_dfr(breaks, function(i){
metric_poly <- map_dfr(cross_val$splits,
    function(x){

mod <- lm(mpg ~ poly(weight,i),
         data=Auto1[x$in_id,])

pred <- predict(mod,
               newdata=Auto1[-x$in_id,])
truth <- Auto1[-x$in_id,]$mpg

rmse <- mlr3measures::rmse(truth = truth,
                           response = pred
                           )
mae <- mlr3measures::mae(truth = truth,
                           response = pred
                           )
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
metric_poly

# menghitung rata-rata
mean_metric_poly <- colMeans(metric_poly)

mean_metric_poly
})
best_poly <- cbind(breaks=breaks, best_poly)
# menampilkan hasil all breaks
best_poly

##   breaks     rmse      mae
## 1      2 4.149977 3.060704
## 2      3 4.158482 3.068020
## 3      4 4.165585 3.067210
## 4      5 4.169760 3.079497
## 5      6 4.178027 3.098482
## 6      7 4.170375 3.100742
## 7      8 4.181430 3.108979
## 8      9 4.254235 3.164729
## 9     10 4.299992 3.167556

best_poly%>% slice_min(rmse)

##   breaks     rmse      mae
## 1      2 4.149977 3.060704

best_poly%>% slice_min(mae)

##   breaks     rmse      mae
## 1      2 4.149977 3.060704

Model Terbaik Polinomial Ordo 2

mod_polinomial=lm(mpg~poly(weight, 2, raw=T), data = Auto1)
mod_polinomial = lm(Auto1$mpg~ poly(Auto$weight,2,raw = T),data=Auto1)

Auto1best<-ggplot(Auto,aes(x=Auto1$weight, y=Auto1$mpg)) + 
  ggtitle("Polinomial Ordo 2")+
  geom_point(alpha=0.55, color="black") +
  stat_smooth(method = "lm", 
               formula = y~poly(x,2,raw=T), 
               lty = 1, col = "blue",se = F)+ theme_bw()
Auto1best

Regresi Fungsi Tangga

set.seed(123)
cross_val <- vfold_cv(Auto1,v=10,strata = "mpg")

breaks <- 3:10

best_tangga <- map_dfr(breaks, function(i){

metric_tangga <- map_dfr(cross_val$splits,
    function(x){

training <- Auto1[x$in_id,]
training$weight <- cut(training$weight,i)

mod <- lm(mpg ~ weight,
         data=training)

labs_x <- levels(mod$model[,2])

labs_x_breaks <- cbind(lower = as.numeric( sub("\\((.+),.*", "\\1", labs_x) ),
                  upper = as.numeric( sub("[^,]*,([^]]*)\\]", "\\1", labs_x) ))


testing <- Auto1[-x$in_id,]

weight_new <- cut(testing$weight,c(labs_x_breaks[1,1],labs_x_breaks[,2]))

pred <- predict(mod,
               newdata=list(weight=weight_new))
truth <- testing$mpg

data_eval <- na.omit(data.frame(truth,pred))

rmse <- mlr3measures::rmse(truth = data_eval$truth,
                           response = data_eval$pred
                           )
mae <- mlr3measures::mae(truth = data_eval$truth,
                           response = data_eval$pred
                           )
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)

metric_tangga

mean_metric_tangga <- colMeans(metric_tangga)

mean_metric_tangga
})
best_tangga <- cbind(breaks=breaks,best_tangga)

best_tangga

##   breaks     rmse      mae
## 1      3 4.868847 3.657505
## 2      4 4.706593 3.633584
## 3      5 4.389515 3.221556
## 4      6 4.201264 3.112221
## 5      7 4.279395 3.191726
## 6      8 4.370964 3.249765
## 7      9 4.341743 3.182396
## 8     10 4.329114 3.176390

best_tangga %>% slice_min(rmse)

##   breaks     rmse      mae
## 1      6 4.201264 3.112221

best_tangga %>% slice_min(mae)

##   breaks     rmse      mae
## 1      6 4.201264 3.112221

Model Terbaik Fungsi Tangga Knot 5

mod_tangga = lm(mpg ~ cut(weight,5),data=Auto1)
summary(mod_tangga)

## 
## Call:
## lm(formula = mpg ~ cut(weight, 5), data = Auto1)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -13.3101  -2.3101  -0.1055   1.8971  18.2945 
## 
## Coefficients:
##                                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                        31.3101     0.4048   77.34   <2e-16 ***
## cut(weight, 5)(2.32e+03,3.02e+03]  -6.2046     0.5841  -10.62   <2e-16 ***
## cut(weight, 5)(3.02e+03,3.73e+03] -12.2930     0.6485  -18.96   <2e-16 ***
## cut(weight, 5)(3.73e+03,4.43e+03] -16.2148     0.6881  -23.56   <2e-16 ***
## cut(weight, 5)(4.43e+03,5.14e+03] -18.5184     0.9882  -18.74   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.416 on 387 degrees of freedom
## Multiple R-squared:  0.6831, Adjusted R-squared:  0.6798 
## F-statistic: 208.6 on 4 and 387 DF,  p-value: < 2.2e-16

Auto1abest<- ggplot(Auto1,aes(x=mpg, y=weight)) +
                  ggtitle("Fungsi Tangga Knot 5")+
  geom_point(alpha=0.55, color="black") +
  stat_smooth(method = "lm", 
               formula = y~cut(x,5), 
               lty = 1, col = "blue",se = F)+
  theme_bw()
Auto1abest

Regresi Spline Df = 6:12

set.seed(123)
cross_val <- vfold_cv(Auto1,v=10,strata = "mpg")

df <- 6:12

best_spline3 <- map_dfr(df, function(i){
metric_spline3 <- map_dfr(cross_val$splits,
    function(x){
mod <- lm(mpg ~ bs(weight,df=i),
         data=Auto1[x$in_id,])
pred <- predict(mod,
               newdata=Auto1[-x$in_id,])
truth <- Auto1[-x$in_id,]$mpg

rmse <- mlr3measures::rmse(truth = truth,
                           response = pred
                           )
mae <- mlr3measures::mae(truth = truth,
                           response = pred
                           )
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)

metric_spline3

# menghitung rata-rata untuk 10 folds
mean_metric_spline3 <- colMeans(metric_spline3)

mean_metric_spline3
}
)

## Warning in bs(weight, degree = 3L, knots = c(`25%` = 2227.5, `50%` = 2792.5, :
## some 'x' values beyond boundary knots may cause ill-conditioned bases

## Warning in bs(weight, degree = 3L, knots = c(`25%` = 2234, `50%` = 2835, : some
## 'x' values beyond boundary knots may cause ill-conditioned bases

## Warning in bs(weight, degree = 3L, knots = c(`20%` = 2155, `40%` = 2577.8, :
## some 'x' values beyond boundary knots may cause ill-conditioned bases

## Warning in bs(weight, degree = 3L, knots = c(`20%` = 2166.8, `40%` = 2604, :
## some 'x' values beyond boundary knots may cause ill-conditioned bases

## Warning in bs(weight, degree = 3L, knots = c(`16.66667%` = 2125, `33.33333%` =
## 2391, : some 'x' values beyond boundary knots may cause ill-conditioned bases

## Warning in bs(weight, degree = 3L, knots = c(`16.66667%` = 2130, `33.33333%` =
## 2406, : some 'x' values beyond boundary knots may cause ill-conditioned bases

## Warning in bs(weight, degree = 3L, knots = c(`14.28571%` = 2076.42857142857, :
## some 'x' values beyond boundary knots may cause ill-conditioned bases

## Warning in bs(weight, degree = 3L, knots = c(`14.28571%` = 2102.28571428571, :
## some 'x' values beyond boundary knots may cause ill-conditioned bases

## Warning in bs(weight, degree = 3L, knots = c(`12.5%` = 2049.375, `25%` =
## 2227.5, : some 'x' values beyond boundary knots may cause ill-conditioned bases

## Warning in bs(weight, degree = 3L, knots = c(`12.5%` = 2051, `25%` = 2234, :
## some 'x' values beyond boundary knots may cause ill-conditioned bases

## Warning in bs(weight, degree = 3L, knots = c(`11.11111%` = 2020, `22.22222%` =
## 2190, : some 'x' values beyond boundary knots may cause ill-conditioned bases

## Warning in bs(weight, degree = 3L, knots = c(`11.11111%` = 2021.66666666667, :
## some 'x' values beyond boundary knots may cause ill-conditioned bases

## Warning in bs(weight, degree = 3L, knots = c(`10%` = 1995.5, `20%` = 2155, :
## some 'x' values beyond boundary knots may cause ill-conditioned bases

## Warning in bs(weight, degree = 3L, knots = c(`10%` = 1996, `20%` = 2166.8, :
## some 'x' values beyond boundary knots may cause ill-conditioned bases

best_spline3 <- cbind(df=df,best_spline3)
# menampilkan hasil all breaks
best_spline3

##   df     rmse      mae
## 1  6 4.169320 3.097698
## 2  7 4.182779 3.098524
## 3  8 4.182940 3.120736
## 4  9 4.140723 3.065880
## 5 10 4.156925 3.096623
## 6 11 4.168753 3.095133
## 7 12 4.172423 3.093106

best_spline3 %>% slice_min(rmse)

##   df     rmse     mae
## 1  9 4.140723 3.06588

best_spline3 %>% slice_min(mae)

##   df     rmse     mae
## 1  9 4.140723 3.06588

Model Terbaik Regresi Spline Df=9

mod_spline3 = lm(mpg ~ ns(weight, df=9), data=Auto1)
summary(mod_spline3)

## 
## Call:
## lm(formula = mpg ~ ns(weight, df = 9), data = Auto1)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -13.3376  -2.5092  -0.4385   1.8081  16.4843 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           32.211      2.345  13.738  < 2e-16 ***
## ns(weight, df = 9)1   -2.185      2.242  -0.975  0.33028    
## ns(weight, df = 9)2   -7.507      2.936  -2.557  0.01094 *  
## ns(weight, df = 9)3   -5.201      2.594  -2.006  0.04561 *  
## ns(weight, df = 9)4  -11.233      2.766  -4.060 5.95e-05 ***
## ns(weight, df = 9)5  -12.573      2.663  -4.721 3.29e-06 ***
## ns(weight, df = 9)6  -15.798      2.687  -5.881 8.93e-09 ***
## ns(weight, df = 9)7  -19.209      1.932  -9.944  < 2e-16 ***
## ns(weight, df = 9)8  -17.570      5.328  -3.298  0.00107 ** 
## ns(weight, df = 9)9  -22.259      2.202 -10.107  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.159 on 382 degrees of freedom
## Multiple R-squared:  0.7226, Adjusted R-squared:  0.7161 
## F-statistic: 110.6 on 9 and 382 DF,  p-value: < 2.2e-16

Auto1bbest <-ggplot(Auto1,aes(x=weight, y=mpg)) +
                  ggtitle("Regresi Spline df=9")+
  geom_point(alpha=0.55, color="black") +
  stat_smooth(method = "lm", 
               formula = y~ns(x, df=9), 
               lty = 1,se = F)
Auto1bbest

Model Terbaik MPG vs Weight

library(gridExtra)

## 
## Attaching package: 'gridExtra'

## The following object is masked from 'package:dplyr':
## 
##     combine

grid.arrange(Auto1best, Auto1abest, Auto1bbest, ncol=2)

Model terbaik untuk data Auto adalah Regresi Polinomial Ordo 2.

MPG vs Acceleration

Auto2 <- data.frame(cbind(mpg, acceleration))
ggplot(Auto2,aes(x=acceleration, y=mpg)) + 
  geom_point(alpha=0.55, color="black") +
  stat_smooth(method = "lm", 
               formula = y~poly(x,1,raw=T), 
               lty = 1, col = "blue",se = F)+
  theme_bw()

mod_linear = lm(mpg~acceleration,data=Auto2)
summary(mod_linear)

## 
## Call:
## lm(formula = mpg ~ acceleration, data = Auto2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -17.989  -5.616  -1.199   4.801  23.239 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    4.8332     2.0485   2.359   0.0188 *  
## acceleration   1.1976     0.1298   9.228   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.08 on 390 degrees of freedom
## Multiple R-squared:  0.1792, Adjusted R-squared:  0.1771 
## F-statistic: 85.15 on 1 and 390 DF,  p-value: < 2.2e-16

Regresi Polinomial

set.seed(123)
cross_val <- vfold_cv(Auto2,v=10,strata = "mpg")

breaks <- 2:10

best_poly <- map_dfr(breaks, function(i){
metric_poly <- map_dfr(cross_val$splits,
    function(x){

mod <- lm(mpg ~ poly(acceleration,i),
         data=Auto2[x$in_id,])

pred <- predict(mod,
               newdata=Auto2[-x$in_id,])
truth <- Auto2[-x$in_id,]$mpg

rmse <- mlr3measures::rmse(truth = truth,
                           response = pred
                           )
mae <- mlr3measures::mae(truth = truth,
                           response = pred
                           )
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
metric_poly

# menghitung rata-rata
mean_metric_poly <- colMeans(metric_poly)

mean_metric_poly
})
best_poly <- cbind(breaks=breaks, best_poly)
# menampilkan hasil all breaks
best_poly

##   breaks     rmse      mae
## 1      2 7.037689 5.755305
## 2      3 7.048668 5.792450
## 3      4 7.008647 5.700795
## 4      5 7.042178 5.729396
## 5      6 7.046243 5.704385
## 6      7 7.097228 5.711475
## 7      8 7.167747 5.767367
## 8      9 7.150072 5.744992
## 9     10 7.164808 5.754932

best_poly%>% slice_min(rmse)

##   breaks     rmse      mae
## 1      4 7.008647 5.700795

best_poly%>% slice_min(mae)

##   breaks     rmse      mae
## 1      4 7.008647 5.700795

Model Terbaik Polinomial Ordo 4

mod_polinomial=lm(mpg~poly(acceleration, 4, raw=T), data = Auto2)
mod_polinomial = lm(Auto1$mpg~ poly(Auto$acceleration,4,raw = T),data=Auto2)

Auto2best<-ggplot(Auto2,aes(x=Auto2$acceleration, y=Auto2$mpg)) + 
  ggtitle("Polinomial Ordo 4")+
  geom_point(alpha=0.55, color="black") +
  stat_smooth(method = "lm", 
               formula = y~poly(x,4,raw=T), 
               lty = 1, col = "blue",se = F)+ theme_bw()
Auto2best

## Warning: Use of `Auto2$acceleration` is discouraged. Use `acceleration` instead.

## Warning: Use of `Auto2$mpg` is discouraged. Use `mpg` instead.

## Warning: Use of `Auto2$acceleration` is discouraged. Use `acceleration` instead.

## Warning: Use of `Auto2$mpg` is discouraged. Use `mpg` instead.

Regresi Fungsi Tangga

set.seed(123)
cross_val <- vfold_cv(Auto2,v=10,strata = "mpg")

breaks <- 3:10

best_tangga <- map_dfr(breaks, function(i){

metric_tangga <- map_dfr(cross_val$splits,
    function(x){

training <- Auto2[x$in_id,]
training$acceleration <- cut(training$acceleration,i)

mod <- lm(mpg ~ acceleration,
         data=training)

labs_x <- levels(mod$model[,2])

labs_x_breaks <- cbind(lower = as.numeric( sub("\\((.+),.*", "\\1", labs_x) ),
                  upper = as.numeric( sub("[^,]*,([^]]*)\\]", "\\1", labs_x) ))


testing <- Auto2[-x$in_id,]

acceleration_new <- cut(testing$acceleration,c(labs_x_breaks[1,1],labs_x_breaks[,2]))

pred <- predict(mod,
               newdata=list(acceleration=acceleration_new))
truth <- testing$mpg

data_eval <- na.omit(data.frame(truth,pred))

rmse <- mlr3measures::rmse(truth = data_eval$truth,
                           response = data_eval$pred
                           )
mae <- mlr3measures::mae(truth = data_eval$truth,
                           response = data_eval$pred
                           )
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)

metric_tangga

mean_metric_tangga <- colMeans(metric_tangga)

mean_metric_tangga
})
best_tangga <- cbind(breaks=breaks,best_tangga)

best_tangga

##   breaks     rmse      mae
## 1      3 7.003365 5.637141
## 2      4 7.243047 5.879089
## 3      5 7.240624 5.948255
## 4      6 6.987921 5.640505
## 5      7 7.202002 5.846671
## 6      8 7.018944 5.692063
## 7      9 7.012388 5.626494
## 8     10 7.145911 5.803971

best_tangga %>% slice_min(rmse)

##   breaks     rmse      mae
## 1      6 6.987921 5.640505

best_tangga %>% slice_min(mae)

##   breaks     rmse      mae
## 1      9 7.012388 5.626494

Model Terbaik Fungsi Tangga Knot 5 dan 8

mod_tangga = lm(mpg ~ cut(acceleration,5),data=Auto2)
summary(mod_tangga)

## 
## Call:
## lm(formula = mpg ~ cut(acceleration, 5), data = Auto2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -16.637  -5.879  -1.186   5.155  23.721 
## 
## Coefficients:
##                                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                       15.186      1.564   9.708  < 2e-16 ***
## cut(acceleration, 5)(11.4,14.7]    5.694      1.682   3.386 0.000782 ***
## cut(acceleration, 5)(14.7,18.1]   10.057      1.659   6.064 3.17e-09 ***
## cut(acceleration, 5)(18.1,21.4]   10.452      1.834   5.698 2.41e-08 ***
## cut(acceleration, 5)(21.4,24.8]   16.760      2.668   6.282 9.02e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.169 on 387 degrees of freedom
## Multiple R-squared:  0.1651, Adjusted R-squared:  0.1564 
## F-statistic: 19.13 on 4 and 387 DF,  p-value: 2.277e-14

Auto2abest<- ggplot(Auto2,aes(x=mpg, y=acceleration)) +
                  ggtitle("Fungsi Tangga Knot 5")+
  geom_point(alpha=0.55, color="black") +
  stat_smooth(method = "lm", 
               formula = y~cut(x,5), 
               lty = 1, col = "blue",se = F)+
  theme_bw()
Auto2abest

mod_tangga = lm(mpg ~ cut(acceleration,8),data=Auto2)
summary(mod_tangga)

## 
## Call:
## lm(formula = mpg ~ cut(acceleration, 8), data = Auto2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -16.4630  -5.2518  -0.7662   4.5657  24.6519 
## 
## Coefficients:
##                                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                       14.600      2.198   6.642 1.06e-10 ***
## cut(acceleration, 8)(10.1,12.2]    1.712      2.500   0.685   0.4940    
## cut(acceleration, 8)(12.2,14.3]    5.348      2.333   2.292   0.0224 *  
## cut(acceleration, 8)(14.3,16.4]   10.548      2.279   4.628 5.05e-06 ***
## cut(acceleration, 8)(16.4,18.5]   10.863      2.330   4.663 4.31e-06 ***
## cut(acceleration, 8)(18.5,20.6]   12.184      2.470   4.932 1.21e-06 ***
## cut(acceleration, 8)(20.6,22.7]   12.454      2.924   4.260 2.58e-05 ***
## cut(acceleration, 8)(22.7,24.8]   19.800      4.112   4.815 2.12e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.951 on 384 degrees of freedom
## Multiple R-squared:  0.2211, Adjusted R-squared:  0.2069 
## F-statistic: 15.57 on 7 and 384 DF,  p-value: < 2.2e-16

Auto2bbest<- ggplot(Auto2,aes(x=mpg, y=acceleration)) +
                  ggtitle("Fungsi Tangga Knot 8")+
  geom_point(alpha=0.55, color="black") +
  stat_smooth(method = "lm", 
               formula = y~cut(x,8), 
               lty = 1, col = "blue",se = F)+
  theme_bw()
Auto2bbest

Regresi Spline Df = 6:12

set.seed(123)
cross_val <- vfold_cv(Auto1,v=10,strata = "mpg")

df <- 6:12

best_spline3 <- map_dfr(df, function(i){
metric_spline3 <- map_dfr(cross_val$splits,
    function(x){
mod <- lm(mpg ~ bs(acceleration,df=i),
         data=Auto2[x$in_id,])
pred <- predict(mod,
               newdata=Auto2[-x$in_id,])
truth <- Auto1[-x$in_id,]$mpg

rmse <- mlr3measures::rmse(truth = truth,
                           response = pred
                           )
mae <- mlr3measures::mae(truth = truth,
                           response = pred
                           )
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)

metric_spline3

# menghitung rata-rata untuk 10 folds
mean_metric_spline3 <- colMeans(metric_spline3)

mean_metric_spline3
}
)

## Warning in bs(acceleration, degree = 3L, knots = c(`25%` = 13.7, `50%` = 15.5, :
## some 'x' values beyond boundary knots may cause ill-conditioned bases

## Warning in bs(acceleration, degree = 3L, knots = c(`25%` = 13.8, `50%` = 15.5, :
## some 'x' values beyond boundary knots may cause ill-conditioned bases

## Warning in bs(acceleration, degree = 3L, knots = c(`20%` = 13.42, `40%` =
## 14.8, : some 'x' values beyond boundary knots may cause ill-conditioned bases

## Warning in bs(acceleration, degree = 3L, knots = c(`20%` = 13.5, `40%` = 14.8, :
## some 'x' values beyond boundary knots may cause ill-conditioned bases

## Warning in bs(acceleration, degree = 3L, knots = c(`16.66667%` = 13, `33.33333%`
## = 14.5, : some 'x' values beyond boundary knots may cause ill-conditioned bases

## Warning in bs(acceleration, degree = 3L, knots = c(`16.66667%` = 13, `33.33333%`
## = 14.5, : some 'x' values beyond boundary knots may cause ill-conditioned bases

## Warning in bs(acceleration, degree = 3L, knots = c(`14.28571%` = 12.8,
## `28.57143%` = 14, : some 'x' values beyond boundary knots may cause ill-
## conditioned bases

## Warning in bs(acceleration, degree = 3L, knots = c(`14.28571%` = 12.8,
## `28.57143%` = 14, : some 'x' values beyond boundary knots may cause ill-
## conditioned bases

## Warning in bs(acceleration, degree = 3L, knots = c(`12.5%` = 12.5, `25%` =
## 13.7, : some 'x' values beyond boundary knots may cause ill-conditioned bases

## Warning in bs(acceleration, degree = 3L, knots = c(`12.5%` = 12.5, `25%` =
## 13.8, : some 'x' values beyond boundary knots may cause ill-conditioned bases

## Warning in bs(acceleration, degree = 3L, knots = c(`11.11111%` = 12.2,
## `22.22222%` = 13.5, : some 'x' values beyond boundary knots may cause ill-
## conditioned bases

## Warning in bs(acceleration, degree = 3L, knots = c(`11.11111%` = 12.5,
## `22.22222%` = 13.5, : some 'x' values beyond boundary knots may cause ill-
## conditioned bases

## Warning in bs(acceleration, degree = 3L, knots = c(`10%` = 12, `20%` = 13.42, :
## some 'x' values beyond boundary knots may cause ill-conditioned bases

## Warning in bs(acceleration, degree = 3L, knots = c(`10%` = 12.02, `20%` =
## 13.5, : some 'x' values beyond boundary knots may cause ill-conditioned bases

best_spline3 <- cbind(df=df,best_spline3)
# menampilkan hasil all breaks
best_spline3

##   df     rmse      mae
## 1  6 7.027505 5.700351
## 2  7 7.014061 5.636830
## 3  8 7.039149 5.664311
## 4  9 7.044218 5.686154
## 5 10 7.019215 5.651353
## 6 11 7.075200 5.697487
## 7 12 7.061283 5.700106

best_spline3 %>% slice_min(rmse)

##   df     rmse     mae
## 1  7 7.014061 5.63683

best_spline3 %>% slice_min(mae)

##   df     rmse     mae
## 1  7 7.014061 5.63683

Model Terbaik Regresi Spline Df=7

mod_spline3 = lm(mpg ~ ns(acceleration, df=7), data=Auto2)
summary(mod_spline3)

## 
## Call:
## lm(formula = mpg ~ ns(acceleration, df = 7), data = Auto2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -17.0600  -4.9927  -0.6968   4.8321  23.3231 
## 
## Coefficients:
##                           Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                 14.450      3.530   4.094 5.18e-05 ***
## ns(acceleration, df = 7)1    6.727      3.410   1.973  0.04923 *  
## ns(acceleration, df = 7)2   13.110      4.114   3.187  0.00156 ** 
## ns(acceleration, df = 7)3    7.781      3.793   2.051  0.04091 *  
## ns(acceleration, df = 7)4   13.760      3.900   3.528  0.00047 ***
## ns(acceleration, df = 7)5    7.985      3.033   2.633  0.00882 ** 
## ns(acceleration, df = 7)6   15.990      8.223   1.945  0.05254 .  
## ns(acceleration, df = 7)7   21.223      3.944   5.381 1.29e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.915 on 384 degrees of freedom
## Multiple R-squared:  0.2292, Adjusted R-squared:  0.2151 
## F-statistic: 16.31 on 7 and 384 DF,  p-value: < 2.2e-16

Auto2cbest <-ggplot(Auto2,aes(x=acceleration, y=mpg)) +
                  ggtitle("Regresi Spline df=7")+
  geom_point(alpha=0.55, color="black") +
  stat_smooth(method = "lm", 
               formula = y~ns(x, df=7), 
               lty = 1,se = F)
Auto2cbest

Model Terbaik MPG vs Acceleration

library(gridExtra)
grid.arrange(Auto2best, Auto2abest, Auto2bbest, Auto2cbest)

## Warning: Use of `Auto2$acceleration` is discouraged. Use `acceleration` instead.

## Warning: Use of `Auto2$mpg` is discouraged. Use `mpg` instead.

## Warning: Use of `Auto2$acceleration` is discouraged. Use `acceleration` instead.

## Warning: Use of `Auto2$mpg` is discouraged. Use `mpg` instead.

Model terbaik untuk data Auto adalah Regresi Polinomial Ordo 4.

MPG vs Year

Auto3 <- data.frame(cbind(mpg, year))
ggplot(Auto3,aes(x=year, y=mpg)) + 
  geom_point(alpha=0.55, color="black") +
  stat_smooth(method = "lm", 
               formula = y~poly(x,1,raw=T), 
               lty = 1, col = "blue",se = F)+
  theme_bw()

mod_linear = lm(mpg~year,data=Auto3)
summary(mod_linear)

## 
## Call:
## lm(formula = mpg ~ year, data = Auto3)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -12.0212  -5.4411  -0.4412   4.9739  18.2088 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -70.01167    6.64516  -10.54   <2e-16 ***
## year          1.23004    0.08736   14.08   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.363 on 390 degrees of freedom
## Multiple R-squared:  0.337,  Adjusted R-squared:  0.3353 
## F-statistic: 198.3 on 1 and 390 DF,  p-value: < 2.2e-16

Regresi Polinomial

set.seed(123)
cross_val <- vfold_cv(Auto2,v=10,strata = "mpg")

breaks <- 2:10

best_poly <- map_dfr(breaks, function(i){
metric_poly <- map_dfr(cross_val$splits,
    function(x){

mod <- lm(mpg ~ poly(year,i),
         data=Auto3[x$in_id,])

pred <- predict(mod,
               newdata=Auto3[-x$in_id,])
truth <- Auto3[-x$in_id,]$mpg

rmse <- mlr3measures::rmse(truth = truth,
                           response = pred
                           )
mae <- mlr3measures::mae(truth = truth,
                           response = pred
                           )
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
metric_poly

# menghitung rata-rata
mean_metric_poly <- colMeans(metric_poly)

mean_metric_poly
})
best_poly <- cbind(breaks=breaks, best_poly)
# menampilkan hasil all breaks
best_poly

##   breaks     rmse      mae
## 1      2 6.222687 5.271367
## 2      3 6.230273 5.290035
## 3      4 6.226953 5.261184
## 4      5 6.246170 5.271508
## 5      6 6.236244 5.251166
## 6      7 6.177974 5.211158
## 7      8 6.191036 5.210099
## 8      9 6.150086 5.210250
## 9     10 6.081733 5.203132

best_poly%>% slice_min(rmse)

##   breaks     rmse      mae
## 1     10 6.081733 5.203132

best_poly%>% slice_min(mae)

##   breaks     rmse      mae
## 1     10 6.081733 5.203132

Model Terbaik Polinomial Ordo 10

mod_polinomial=lm(mpg~poly(year, 10, raw=T), data = Auto3)
mod_polinomial = lm(Auto3$mpg~ poly(Auto3$year,10,raw = T),data=Auto3)

Auto3best<-ggplot(Auto3,aes(x=Auto3$year, y=Auto3$mpg)) + 
  ggtitle("Polinomial Ordo 10")+
  geom_point(alpha=0.55, color="black") +
  stat_smooth(method = "lm", 
               formula = y~poly(x,10,raw=T), 
               lty = 1, col = "blue",se = F)+ theme_bw()
Auto3best

## Warning: Use of `Auto3$year` is discouraged. Use `year` instead.

## Warning: Use of `Auto3$mpg` is discouraged. Use `mpg` instead.

## Warning: Use of `Auto3$year` is discouraged. Use `year` instead.

## Warning: Use of `Auto3$mpg` is discouraged. Use `mpg` instead.

## Warning in predict.lm(model, newdata = new_data_frame(list(x = xseq)), se.fit =
## se, : prediction from a rank-deficient fit may be misleading

Regresi Fungsi Tangga

set.seed(123)
cross_val <- vfold_cv(Auto2,v=10,strata = "mpg")

breaks <- 3:10

best_tangga <- map_dfr(breaks, function(i){

metric_tangga <- map_dfr(cross_val$splits,
    function(x){

training <- Auto3[x$in_id,]
training$year <- cut(training$year,i)

mod <- lm(mpg ~ year,
         data=training)

labs_x <- levels(mod$model[,2])

labs_x_breaks <- cbind(lower = as.numeric( sub("\\((.+),.*", "\\1", labs_x) ),
                  upper = as.numeric( sub("[^,]*,([^]]*)\\]", "\\1", labs_x) ))


testing <- Auto3[-x$in_id,]

year_new <- cut(testing$year,c(labs_x_breaks[1,1],labs_x_breaks[,2]))

pred <- predict(mod,
               newdata=list(year=year_new))
truth <- testing$mpg

data_eval <- na.omit(data.frame(truth,pred))

rmse <- mlr3measures::rmse(truth = data_eval$truth,
                           response = data_eval$pred
                           )
mae <- mlr3measures::mae(truth = data_eval$truth,
                           response = data_eval$pred
                           )
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)

metric_tangga

mean_metric_tangga <- colMeans(metric_tangga)

mean_metric_tangga
})
best_tangga <- cbind(breaks=breaks,best_tangga)

best_tangga

##   breaks     rmse      mae
## 1      3 6.436057 5.346212
## 2      4 6.116266 5.159484
## 3      5 6.215227 5.233387
## 4      6 6.430025 5.386850
## 5      7 6.361801 5.339819
## 6      8 6.121254 5.204424
## 7      9 6.214099 5.259250
## 8     10 6.143947 5.216197

best_tangga %>% slice_min(rmse)

##   breaks     rmse      mae
## 1      4 6.116266 5.159484

best_tangga %>% slice_min(mae)

##   breaks     rmse      mae
## 1      4 6.116266 5.159484

Model Terbaik Fungsi Tangga Knot 3

mod_tangga = lm(mpg ~ cut(year,3),data=Auto3)
summary(mod_tangga)

## 
## Call:
## lm(formula = mpg ~ cut(year, 3), data = Auto3)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -14.72  -5.22  -1.04   4.78  20.74 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          19.2200     0.5196  36.993  < 2e-16 ***
## cut(year, 3)(74,78]   3.1409     0.7657   4.102 4.99e-05 ***
## cut(year, 3)(78,82]  11.0046     0.7907  13.918  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.363 on 389 degrees of freedom
## Multiple R-squared:  0.3387, Adjusted R-squared:  0.3353 
## F-statistic: 99.62 on 2 and 389 DF,  p-value: < 2.2e-16

Auto3abest<- ggplot(Auto3,aes(x=mpg, y=year)) +
                  ggtitle("Fungsi Tangga Knot 3")+
  geom_point(alpha=0.55, color="black") +
  stat_smooth(method = "lm", 
               formula = y~cut(x,3), 
               lty = 1, col = "blue",se = F)+
  theme_bw()
Auto3abest

Regresi Spline Df = 6:12

set.seed(123)
cross_val <- vfold_cv(Auto3,v=10,strata = "mpg")

df <- 6:12

best_spline3 <- map_dfr(df, function(i){
metric_spline3 <- map_dfr(cross_val$splits,
    function(x){
mod <- lm(mpg ~ bs(year,df=i),
         data=Auto3[x$in_id,])
pred <- predict(mod,
               newdata=Auto3[-x$in_id,])
truth <- Auto3[-x$in_id,]$mpg

rmse <- mlr3measures::rmse(truth = truth,
                           response = pred
                           )
mae <- mlr3measures::mae(truth = truth,
                           response = pred
                           )
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)

metric_spline3

# menghitung rata-rata untuk 10 folds
mean_metric_spline3 <- colMeans(metric_spline3)

mean_metric_spline3
}
)

best_spline3 <- cbind(df=df,best_spline3)
# menampilkan hasil all breaks
best_spline3

##   df     rmse      mae
## 1  6 6.227589 5.238634
## 2  7 6.214179 5.240899
## 3  8 6.199109 5.227936
## 4  9 6.131527 5.201834
## 5 10 6.087050 5.206879
## 6 11 6.083599 5.195066
## 7 12 6.123817 5.226678

best_spline3 %>% slice_min(rmse)

##   df     rmse      mae
## 1 11 6.083599 5.195066

best_spline3 %>% slice_min(mae)

##   df     rmse      mae
## 1 11 6.083599 5.195066

Model Terbaik Regresi Spline Df=11

mod_spline3 = lm(mpg ~ ns(year, df=11), data=Auto3)
summary(mod_spline3)

## 
## Call:
## lm(formula = mpg ~ ns(year, df = 11), data = Auto3)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -14.6834  -4.7129  -0.9992   4.2887  19.1824 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          17.6895     1.1104  15.931  < 2e-16 ***
## ns(year, df = 11)1    1.1100     2.3042   0.482 0.630259    
## ns(year, df = 11)2   -3.1654     2.1301  -1.486 0.138097    
## ns(year, df = 11)3    8.0610     2.3655   3.408 0.000725 ***
## ns(year, df = 11)4    0.5022     2.3002   0.218 0.827291    
## ns(year, df = 11)5    5.4459     2.4630   2.211 0.027624 *  
## ns(year, df = 11)6    7.2107     2.3021   3.132 0.001869 ** 
## ns(year, df = 11)7    4.4140     2.2389   1.972 0.049388 *  
## ns(year, df = 11)8   20.5266     2.3691   8.664  < 2e-16 ***
## ns(year, df = 11)9    8.6745     2.1079   4.115 4.75e-05 ***
## ns(year, df = 11)10  16.9914     2.9507   5.758 1.75e-08 ***
## ns(year, df = 11)11  11.5737     1.5679   7.382 9.96e-13 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.98 on 380 degrees of freedom
## Multiple R-squared:  0.4296, Adjusted R-squared:  0.413 
## F-statistic: 26.01 on 11 and 380 DF,  p-value: < 2.2e-16

Auto3bbest <-ggplot(Auto3,aes(x=year, y=mpg)) +
                  ggtitle("Regresi Spline df=11")+
  geom_point(alpha=0.55, color="black") +
  stat_smooth(method = "lm", 
               formula = y~ns(x, df=11), 
               lty = 1,se = F)
Auto3bbest

Model Terbaik MPG vs Year

library(gridExtra)
grid.arrange(Auto3best, Auto3abest, Auto3bbest, ncol=2)

## Warning: Use of `Auto3$year` is discouraged. Use `year` instead.

## Warning: Use of `Auto3$mpg` is discouraged. Use `mpg` instead.

## Warning: Use of `Auto3$year` is discouraged. Use `year` instead.

## Warning: Use of `Auto3$mpg` is discouraged. Use `mpg` instead.

## Warning in predict.lm(model, newdata = new_data_frame(list(x = xseq)), se.fit =
## se, : prediction from a rank-deficient fit may be misleading

Model terbaik untuk data Auto adalah Regresi Polinomial Ordo 10.

Pemodelan Pola Hubungan Tak Linier

MPG vs Weight

Regresi Polinomial

Model Terbaik Polinomial Ordo 2

Regresi Fungsi Tangga

Model Terbaik Fungsi Tangga Knot 5

Regresi Spline Df = 6:12

Model Terbaik Regresi Spline Df=9

Model Terbaik MPG vs Weight

MPG vs Acceleration

Regresi Polinomial

Model Terbaik Polinomial Ordo 4

Regresi Fungsi Tangga

Model Terbaik Fungsi Tangga Knot 5 dan 8

Regresi Spline Df = 6:12

Model Terbaik Regresi Spline Df=7

Model Terbaik MPG vs Acceleration

MPG vs Year

Regresi Polinomial

Model Terbaik Polinomial Ordo 10

Regresi Fungsi Tangga

Model Terbaik Fungsi Tangga Knot 3

Regresi Spline Df = 6:12

Model Terbaik Regresi Spline Df=11

Model Terbaik MPG vs Year