MOVING BEYOND LINEARITY

SOAL

Data bisa diperoleh untuk tugas dapat diperoleh di package ISLR.Silahkan Install dulu packagenya.

Gunakan model-model non-linier yang sudah dipelajari pada data Auto dengan peubah respon mpg dan pilih tiga kolom untuk dijadikan peubah prediktor.

  1. Apakah ada bukti untuk hubungan non-linier untuk peubah-peubah yang anda pilih? Buat visualisasi untuk Mendukung jawaban Anda.

  2. Tentukan model non-linear terbaik untuk masing pasangan peubah (Secara visual dan/atau secara empiris)

DATA

Data yang digunakan adalah data Auto dari Package ISLR. Data terdiri dari 9 variabel dan 392 observasi atau pengamatan.Pada kali ini variabel yang digunakan ada 4 variabel, yaitu:

mpg : mile per gallon (Y)

horsepower : Kekuatan mesin (X1)

dispacement : Perpindahan (X2)

weight : Berat (X3)

datatable(Auto) 
Auto1 <- Auto %>% select(mpg,horsepower,displacement,weight)
datatable(Auto1)
par(mfrow=c(1,1))

hp <- plot(Auto1$horsepower, Auto$mpg,pch=19, col="brown",
     xlab="Horse Power", ylab="Mile per Gallon")

dp <- plot(Auto1$displacement, Auto$mpg,pch=19, col="brown",
     xlab="Displacement", ylab="Mile per Gallon")

w <- plot(Auto1$weight, Auto$mpg,pch=19, col="brown",
     xlab="Weight", ylab="Mile per Gallon")

Scatter plot terdiri dari 392 kendaraan dimana masing-masing kendaraan diukur empat peubah/variabel, yaitu horse power (X1), displacement (X2), weight (X3) dan mile per gallon (Y). Jika diamati lebih detail dari scatter plot nya, pola data ini cenderung tidak linier, maka ada beberapa metode yang bisa digunakan. Metode yang akan digunakan pada kali ini ada 3 yaitu Regresi Polinomial, Piecewise constant, dan Natural cubic splines.

mpg (Y) vs horsepower (X1)

1. Regresi Polinomial

cv_poly1 <- function(Auto1){
  set.seed(2154)
  cross_val <- vfold_cv(Auto1,v=10,strata = "mpg")
  metric_poly1 <- map_dfr(cross_val$splits,function(x){
    mod <- lm(mpg ~ poly(horsepower,1,raw = T),data=Auto1[x$in_id,])
    pred <- predict(mod,newdata=Auto1[-x$in_id,])
    truth <- Auto1[-x$in_id,]$mpg
    rmse <- mlr3measures::rmse(truth = truth,response = pred)
    metric <- c(rmse)
    names(metric) <- c("rmse")
  return(metric)
 }
)
  
#menghitung rmse rata-rata untuk 10 folds
poly1<- colMeans(metric_poly1) 
return(poly1) 
}

cv_poly2 <- function(Auto1){
  set.seed(2154)
  cross_val <- vfold_cv(Auto1,v=10,strata = "mpg")
  metric_poly2 <- map_dfr(cross_val$splits,function(x){
    mod <- lm(mpg ~ poly(horsepower,2,raw = T),data=Auto1[x$in_id,])
    pred <- predict(mod,newdata=Auto1[-x$in_id,])
    truth <- Auto1[-x$in_id,]$mpg
    rmse <- mlr3measures::rmse(truth = truth,response = pred)
    metric <- c(rmse)
    names(metric) <- c("rmse")
  return(metric)
 }
)
  
#menghitung rmse rata-rata untuk 10 folds
poly2<- colMeans(metric_poly2) 
return(poly2) 
}

cv_poly3 <- function(Auto1){
  set.seed(2154)
  cross_val <- vfold_cv(Auto1,v=10,strata = "mpg")
  metric_poly3 <- map_dfr(cross_val$splits,function(x){
    mod <- lm(mpg ~ poly(horsepower,3,raw = T),data=Auto1[x$in_id,])
    pred <- predict(mod,newdata=Auto1[-x$in_id,])
    truth <- Auto1[-x$in_id,]$mpg
    rmse <- mlr3measures::rmse(truth = truth,response = pred)
    metric <- c(rmse)
    names(metric) <- c("rmse")
  return(metric)
 }
)
  
#menghitung rmse rata-rata untuk 10 folds
poly3<- colMeans(metric_poly3) 
return(poly3) 
}

cv_poly4 <- function(Auto1){
  set.seed(2154)
  cross_val <- vfold_cv(Auto1,v=10,strata = "mpg")
  metric_poly4 <- map_dfr(cross_val$splits,function(x){
    mod <- lm(mpg ~ poly(horsepower,4,raw = T),data=Auto1[x$in_id,])
    pred <- predict(mod,newdata=Auto1[-x$in_id,])
    truth <- Auto1[-x$in_id,]$mpg
    rmse <- mlr3measures::rmse(truth = truth,response = pred)
    metric <- c(rmse)
    names(metric) <- c("rmse")
  return(metric)
 }
)
  
#menghitung rmse rata-rata untuk 10 folds
poly4<- colMeans(metric_poly4) 
return(poly4) 
}
cv_poly <- rbind (cv_poly1(Auto1),cv_poly2(Auto1),cv_poly3(Auto1),cv_poly4(Auto1))
cv_poly
##          rmse
## [1,] 4.909546
## [2,] 4.353575
## [3,] 4.352157
## [4,] 4.363377
best.poly <- min(cv_poly)
best.poly
## [1] 4.352157

Pemilihan model regresi polinomial terbaik dari data horsepower vs mpg menggunakan cross validation (CV) dengan 10 fold (lipatan). Secara empirik melihat nilai rmse, diperoleh bahwa regresi polinomial ordo 3 memiliki nilai rmse terkecil. Ini berarti, untuk model regresi polinomial terbaik dipilih yang polinomial ordo 3.

poly.1 <- ggplot(Auto1,aes(x=horsepower, y=mpg)) + geom_point(alpha=0.55,
                color="brown") + stat_smooth(method = "lm", formula 
                =y~poly(x,1,raw=T), lty = 1, col = "blue",se = F)+
         theme_bw()

poly.2 <- ggplot(Auto1,aes(x=horsepower, y=mpg)) + geom_point(alpha=0.55,
                color="brown") + stat_smooth(method = "lm", formula 
                =y~poly(x,2,raw=T), lty = 1, col = "blue",se = F)+
         theme_bw()

poly.3 <- ggplot(Auto1,aes(x=horsepower, y=mpg)) + geom_point(alpha=0.55,
                color="brown") + stat_smooth(method = "lm", formula 
                =y~poly(x,3,raw=T), lty = 1, col = "blue",se = F)+
         theme_bw()

poly.4 <- ggplot(Auto1,aes(x=horsepower, y=mpg)) + geom_point(alpha=0.55,
                color="brown") + stat_smooth(method = "lm", formula 
                =y~poly(x,4,raw=T), lty = 1, col = "blue",se = F)+
         theme_bw()

plot_grid(poly.1, poly.2, poly.3, poly.4, labels=c("Regresi.Linier","Polinomial2", "Polinomial3", "Polinomial4"), label_size = 6)

2. Piecewise Constant

cv_pc <- function(Auto1){
  set.seed(2154)
  cross_val <- vfold_cv(Auto1,v=10,strata = "mpg")
  breaks <- 3:12
  best_tangga <- map_dfr(breaks, function(i){
    metric_tangga <- map_dfr(cross_val$splits,function(x){
      training <- Auto1[x$in_id,]
      training$horsepower <- cut(training$horsepower,i)
      mod <- lm(mpg ~ horsepower,data=training)
      labs_x <- levels(mod$model[,2])
      labs_x_breaks<-cbind(lower = as.numeric(sub("\\((.+),.*","\\1",labs_x)),
                           upper=as.numeric(sub("[^,]*,([^]]*)\\]","\\1", 
                           labs_x)))
      testing <- Auto1[-x$in_id,]
      horsepower_new<-cut(testing$horsepower,c(labs_x_breaks[1,1],
                                               labs_x_breaks[,2]))
      pred <- predict(mod,newdata=list(horsepower=horsepower_new))
      truth <- testing$mpg
      data_eval <- na.omit(data.frame(truth,pred))
      rmse<-mlr3measures::rmse(truth=data_eval$truth,response = data_eval$pred)
      metric <- c(rmse)
      names(metric) <- c("rmse")
  return(metric)
 }
)
    
#Menghitung rata-rata 10 fold
colMeans(metric_tangga)
})
  
# menampilkan hasil all breaks
best_tangga <- cbind(breaks=breaks,best_tangga)
rmse <- as.data.frame(best_tangga %>% slice_min(rmse))
return(rbind(rmse))
}
cv_pc(Auto1)
##   breaks     rmse
## 1      8 4.408006

Secara empiris berdasarkan rmse banyaknya interval yang terbaik adalah 8.

ggplot(Auto1,aes(x=horsepower, y=mpg)) +
  geom_point(alpha=0.55, color="brown") +
  stat_smooth(method = "lm",formula = y~cut(x,8),lty = 1, col = "blue",se = F) +
  theme_bw()

Berdasarkan empiris diperoleh interval yang terbaik dengan melihat nilai rmse terkecil adalah interval 8. Jika dilihat berdasarkan visual semakin banyak interval maka akan semakin mulus. kurva fungsi tangga (piecewise constant) pada interval 8 membentuk pola dan mengikuti sebaran data.

3. Natural cubic splines

Penentuan banyaknya basis

set.seed(2154)
cross.val <- vfold_cv(Auto1,v=10,strata = "mpg")
df <- 1:8
best.spline3 <- map_dfr(df, function(i){
  metric.spline3 <- map_dfr(cross.val$splits,
  function(x){
  mod <- lm(mpg ~ ns(horsepower,df=i),data=Auto1[x$in_id,])
  pred <- predict(mod,newdata=Auto1[-x$in_id,])
  truth <- Auto1[-x$in_id,]$mpg
  rmse <- mlr3measures::rmse(truth = truth,response = pred)
  metric <- c(rmse)
  names(metric) <- c("rmse")
  return(metric)
  }
)
metric.spline3
    
#rata-rata untuk 10 folds
mean.metric.spline3 <- colMeans(metric.spline3) 
mean.metric.spline3
})

best.spline3 <- cbind(df=df,best.spline3)
basis<-data.frame(rbind(best.spline3 %>% slice_min(rmse))) 
basis
##   df    rmse
## 1  7 4.27459

Menentukan knot berdasarkan basis terpilih yaitu 7

#nilai knots yang ditentukan oleh komputer
attr(ns(Auto1$horsepower, df=7),"knots")
## 14.28571% 28.57143% 42.85714% 57.14286% 71.42857% 85.71429% 
##  68.85714  79.71429  89.57143  98.00000 113.57143 150.00000

Menentukan knot secara terencana yaitu 4

#nilai knots yang tererncana
attr(ns(Auto1$horsepower, df=4),"knots")
##   25%   50%   75% 
##  75.0  93.5 126.0
# Knot dari basis terpilih
mod_spline3ns = lm(mpg ~ ns(horsepower, knots =                         
                c(68.85714,79.71429,89.57143,98,113.57143,150)),data=Auto1)
summary(mod_spline3ns)
## 
## Call:
## lm(formula = mpg ~ ns(horsepower, knots = c(68.85714, 79.71429, 
##     89.57143, 98, 113.57143, 150)), data = Auto1)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -15.6158  -2.5868  -0.1339   2.2474  14.4829 
## 
## Coefficients:
##                                                                              Estimate
## (Intercept)                                                                    33.748
## ns(horsepower, knots = c(68.85714, 79.71429, 89.57143, 98, 113.57143, 150))1   -7.888
## ns(horsepower, knots = c(68.85714, 79.71429, 89.57143, 98, 113.57143, 150))2   -7.844
## ns(horsepower, knots = c(68.85714, 79.71429, 89.57143, 98, 113.57143, 150))3  -14.226
## ns(horsepower, knots = c(68.85714, 79.71429, 89.57143, 98, 113.57143, 150))4  -12.527
## ns(horsepower, knots = c(68.85714, 79.71429, 89.57143, 98, 113.57143, 150))5  -24.049
## ns(horsepower, knots = c(68.85714, 79.71429, 89.57143, 98, 113.57143, 150))6  -19.387
## ns(horsepower, knots = c(68.85714, 79.71429, 89.57143, 98, 113.57143, 150))7  -20.803
##                                                                              Std. Error
## (Intercept)                                                                       1.563
## ns(horsepower, knots = c(68.85714, 79.71429, 89.57143, 98, 113.57143, 150))1      1.694
## ns(horsepower, knots = c(68.85714, 79.71429, 89.57143, 98, 113.57143, 150))2      1.923
## ns(horsepower, knots = c(68.85714, 79.71429, 89.57143, 98, 113.57143, 150))3      1.812
## ns(horsepower, knots = c(68.85714, 79.71429, 89.57143, 98, 113.57143, 150))4      2.077
## ns(horsepower, knots = c(68.85714, 79.71429, 89.57143, 98, 113.57143, 150))5      1.715
## ns(horsepower, knots = c(68.85714, 79.71429, 89.57143, 98, 113.57143, 150))6      3.664
## ns(horsepower, knots = c(68.85714, 79.71429, 89.57143, 98, 113.57143, 150))7      1.761
##                                                                              t value
## (Intercept)                                                                   21.592
## ns(horsepower, knots = c(68.85714, 79.71429, 89.57143, 98, 113.57143, 150))1  -4.657
## ns(horsepower, knots = c(68.85714, 79.71429, 89.57143, 98, 113.57143, 150))2  -4.078
## ns(horsepower, knots = c(68.85714, 79.71429, 89.57143, 98, 113.57143, 150))3  -7.853
## ns(horsepower, knots = c(68.85714, 79.71429, 89.57143, 98, 113.57143, 150))4  -6.032
## ns(horsepower, knots = c(68.85714, 79.71429, 89.57143, 98, 113.57143, 150))5 -14.021
## ns(horsepower, knots = c(68.85714, 79.71429, 89.57143, 98, 113.57143, 150))6  -5.291
## ns(horsepower, knots = c(68.85714, 79.71429, 89.57143, 98, 113.57143, 150))7 -11.816
##                                                                              Pr(>|t|)
## (Intercept)                                                                   < 2e-16
## ns(horsepower, knots = c(68.85714, 79.71429, 89.57143, 98, 113.57143, 150))1 4.43e-06
## ns(horsepower, knots = c(68.85714, 79.71429, 89.57143, 98, 113.57143, 150))2 5.52e-05
## ns(horsepower, knots = c(68.85714, 79.71429, 89.57143, 98, 113.57143, 150))3 4.11e-14
## ns(horsepower, knots = c(68.85714, 79.71429, 89.57143, 98, 113.57143, 150))4 3.81e-09
## ns(horsepower, knots = c(68.85714, 79.71429, 89.57143, 98, 113.57143, 150))5  < 2e-16
## ns(horsepower, knots = c(68.85714, 79.71429, 89.57143, 98, 113.57143, 150))6 2.05e-07
## ns(horsepower, knots = c(68.85714, 79.71429, 89.57143, 98, 113.57143, 150))7  < 2e-16
##                                                                                 
## (Intercept)                                                                  ***
## ns(horsepower, knots = c(68.85714, 79.71429, 89.57143, 98, 113.57143, 150))1 ***
## ns(horsepower, knots = c(68.85714, 79.71429, 89.57143, 98, 113.57143, 150))2 ***
## ns(horsepower, knots = c(68.85714, 79.71429, 89.57143, 98, 113.57143, 150))3 ***
## ns(horsepower, knots = c(68.85714, 79.71429, 89.57143, 98, 113.57143, 150))4 ***
## ns(horsepower, knots = c(68.85714, 79.71429, 89.57143, 98, 113.57143, 150))5 ***
## ns(horsepower, knots = c(68.85714, 79.71429, 89.57143, 98, 113.57143, 150))6 ***
## ns(horsepower, knots = c(68.85714, 79.71429, 89.57143, 98, 113.57143, 150))7 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.272 on 384 degrees of freedom
## Multiple R-squared:  0.7057, Adjusted R-squared:  0.7004 
## F-statistic: 131.6 on 7 and 384 DF,  p-value: < 2.2e-16
# Knot dari basis yang terencana
mod_spline3ns1 = lm(mpg ~ ns(horsepower, knots = c(75,93.5,126)),data=Auto1)
summary(mod_spline3ns1)
## 
## Call:
## lm(formula = mpg ~ ns(horsepower, knots = c(75, 93.5, 126)), 
##     data = Auto1)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -15.5781  -2.4975  -0.2753   2.0543  15.7782 
## 
## Coefficients:
##                                           Estimate Std. Error t value Pr(>|t|)
## (Intercept)                                 36.019      1.326  27.167   <2e-16
## ns(horsepower, knots = c(75, 93.5, 126))1  -15.110      1.308 -11.554   <2e-16
## ns(horsepower, knots = c(75, 93.5, 126))2  -20.243      1.290 -15.692   <2e-16
## ns(horsepower, knots = c(75, 93.5, 126))3  -26.848      3.019  -8.892   <2e-16
## ns(horsepower, knots = c(75, 93.5, 126))4  -21.761      1.569 -13.874   <2e-16
##                                              
## (Intercept)                               ***
## ns(horsepower, knots = c(75, 93.5, 126))1 ***
## ns(horsepower, knots = c(75, 93.5, 126))2 ***
## ns(horsepower, knots = c(75, 93.5, 126))3 ***
## ns(horsepower, knots = c(75, 93.5, 126))4 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.328 on 387 degrees of freedom
## Multiple R-squared:  0.6957, Adjusted R-squared:  0.6925 
## F-statistic: 221.2 on 4 and 387 DF,  p-value: < 2.2e-16
set.seed(2154)
cross_val <- vfold_cv(Auto1,v=10,strata = "mpg")
df <- 7:12
best_spline3 <- map_dfr(df, function(i){
  metric_spline3 <- map_dfr(cross_val$splits,function(x){
    mod <- lm(mpg~ ns(horsepower,df=i),data=Auto1[x$in_id,])
    pred <- predict(mod,newdata=Auto1[-x$in_id,])
    truth <- Auto1[-x$in_id,]$mpg
    rmse <- mlr3measures::rmse(truth = truth, response = pred)
    metric <- c(rmse)
    names(metric) <- c("rmse")
  return(metric)
  }
)
metric_spline3

# menghitung rata-rata untuk 10 folds
mean_metric_spline3 <- colMeans(metric_spline3)
mean_metric_spline3
})
# menampilkan hasil all breaks
best_spline3 <- cbind(df=df,best_spline3)
best_spline3
##   df     rmse
## 1  7 4.274590
## 2  8 4.289782
## 3  9 4.292602
## 4 10 4.273561
## 5 11 4.298625
## 6 12 4.296527
#berdasarkan rmse
best_spline3 %>% slice_min(rmse)
##   df     rmse
## 1 10 4.273561

Secara empiris berdasarkan rmse terbaik dengan df=10

hp6 <- ggplot(Auto1,aes(x=horsepower, y=mpg)) +
              geom_point(alpha=0.55, color="brown")+
              stat_smooth(method = "lm",formula = y~ns(x, knots =                             c(68.85714,79.71429,89.57143,98,113.57143,150)),lty = 1,se=F) +
              theme_bw()

hp3 <- ggplot(Auto1,aes(x=horsepower, y=mpg)) +
              geom_point(alpha=0.55, color="brown")+
              stat_smooth(method = "lm",formula = y~ns(x, knots =  
              c(75,93.5,126)),lty = 1,se=F) + theme_bw()

plot_grid(hp6,hp3)

Dari visual yang terbaik adalah horsepower dengan 3 knot, karena tidak ada garis melengkung di awal dan akhir line pada plotnya.

Perbandingan Model

perbandinganAutohp <- rbind(best.poly, cv_pc(Auto1)[1,-1],
                      best_spline3 %>% select(-1) %>% slice_min(rmse))
perbandinganAutohp$metode <- c("Polinomial ordo 3", "Piecewise Constant (breaks=8)", "Natural Cubic Splines 3 Knot (df=10)")
perbandinganAutohp
##       rmse                               metode
## 1 4.352157                    Polinomial ordo 3
## 2 4.408006        Piecewise Constant (breaks=8)
## 3 4.273561 Natural Cubic Splines 3 Knot (df=10)

Dari ketiga metode, diperoleh metode terbaik dengan melihat nilai RMSE terkecil adalah Metode Regresi Natural Cubic Splines 3 Knot (df=10).

mpg (Y) vs displacement (X2)

1. Regresi Polinomial

cv_poly1 <- function(Auto1){
  set.seed(2154)
  cross_val <- vfold_cv(Auto1,v=10,strata = "mpg")
  metric_poly1 <- map_dfr(cross_val$splits,function(x){
    mod <- lm(mpg ~ poly(displacement,1,raw = T),data=Auto1[x$in_id,])
    pred <- predict(mod,newdata=Auto1[-x$in_id,])
    truth <- Auto1[-x$in_id,]$mpg
    rmse <- mlr3measures::rmse(truth = truth,response = pred)
    metric <- c(rmse)
    names(metric) <- c("rmse")
  return(metric)
 }
)
  
#menghitung rmse rata-rata untuk 10 folds
poly1<- colMeans(metric_poly1) 
return(poly1) 
}

cv_poly2 <- function(Auto1){
  set.seed(2154)
  cross_val <- vfold_cv(Auto1,v=10,strata = "mpg")
  metric_poly2 <- map_dfr(cross_val$splits,function(x){
    mod <- lm(mpg ~ poly(displacement,2,raw = T),data=Auto1[x$in_id,])
    pred <- predict(mod,newdata=Auto1[-x$in_id,])
    truth <- Auto1[-x$in_id,]$mpg
    rmse <- mlr3measures::rmse(truth = truth,response = pred)
    metric <- c(rmse)
    names(metric) <- c("rmse")
  return(metric)
 }
)
  
#menghitung rmse rata-rata untuk 10 folds
poly2<- colMeans(metric_poly2) 
return(poly2) 
}

cv_poly3 <- function(Auto1){
  set.seed(2154)
  cross_val <- vfold_cv(Auto1,v=10,strata = "mpg")
  metric_poly3 <- map_dfr(cross_val$splits,function(x){
    mod <- lm(mpg ~ poly(displacement,3,raw = T),data=Auto1[x$in_id,])
    pred <- predict(mod,newdata=Auto1[-x$in_id,])
    truth <- Auto1[-x$in_id,]$mpg
    rmse <- mlr3measures::rmse(truth = truth,response = pred)
    metric <- c(rmse)
    names(metric) <- c("rmse")
  return(metric)
 }
)
  
#menghitung rmse rata-rata untuk 10 folds
poly3<- colMeans(metric_poly3) 
return(poly3) 
}

cv_poly4 <- function(Auto1){
  set.seed(2154)
  cross_val <- vfold_cv(Auto1,v=10,strata = "mpg")
  metric_poly4 <- map_dfr(cross_val$splits,function(x){
    mod <- lm(mpg ~ poly(displacement,4,raw = T),data=Auto1[x$in_id,])
    pred <- predict(mod,newdata=Auto1[-x$in_id,])
    truth <- Auto1[-x$in_id,]$mpg
    rmse <- mlr3measures::rmse(truth = truth,response = pred)
    metric <- c(rmse)
    names(metric) <- c("rmse")
  return(metric)
 }
)
  
#menghitung rmse rata-rata untuk 10 folds
poly4<- colMeans(metric_poly4) 
return(poly4) 
}
cv_poly <- rbind (cv_poly1(Auto1),cv_poly2(Auto1),cv_poly3(Auto1),cv_poly4(Auto1))
cv_poly
##          rmse
## [1,] 4.597766
## [2,] 4.325171
## [3,] 4.327439
## [4,] 4.337798
best.poly <- min(cv_poly)
best.poly
## [1] 4.325171

Pemilihan model regresi polinomial terbaik dari data horsepower vs mpg menggunakan cross validation (CV) dengan 10 fold (lipatan). Secara empirik melihat nilai rmse, diperoleh bahwa regresi polinomial ordo 2 memiliki nilai rmse terkecil. Ini berarti, untuk model regresi polinomial terbaik dipilih yang polinomial ordo 2.

poly.1 <- ggplot(Auto1,aes(x=displacement, y=mpg)) + geom_point(alpha=0.55,
                color="brown") + stat_smooth(method = "lm", formula 
                =y~poly(x,1,raw=T), lty = 1, col = "blue",se = F)+
         theme_bw()

poly.2 <- ggplot(Auto1,aes(x=displacement, y=mpg)) + geom_point(alpha=0.55,
                color="brown") + stat_smooth(method = "lm", formula 
                =y~poly(x,2,raw=T), lty = 1, col = "blue",se = F)+
         theme_bw()

poly.3 <- ggplot(Auto1,aes(x=displacement, y=mpg)) + geom_point(alpha=0.55,
                color="brown") + stat_smooth(method = "lm", formula 
                =y~poly(x,3,raw=T), lty = 1, col = "blue",se = F)+
         theme_bw()

poly.4 <- ggplot(Auto1,aes(x=displacement, y=mpg)) + geom_point(alpha=0.55,
                color="brown") + stat_smooth(method = "lm", formula 
                =y~poly(x,4,raw=T), lty = 1, col = "blue",se = F)+
         theme_bw()

plot_grid(poly.1, poly.2, poly.3, poly.4, labels=c("Regresi.Linier","Polinomial2", "Polinomial3", "Polinomial4"), label_size = 6)

2. Piecewise Constant

cv_pc <- function(Auto1){
  set.seed(2154)
  cross_val <- vfold_cv(Auto1,v=10,strata = "mpg")
  breaks <- 3:12
  best_tangga <- map_dfr(breaks, function(i){
    metric_tangga <- map_dfr(cross_val$splits,function(x){
      training <- Auto1[x$in_id,]
      training$displacement <- cut(training$displacement,i)
      mod <- lm(mpg ~ displacement,data=training)
      labs_x <- levels(mod$model[,2])
      labs_x_breaks<-cbind(lower = as.numeric(sub("\\((.+),.*","\\1",labs_x)),
                           upper=as.numeric(sub("[^,]*,([^]]*)\\]","\\1", 
                           labs_x)))
      testing <- Auto1[-x$in_id,]
      displacement_new<-cut(testing$displacement,c(labs_x_breaks[1,1],
                                               labs_x_breaks[,2]))
      pred <- predict(mod,newdata=list(displacement=displacement_new))
      truth <- testing$mpg
      data_eval <- na.omit(data.frame(truth,pred))
      rmse<-mlr3measures::rmse(truth=data_eval$truth,response = data_eval$pred)
      metric <- c(rmse)
      names(metric) <- c("rmse")
  return(metric)
 }
)
    
#Menghitung rata-rata 10 fold
colMeans(metric_tangga)
})
  
# menampilkan hasil all breaks
best_tangga <- cbind(breaks=breaks,best_tangga)
rmse <- as.data.frame(best_tangga %>% slice_min(rmse))
return(rbind(rmse))
}
cv_pc(Auto1)
##   breaks     rmse
## 1      9 4.220435

Secara empiris berdasarkan rmse banyaknya interval yang terbaik adalah 9.

ggplot(Auto1,aes(x=displacement, y=mpg)) +
  geom_point(alpha=0.55, color="brown") +
  stat_smooth(method = "lm",formula = y~cut(x,9),lty = 1, col = "blue",se = F) +
  theme_bw()

Berdasarkan empiris diperoleh interval yang terbaik dengan melihat nilai rmse terkecil adalah interval 9. Jika dilihat berdasarkan visual semakin banyak interval maka akan semakin mulus. kurva fungsi tangga (piecewise constant) pada interval 9 membentuk pola dan mengikuti sebaran data.

3. Natural cubic splines

Penentuan banyaknya basis

set.seed(2154)
cross.val <- vfold_cv(Auto1,v=10,strata = "mpg")
df <- 2:9
best.spline3 <- map_dfr(df, function(i){
  metric.spline3 <- map_dfr(cross.val$splits,
  function(x){
  mod <- lm(mpg ~ ns(displacement,df=i),data=Auto1[x$in_id,])
  pred <- predict(mod,newdata=Auto1[-x$in_id,])
  truth <- Auto1[-x$in_id,]$mpg
  rmse <- mlr3measures::rmse(truth = truth,response = pred)
  metric <- c(rmse)
  names(metric) <- c("rmse")
  return(metric)
  }
)
metric.spline3
    
#rata-rata untuk 10 folds
mean.metric.spline3 <- colMeans(metric.spline3) 
mean.metric.spline3
})

best.spline3 <- cbind(df=df,best.spline3)
basis<-data.frame(rbind(best.spline3 %>% slice_min(rmse))) 
basis
##   df     rmse
## 1  9 4.169282

Menentukan knot berdasarkan basis terpilih yaitu 9

#nilai knots yang ditentukan oleh komputer
attr(ns(Auto1$displacement, df=9),"knots")
## 11.11111% 22.22222% 33.33333% 44.44444% 55.55556% 66.66667% 77.77778% 88.88889% 
##   91.0000   98.0000  119.0000  140.0000  173.0000  232.0000  302.2222  350.0000

Menentukan knot secara terencana yaitu 4

#nilai knots yang terencana
attr(ns(Auto1$displacement, df=4),"knots")
##    25%    50%    75% 
## 105.00 151.00 275.75
# Knot dari basis terpilih
mod_spline3ns = lm(mpg ~ ns(displacement, knots =                         
                c(91,98,119,140,173,232,302.222,350)),data=Auto1)
summary(mod_spline3ns)
## 
## Call:
## lm(formula = mpg ~ ns(displacement, knots = c(91, 98, 119, 140, 
##     173, 232, 302.222, 350)), data = Auto1)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -11.490  -2.441  -0.441   2.118  20.226 
## 
## Coefficients:
##                                                                        Estimate
## (Intercept)                                                             25.5644
## ns(displacement, knots = c(91, 98, 119, 140, 173, 232, 302.222, 350))1   3.0660
## ns(displacement, knots = c(91, 98, 119, 140, 173, 232, 302.222, 350))2  -0.2712
## ns(displacement, knots = c(91, 98, 119, 140, 173, 232, 302.222, 350))3   0.9362
## ns(displacement, knots = c(91, 98, 119, 140, 173, 232, 302.222, 350))4  -4.0868
## ns(displacement, knots = c(91, 98, 119, 140, 173, 232, 302.222, 350))5  -6.0371
## ns(displacement, knots = c(91, 98, 119, 140, 173, 232, 302.222, 350))6  -9.9299
## ns(displacement, knots = c(91, 98, 119, 140, 173, 232, 302.222, 350))7 -15.4379
## ns(displacement, knots = c(91, 98, 119, 140, 173, 232, 302.222, 350))8  -1.9057
## ns(displacement, knots = c(91, 98, 119, 140, 173, 232, 302.222, 350))9 -18.2925
##                                                                        Std. Error
## (Intercept)                                                                1.6984
## ns(displacement, knots = c(91, 98, 119, 140, 173, 232, 302.222, 350))1     1.7769
## ns(displacement, knots = c(91, 98, 119, 140, 173, 232, 302.222, 350))2     2.2002
## ns(displacement, knots = c(91, 98, 119, 140, 173, 232, 302.222, 350))3     2.0436
## ns(displacement, knots = c(91, 98, 119, 140, 173, 232, 302.222, 350))4     2.5610
## ns(displacement, knots = c(91, 98, 119, 140, 173, 232, 302.222, 350))5     2.1810
## ns(displacement, knots = c(91, 98, 119, 140, 173, 232, 302.222, 350))6     2.1083
## ns(displacement, knots = c(91, 98, 119, 140, 173, 232, 302.222, 350))7     1.6746
## ns(displacement, knots = c(91, 98, 119, 140, 173, 232, 302.222, 350))8     3.9042
## ns(displacement, knots = c(91, 98, 119, 140, 173, 232, 302.222, 350))9     1.7370
##                                                                        t value
## (Intercept)                                                             15.052
## ns(displacement, knots = c(91, 98, 119, 140, 173, 232, 302.222, 350))1   1.725
## ns(displacement, knots = c(91, 98, 119, 140, 173, 232, 302.222, 350))2  -0.123
## ns(displacement, knots = c(91, 98, 119, 140, 173, 232, 302.222, 350))3   0.458
## ns(displacement, knots = c(91, 98, 119, 140, 173, 232, 302.222, 350))4  -1.596
## ns(displacement, knots = c(91, 98, 119, 140, 173, 232, 302.222, 350))5  -2.768
## ns(displacement, knots = c(91, 98, 119, 140, 173, 232, 302.222, 350))6  -4.710
## ns(displacement, knots = c(91, 98, 119, 140, 173, 232, 302.222, 350))7  -9.219
## ns(displacement, knots = c(91, 98, 119, 140, 173, 232, 302.222, 350))8  -0.488
## ns(displacement, knots = c(91, 98, 119, 140, 173, 232, 302.222, 350))9 -10.531
##                                                                        Pr(>|t|)
## (Intercept)                                                             < 2e-16
## ns(displacement, knots = c(91, 98, 119, 140, 173, 232, 302.222, 350))1  0.08526
## ns(displacement, knots = c(91, 98, 119, 140, 173, 232, 302.222, 350))2  0.90196
## ns(displacement, knots = c(91, 98, 119, 140, 173, 232, 302.222, 350))3  0.64713
## ns(displacement, knots = c(91, 98, 119, 140, 173, 232, 302.222, 350))4  0.11135
## ns(displacement, knots = c(91, 98, 119, 140, 173, 232, 302.222, 350))5  0.00591
## ns(displacement, knots = c(91, 98, 119, 140, 173, 232, 302.222, 350))6 3.47e-06
## ns(displacement, knots = c(91, 98, 119, 140, 173, 232, 302.222, 350))7  < 2e-16
## ns(displacement, knots = c(91, 98, 119, 140, 173, 232, 302.222, 350))8  0.62574
## ns(displacement, knots = c(91, 98, 119, 140, 173, 232, 302.222, 350))9  < 2e-16
##                                                                           
## (Intercept)                                                            ***
## ns(displacement, knots = c(91, 98, 119, 140, 173, 232, 302.222, 350))1 .  
## ns(displacement, knots = c(91, 98, 119, 140, 173, 232, 302.222, 350))2    
## ns(displacement, knots = c(91, 98, 119, 140, 173, 232, 302.222, 350))3    
## ns(displacement, knots = c(91, 98, 119, 140, 173, 232, 302.222, 350))4    
## ns(displacement, knots = c(91, 98, 119, 140, 173, 232, 302.222, 350))5 ** 
## ns(displacement, knots = c(91, 98, 119, 140, 173, 232, 302.222, 350))6 ***
## ns(displacement, knots = c(91, 98, 119, 140, 173, 232, 302.222, 350))7 ***
## ns(displacement, knots = c(91, 98, 119, 140, 173, 232, 302.222, 350))8    
## ns(displacement, knots = c(91, 98, 119, 140, 173, 232, 302.222, 350))9 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.163 on 382 degrees of freedom
## Multiple R-squared:  0.722,  Adjusted R-squared:  0.7155 
## F-statistic: 110.2 on 9 and 382 DF,  p-value: < 2.2e-16
# Knot dari basis yang terencana
mod_spline3ns1 = lm(mpg ~ ns(displacement, knots=c(105,151,275.75)),data=Auto1)
summary(mod_spline3ns1)
## 
## Call:
## lm(formula = mpg ~ ns(displacement, knots = c(105, 151, 275.75)), 
##     data = Auto1)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -15.7432  -2.3595  -0.2352   2.0952  20.4131 
## 
## Coefficients:
##                                                Estimate Std. Error t value
## (Intercept)                                      33.999      1.070  31.761
## ns(displacement, knots = c(105, 151, 275.75))1  -12.157      1.268  -9.588
## ns(displacement, knots = c(105, 151, 275.75))2  -17.094      1.346 -12.696
## ns(displacement, knots = c(105, 151, 275.75))3  -24.535      2.502  -9.808
## ns(displacement, knots = c(105, 151, 275.75))4  -18.385      1.341 -13.706
##                                                Pr(>|t|)    
## (Intercept)                                      <2e-16 ***
## ns(displacement, knots = c(105, 151, 275.75))1   <2e-16 ***
## ns(displacement, knots = c(105, 151, 275.75))2   <2e-16 ***
## ns(displacement, knots = c(105, 151, 275.75))3   <2e-16 ***
## ns(displacement, knots = c(105, 151, 275.75))4   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.368 on 387 degrees of freedom
## Multiple R-squared:  0.6901, Adjusted R-squared:  0.6869 
## F-statistic: 215.4 on 4 and 387 DF,  p-value: < 2.2e-16
set.seed(2154)
cross_val <- vfold_cv(Auto1,v=10,strata = "mpg")
df <- 3:12
best_spline3 <- map_dfr(df, function(i){
  metric_spline3 <- map_dfr(cross_val$splits,function(x){
    mod <- lm(mpg~ ns(displacement,df=i),data=Auto1[x$in_id,])
    pred <- predict(mod,newdata=Auto1[-x$in_id,])
    truth <- Auto1[-x$in_id,]$mpg
    rmse <- mlr3measures::rmse(truth = truth, response = pred)
    metric <- c(rmse)
    names(metric) <- c("rmse")
  return(metric)
  }
)
metric_spline3

# menghitung rata-rata untuk 10 folds
mean_metric_spline3 <- colMeans(metric_spline3)
mean_metric_spline3
})
# menampilkan hasil all breaks
best_spline3 <- cbind(df=df,best_spline3)
best_spline3
##    df     rmse
## 1   3 4.325494
## 2   4 4.340964
## 3   5 4.301844
## 4   6 4.226619
## 5   7 4.194683
## 6   8 4.188847
## 7   9 4.169282
## 8  10 4.160931
## 9  11 4.126660
## 10 12 4.116457
#berdasarkan rmse
best_spline3 %>% slice_min(rmse)
##   df     rmse
## 1 12 4.116457

Secara empiris berdasarkan rmse terbaik dengan df=12

dp8 <- ggplot(Auto1,aes(x=displacement, y=mpg)) +
              geom_point(alpha=0.55, color="brown")+
              stat_smooth(method = "lm",formula = y~ns(x, knots =                             c(91,98,119,140,173,232,302.222,350)),lty = 1,se=F) +
              theme_bw()

dp3 <- ggplot(Auto1,aes(x=displacement, y=mpg)) +
              geom_point(alpha=0.55, color="brown")+
              stat_smooth(method = "lm",formula = y~ns(x, knots =  
              c(105,151,275.75)),lty = 1,se=F) + theme_bw()

plot_grid(dp8,dp3)

Dari visual yang terbaik adalah displacement dengan 3 knot, karena tidak ada garis melengkung di awal dan akhir line pada plotnya.

Perbandingan Model

perbandinganAutodp <- rbind(best.poly, cv_pc(Auto)[1,-1],
                      best_spline3 %>% select(-1) %>% slice_min(rmse))
perbandinganAutodp$metode <- c("Polinomial ordo 2", "Piecewise Constant (breaks=9)", "Natural Cubic Splines 3 Knot (df=12)")
perbandinganAutodp
##       rmse                               metode
## 1 4.325171                    Polinomial ordo 2
## 2 4.220435        Piecewise Constant (breaks=9)
## 3 4.116457 Natural Cubic Splines 3 Knot (df=12)

Dari ketiga metode, diperoleh metode terbaik dengan melihat nilai RMSE terkecil adalah Metode Regresi Natural Cubic Splines 3 Knot (df=12).

mpg (Y) vs weight (X3)

1. Regresi Polinomial

cv_poly1 <- function(Auto1){
  set.seed(2154)
  cross_val <- vfold_cv(Auto1,v=10,strata = "mpg")
  metric_poly1 <- map_dfr(cross_val$splits,function(x){
    mod <- lm(mpg ~ poly(weight,1,raw = T),data=Auto1[x$in_id,])
    pred <- predict(mod,newdata=Auto1[-x$in_id,])
    truth <- Auto1[-x$in_id,]$mpg
    rmse <- mlr3measures::rmse(truth = truth,response = pred)
    metric <- c(rmse)
    names(metric) <- c("rmse")
  return(metric)
 }
)
  
#menghitung rmse rata-rata untuk 10 folds
poly1<- colMeans(metric_poly1) 
return(poly1) 
}

cv_poly2 <- function(Auto1){
  set.seed(2154)
  cross_val <- vfold_cv(Auto1,v=10,strata = "mpg")
  metric_poly2 <- map_dfr(cross_val$splits,function(x){
    mod <- lm(mpg ~ poly(weight,2,raw = T),data=Auto1[x$in_id,])
    pred <- predict(mod,newdata=Auto1[-x$in_id,])
    truth <- Auto1[-x$in_id,]$mpg
    rmse <- mlr3measures::rmse(truth = truth,response = pred)
    metric <- c(rmse)
    names(metric) <- c("rmse")
  return(metric)
 }
)
  
#menghitung rmse rata-rata untuk 10 folds
poly2<- colMeans(metric_poly2) 
return(poly2) 
}

cv_poly3 <- function(Auto1){
  set.seed(2154)
  cross_val <- vfold_cv(Auto1,v=10,strata = "mpg")
  metric_poly3 <- map_dfr(cross_val$splits,function(x){
    mod <- lm(mpg ~ poly(weight,3,raw = T),data=Auto1[x$in_id,])
    pred <- predict(mod,newdata=Auto1[-x$in_id,])
    truth <- Auto1[-x$in_id,]$mpg
    rmse <- mlr3measures::rmse(truth = truth,response = pred)
    metric <- c(rmse)
    names(metric) <- c("rmse")
  return(metric)
 }
)
  
#menghitung rmse rata-rata untuk 10 folds
poly3<- colMeans(metric_poly3) 
return(poly3) 
}

cv_poly4 <- function(Auto1){
  set.seed(2154)
  cross_val <- vfold_cv(Auto1,v=10,strata = "mpg")
  metric_poly4 <- map_dfr(cross_val$splits,function(x){
    mod <- lm(mpg ~ poly(weight,4,raw = T),data=Auto1[x$in_id,])
    pred <- predict(mod,newdata=Auto1[-x$in_id,])
    truth <- Auto1[-x$in_id,]$mpg
    rmse <- mlr3measures::rmse(truth = truth,response = pred)
    metric <- c(rmse)
    names(metric) <- c("rmse")
  return(metric)
 }
)
  
#menghitung rmse rata-rata untuk 10 folds
poly4<- colMeans(metric_poly4) 
return(poly4) 
}
cv_poly <- rbind (cv_poly1(Auto1),cv_poly2(Auto1),cv_poly3(Auto1),cv_poly4(Auto1))
cv_poly
##          rmse
## [1,] 4.300195
## [2,] 4.152781
## [3,] 4.158427
## [4,] 4.165461
best.poly <- min(cv_poly)
best.poly
## [1] 4.152781

Pemilihan model regresi polinomial terbaik dari data horsepower vs mpg menggunakan cross validation (CV) dengan 10 fold (lipatan). Secara empirik melihat nilai rmse, diperoleh bahwa regresi polinomial ordo 2 memiliki nilai rmse terkecil. Ini berarti, untuk model regresi polinomial terbaik dipilih yang polinomial ordo 2.

poly.1 <- ggplot(Auto1,aes(x=weight, y=mpg)) + geom_point(alpha=0.55,
                color="brown") + stat_smooth(method = "lm", formula 
                =y~poly(x,1,raw=T), lty = 1, col = "blue",se = F)+
         theme_bw()

poly.2 <- ggplot(Auto1,aes(x=weight, y=mpg)) + geom_point(alpha=0.55,
                color="brown") + stat_smooth(method = "lm", formula 
                =y~poly(x,2,raw=T), lty = 1, col = "blue",se = F)+
         theme_bw()

poly.3 <- ggplot(Auto1,aes(x=weight, y=mpg)) + geom_point(alpha=0.55,
                color="brown") + stat_smooth(method = "lm", formula 
                =y~poly(x,3,raw=T), lty = 1, col = "blue",se = F)+
         theme_bw()

poly.4 <- ggplot(Auto1,aes(x=weight, y=mpg)) + geom_point(alpha=0.55,
                color="brown") + stat_smooth(method = "lm", formula 
                =y~poly(x,4,raw=T), lty = 1, col = "blue",se = F)+
         theme_bw()

plot_grid(poly.1, poly.2, poly.3, poly.4, labels=c("Regresi.Linier","Polinomial2", "Polinomial3", "Polinomial4"), label_size = 6)

2. Piecewise Constant

cv_pc <- function(Auto1){
  set.seed(2154)
  cross_val <- vfold_cv(Auto1,v=10,strata = "mpg")
  breaks <- 3:12
  best_tangga <- map_dfr(breaks, function(i){
    metric_tangga <- map_dfr(cross_val$splits,function(x){
      training <- Auto[x$in_id,]
      training$weight <- cut(training$weight,i)
      mod <- lm(mpg ~ weight,data=training)
      labs_x <- levels(mod$model[,2])
      labs_x_breaks<-cbind(lower = as.numeric(sub("\\((.+),.*","\\1",labs_x)),
                           upper=as.numeric(sub("[^,]*,([^]]*)\\]","\\1", 
                           labs_x)))
      testing <- Auto1[-x$in_id,]
      weight_new<-cut(testing$weight,c(labs_x_breaks[1,1],
                                               labs_x_breaks[,2]))
      pred <- predict(mod,newdata=list(weight=weight_new))
      truth <- testing$mpg
      data_eval <- na.omit(data.frame(truth,pred))
      rmse<-mlr3measures::rmse(truth=data_eval$truth,response = data_eval$pred)
      metric <- c(rmse)
      names(metric) <- c("rmse")
  return(metric)
 }
)
    
#Menghitung rata-rata 10 fold
colMeans(metric_tangga)
})
  
# menampilkan hasil all breaks
best_tangga <- cbind(breaks=breaks,best_tangga)
rmse <- as.data.frame(best_tangga %>% slice_min(rmse))
return(rbind(rmse))
}
cv_pc(Auto1)
##   breaks    rmse
## 1     12 4.18149

Secara empiris berdasarkan rmse banyaknya interval yang terbaik adalah 12.

ggplot(Auto1,aes(x=weight, y=mpg)) +
  geom_point(alpha=0.55, color="brown") +
  stat_smooth(method = "lm",formula = y~cut(x,12),lty = 1, col = "blue",se = F) +
  theme_bw()

Berdasarkan empiris diperoleh interval yang terbaik dengan melihat nilai rmse terkecil adalah interval 12. Jika dilihat berdasarkan visual semakin banyak interval maka akan semakin mulus. kurva fungsi tangga (piecewise constant) pada interval 12 membentuk pola dan mengikuti sebaran data.

3. Natural cubic splines

Penentuan banyaknya basis

set.seed(2154)
cross.val <- vfold_cv(Auto1,v=10,strata = "mpg")
df <- 2:12
best.spline3 <- map_dfr(df, function(i){
  metric.spline3 <- map_dfr(cross.val$splits,function(x){
  mod <- lm(mpg ~ ns(weight,df=i),data=Auto1[x$in_id,])
  pred <- predict(mod,newdata=Auto1[-x$in_id,])
  truth <- Auto1[-x$in_id,]$mpg
  rmse <- mlr3measures::rmse(truth = truth,response = pred)
  metric <- c(rmse)
  names(metric) <- c("rmse")
  return(metric)
  }
)
metric.spline3
    
#rata-rata untuk 10 folds
mean.metric.spline3 <- colMeans(metric.spline3) 
mean.metric.spline3
})

best.spline3 <- cbind(df=df,best.spline3)
basis<-data.frame(rbind(best.spline3 %>% slice_min(rmse))) 
basis
##   df     rmse
## 1  7 4.151979

Menentukan knot berdasarkan basis terpilih yaitu 7

#nilai knots yang ditentukan oleh komputer
attr(ns(Auto1$weight, df=7),"knots")
## 14.28571% 28.57143% 42.85714% 57.14286% 71.42857% 85.71429% 
##  2074.857  2285.429  2635.000  2986.571  3446.143  4096.286

Menentukan knot yang terencana yaitu 4

#nilai knots yang ditentukan oleh komputer
attr(ns(Auto1$weight, df=4),"knots")
##     25%     50%     75% 
## 2225.25 2803.50 3614.75
# Knot dari basis yang terpilih
mod_spline3ns = lm(mpg ~ ns(weight, knots =                         
                c(2074.857,2285.429,2635.000,2986.571,3446.143,4096.286)),
                data=Auto1)
summary(mod_spline3ns)
## 
## Call:
## lm(formula = mpg ~ ns(weight, knots = c(2074.857, 2285.429, 2635, 
##     2986.571, 3446.143, 4096.286)), data = Auto1)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -13.3314  -2.5302  -0.4222   1.8281  16.3921 
## 
## Coefficients:
##                                                                                Estimate
## (Intercept)                                                                      31.575
## ns(weight, knots = c(2074.857, 2285.429, 2635, 2986.571, 3446.143, 4096.286))1   -6.249
## ns(weight, knots = c(2074.857, 2285.429, 2635, 2986.571, 3446.143, 4096.286))2   -4.599
## ns(weight, knots = c(2074.857, 2285.429, 2635, 2986.571, 3446.143, 4096.286))3  -10.661
## ns(weight, knots = c(2074.857, 2285.429, 2635, 2986.571, 3446.143, 4096.286))4  -13.286
## ns(weight, knots = c(2074.857, 2285.429, 2635, 2986.571, 3446.143, 4096.286))5  -19.001
## ns(weight, knots = c(2074.857, 2285.429, 2635, 2986.571, 3446.143, 4096.286))6  -15.370
## ns(weight, knots = c(2074.857, 2285.429, 2635, 2986.571, 3446.143, 4096.286))7  -22.327
##                                                                                Std. Error
## (Intercept)                                                                         2.038
## ns(weight, knots = c(2074.857, 2285.429, 2635, 2986.571, 3446.143, 4096.286))1      1.933
## ns(weight, knots = c(2074.857, 2285.429, 2635, 2986.571, 3446.143, 4096.286))2      2.514
## ns(weight, knots = c(2074.857, 2285.429, 2635, 2986.571, 3446.143, 4096.286))3      2.255
## ns(weight, knots = c(2074.857, 2285.429, 2635, 2986.571, 3446.143, 4096.286))4      2.378
## ns(weight, knots = c(2074.857, 2285.429, 2635, 2986.571, 3446.143, 4096.286))5      1.727
## ns(weight, knots = c(2074.857, 2285.429, 2635, 2986.571, 3446.143, 4096.286))6      4.628
## ns(weight, knots = c(2074.857, 2285.429, 2635, 2986.571, 3446.143, 4096.286))7      1.958
##                                                                                t value
## (Intercept)                                                                     15.495
## ns(weight, knots = c(2074.857, 2285.429, 2635, 2986.571, 3446.143, 4096.286))1  -3.233
## ns(weight, knots = c(2074.857, 2285.429, 2635, 2986.571, 3446.143, 4096.286))2  -1.829
## ns(weight, knots = c(2074.857, 2285.429, 2635, 2986.571, 3446.143, 4096.286))3  -4.728
## ns(weight, knots = c(2074.857, 2285.429, 2635, 2986.571, 3446.143, 4096.286))4  -5.587
## ns(weight, knots = c(2074.857, 2285.429, 2635, 2986.571, 3446.143, 4096.286))5 -11.005
## ns(weight, knots = c(2074.857, 2285.429, 2635, 2986.571, 3446.143, 4096.286))6  -3.321
## ns(weight, knots = c(2074.857, 2285.429, 2635, 2986.571, 3446.143, 4096.286))7 -11.401
##                                                                                Pr(>|t|)
## (Intercept)                                                                     < 2e-16
## ns(weight, knots = c(2074.857, 2285.429, 2635, 2986.571, 3446.143, 4096.286))1 0.001332
## ns(weight, knots = c(2074.857, 2285.429, 2635, 2986.571, 3446.143, 4096.286))2 0.068144
## ns(weight, knots = c(2074.857, 2285.429, 2635, 2986.571, 3446.143, 4096.286))3 3.19e-06
## ns(weight, knots = c(2074.857, 2285.429, 2635, 2986.571, 3446.143, 4096.286))4 4.39e-08
## ns(weight, knots = c(2074.857, 2285.429, 2635, 2986.571, 3446.143, 4096.286))5  < 2e-16
## ns(weight, knots = c(2074.857, 2285.429, 2635, 2986.571, 3446.143, 4096.286))6 0.000982
## ns(weight, knots = c(2074.857, 2285.429, 2635, 2986.571, 3446.143, 4096.286))7  < 2e-16
##                                                                                   
## (Intercept)                                                                    ***
## ns(weight, knots = c(2074.857, 2285.429, 2635, 2986.571, 3446.143, 4096.286))1 ** 
## ns(weight, knots = c(2074.857, 2285.429, 2635, 2986.571, 3446.143, 4096.286))2 .  
## ns(weight, knots = c(2074.857, 2285.429, 2635, 2986.571, 3446.143, 4096.286))3 ***
## ns(weight, knots = c(2074.857, 2285.429, 2635, 2986.571, 3446.143, 4096.286))4 ***
## ns(weight, knots = c(2074.857, 2285.429, 2635, 2986.571, 3446.143, 4096.286))5 ***
## ns(weight, knots = c(2074.857, 2285.429, 2635, 2986.571, 3446.143, 4096.286))6 ***
## ns(weight, knots = c(2074.857, 2285.429, 2635, 2986.571, 3446.143, 4096.286))7 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.143 on 384 degrees of freedom
## Multiple R-squared:  0.7233, Adjusted R-squared:  0.7183 
## F-statistic: 143.4 on 7 and 384 DF,  p-value: < 2.2e-16
# Knot dari basis yang terencana
mod_spline3ns1 = lm(mpg ~ ns(weight, knots = c(2225.25,2803.5,614.75)),
                data=Auto1)
summary(mod_spline3ns1)
## 
## Call:
## lm(formula = mpg ~ ns(weight, knots = c(2225.25, 2803.5, 614.75)), 
##     data = Auto1)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -12.7276  -2.7113  -0.4017   1.8321  16.2330 
## 
## Coefficients: (1 not defined because of singularities)
##                                                 Estimate Std. Error t value
## (Intercept)                                      -41.063      4.220  -9.730
## ns(weight, knots = c(2225.25, 2803.5, 614.75))1   57.013      3.044  18.728
## ns(weight, knots = c(2225.25, 2803.5, 614.75))2   16.079      2.332   6.895
## ns(weight, knots = c(2225.25, 2803.5, 614.75))3  149.194      9.914  15.049
## ns(weight, knots = c(2225.25, 2803.5, 614.75))4       NA         NA      NA
##                                                 Pr(>|t|)    
## (Intercept)                                      < 2e-16 ***
## ns(weight, knots = c(2225.25, 2803.5, 614.75))1  < 2e-16 ***
## ns(weight, knots = c(2225.25, 2803.5, 614.75))2 2.19e-11 ***
## ns(weight, knots = c(2225.25, 2803.5, 614.75))3  < 2e-16 ***
## ns(weight, knots = c(2225.25, 2803.5, 614.75))4       NA    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.18 on 388 degrees of freedom
## Multiple R-squared:  0.7154, Adjusted R-squared:  0.7132 
## F-statistic: 325.1 on 3 and 388 DF,  p-value: < 2.2e-16
set.seed(2154)
cross_val <- vfold_cv(Auto,v=10,strata = "mpg")
df <- 7:12
best_spline3 <- map_dfr(df, function(i){
  metric_spline3 <- map_dfr(cross_val$splits,function(x){
    mod <- lm(mpg~ ns(weight,df=i),data=Auto1[x$in_id,])
    pred <- predict(mod,newdata=Auto1[-x$in_id,])
    truth <- Auto1[-x$in_id,]$mpg
    rmse <- mlr3measures::rmse(truth = truth, response = pred)
    metric <- c(rmse)
    names(metric) <- c("rmse")
  return(metric)
  }
)
metric_spline3

# menghitung rata-rata untuk 10 folds
mean_metric_spline3 <- colMeans(metric_spline3)
mean_metric_spline3
})
# menampilkan hasil all breaks
best_spline3 <- cbind(df=df,best_spline3)
best_spline3
##   df     rmse
## 1  7 4.151979
## 2  8 4.171000
## 3  9 4.180805
## 4 10 4.166101
## 5 11 4.168050
## 6 12 4.186651
#berdasarkan rmse
best_spline3 %>% slice_min(rmse)
##   df     rmse
## 1  7 4.151979

Secara empiris berdasarkan rmse terbaik dengan df=7

w6 <- ggplot(Auto1,aes(x=weight, y=mpg)) +
              geom_point(alpha=0.55, color="brown")+
              stat_smooth(method = "lm",formula = y~ns(x, knots =                             c(2074.857,2285.429,2635.000,2986.571,3446.143,4096.286)),lty =
              1,se=F) + theme_bw()

w3 <- ggplot(Auto1,aes(x=weight, y=mpg)) +
              geom_point(alpha=0.55, color="brown")+
              stat_smooth(method = "lm",formula = y~ns(x, knots =  
              c(2225.25,2803.5,614.75)),lty = 1,se=F) + theme_bw()

plot_grid(w6,w3)
## Warning in predict.lm(model, newdata = new_data_frame(list(x = xseq)), se.fit =
## se, : prediction from a rank-deficient fit may be misleading

Dari visual yang terbaik adalah weight dengan 3 knot, karena tidak ada garis melengkung di awal dan akhir line pada plotnya.

Perbandingan Model

perbandinganAutow <- rbind(best.poly, cv_pc(Auto1)[1,-1],
                      best_spline3 %>% select(-1) %>% slice_min(rmse))
perbandinganAutow$metode <- c("Polinomial ordo 2", "Piecewise Constant (breaks=12)", "Natural Cubic Splines 3 Knot (df=7)")
perbandinganAutow
##       rmse                              metode
## 1 4.152781                   Polinomial ordo 2
## 2 4.181490      Piecewise Constant (breaks=12)
## 3 4.151979 Natural Cubic Splines 3 Knot (df=7)

Dari ketiga metode, diperoleh metode terbaik dengan melihat nilai RMSE terkecil adalah Metode Regresi Natural Cubic Splines 3 Knot (df=7).

Plot Model Terbaik

hp_ncs3 <- ggplot(Auto1,aes(x=horsepower, y=mpg)) +
  geom_point(alpha=0.55,color="brown") + 
  stat_smooth(method = "lm",formula = y ~ ns(x, knots = 
  c(75,93.5,126)),lty = 1,se=F) + theme_bw() +
  labs(title = "NCS 3 Knots (df=10)",subtitle = "mpg vs horsepower")

dp_ncs3 <- ggplot(Auto1,aes(x=displacement, y=mpg)) +
  geom_point(alpha=0.55, color="brown") +
  stat_smooth(method = "lm",formula = y~ns(x, knots = c(105,151,275.75)),lty =    1,se=F) + theme_bw() +
  labs(title = "NCS 3 Knots (df=12)", subtitle = "mpg vs displacement")

w_ncs3 <- ggplot(Auto1,aes(x=weight, y=mpg)) +
  geom_point(alpha=0.55, color="brown") + 
  stat_smooth(method = "lm",formula = y~ns(x, knots = 
  c(2225.25,2803.5,614.75)),lty = 1,se=F) + 
  theme_bw() +
  labs(title = "NCS 3 Knots (df=7)",subtitle = "mpg vs weight")

plot_grid(hp_ncs3, dp_ncs3, w_ncs3)
## Warning in predict.lm(model, newdata = new_data_frame(list(x = xseq)), se.fit =
## se, : prediction from a rank-deficient fit may be misleading

Perbandingan Model

A <- min(perbandinganAutohp[3,-2])
B <- min(perbandinganAutodp[3,-2])
C <- min(perbandinganAutow[3,-2])

perbandinganAll <- rbind(A,B,C)
perbandinganAll$metode <- c("Natural Cubic Splines 3 Knot  (df=10)", "Natural Cubic Splines 3 Knot  (df=12)","Natural Cubic Splines 3 Knot (df=7)")
## Warning in perbandinganAll$metode <- c("Natural Cubic Splines 3 Knot (df=10)", :
## Coercing LHS to a list
perbandinganAll
## [[1]]
## [1] 4.273561
## 
## [[2]]
## [1] 4.116457
## 
## [[3]]
## [1] 4.151979
## 
## $metode
## [1] "Natural Cubic Splines 3 Knot  (df=10)"
## [2] "Natural Cubic Splines 3 Knot  (df=12)"
## [3] "Natural Cubic Splines 3 Knot (df=7)"

Kesimpulan:

Dari ketiga variabel prediktor, nilai RMSE terkecil terhadap mpg yaitu variabel X2 atau displacement yang terbaik sebesar 4.116457.

Referensi

Sartono, Bagus. 2020. https://rpubs.com/bagusco/taklinear.

Dito, Gerry Alfa. https://rpubs.com/gdito/regresi-nonlinear1.