Moving Beyond Linearity

Andika Putri Ratnasari (G1501211018)

2021-10-27

DATA

Data yang digunakan yaitu data Auto. Data ini berisi tentang informasi mengenai 392 jenis kendaraan. Berikut adalah penjelasan untuk masing-masing variabel dalam data:

mpg: miles per gallon
cylinders: Number of cylinders between 4 and 8
displacement: Engine displacement (cu. inches)
horsepower: Engine horsepower
weight: Vehicle weight (lbs.)
acceleration: Time to accelerate from 0 to 60 mph (sec.)
year: Model year (modulo 100)
origin: Origin of car (1. American, 2. European, 3. Japanese)
name: Vehicle name

Peubah respon yang digunakan yaitu mpg (miles per gallon) sedangkan peubah penjelasnya yaitu horsepower.

library(tidyverse)
library(ggplot2)
library(splines)
library(rsample)
library(ISLR)
library(cowplot)
library(caret) #cross-validation
library(gridExtra) #combining graphs
library(ggplot2)
library(dplyr)
library(dplyr)
library(knitr)
library(DT)
library(kableExtra)
library(gam)
library(ggpubr)

dataauto = Auto %>%  select(mpg, horsepower, origin)
head(dataauto)

##   mpg horsepower origin
## 1  18        130      1
## 2  15        165      1
## 3  18        150      1
## 4  16        150      1
## 5  17        140      1
## 6  15        198      1

Pemodelan mpg vs horsepower

Plot mpg vs horsepower

ggplot(dataauto,aes(x=horsepower, y=mpg)) +
  geom_point(alpha=0.55, color="navy") +
  theme_bw()

Regresi Polinomial

set.seed(018)
val_silang <- vfold_cv(Auto,v=10,strata = "mpg")

derajat <- 1:5

polinomial <- map_dfr(derajat, function(i) {
  metric_poli <- map_dfr(val_silang$splits,
                        function(x) {
                        mod <- lm(mpg ~ poly(horsepower,derajat=i), data=Auto[x$in_id,])
                        predik <- predict(mod, newdata=Auto[-x$in_id,])
                        truth <- Auto[-x$in_id,]$mpg
                        rmse <- mlr3measures::rmse(truth = truth, response = predik)
                        mae <- mlr3measures::mae(truth = truth, response = predik)
                        metric <- c(rmse,mae)
                        names(metric) <- c("RMSE","MAE")
                        return(metric)
                              }
                            )
  metric_poli
  mean_metric_poli<- colMeans(metric_poli) # menghitung rata-rata untuk 10 folds
  mean_metric_poli
  }
)

polinomial<- cbind(derajat=derajat,polinomial)

Perbandingan RMSE dan MAE

datatable(polinomial)

Plot RMSE dan MAE

polinomial$derajat <- as.factor(polinomial$derajat)
prmse<-ggplot(data=polinomial, aes(x=derajat, y=RMSE, group=1)) +
  geom_line(linetype = "dashed")+
  geom_point()+
  ggtitle("K-Fold Cross Validation") +
  xlab("Degree of Polynomial") + 
  ylab("Root Mean Square Error")

polinomial$derajat <- as.factor(polinomial$derajat)
pmae<-ggplot(data=polinomial, aes(x=derajat, y=MAE, group=1)) +
  geom_line(linetype = "dashed")+
  geom_point()+
  ggtitle("K-Fold Cross Validation") +
  xlab("Degree of Polynomial") + 
  ylab("Mean Absolute Error")

plot_grid(prmse,pmae, labels=c("RMSE","MAE"), label_size = 6)

Plot Regresi Polinomial d=2 dan d=5

regpol2 <- ggplot(Auto,aes(x=horsepower, y=mpg)) + 
  geom_point(alpha=0.55, color="purple") +
  stat_smooth(method = "lm", 
              formula = y~poly(x,2,raw=T), 
              lty = 1, col = "black",se = F)+
  theme_bw()+
  ggtitle("Regresi Polinomial Derajat 2") +
  xlab("Horse Power") + 
  ylab("mile per gallon")  

regpol5 <- ggplot(Auto,aes(x=horsepower, y=mpg)) + 
  geom_point(alpha=0.55, color="purple") +
  stat_smooth(method = "lm", 
              formula = y~poly(x,5,raw=T), 
              lty = 1, col = "black",se = F)+
  theme_bw()+
  ggtitle("Regresi Polinomial Derajat 5") +
  xlab("Horse Power") + 
  ylab("mile per gallon")

plot_grid(regpol2,regpol5)

Nilai RMSE dan MAE terendah terdapat pada regresi polinomial derajat 5, namun apabila dilihat dari plot regresi pada bagian ujung-ujung plot cenderung tidak teratur untuk regresi polinomial derajat 5. Oleh karena itu, regresi polinomial terbaik yaitu regresi polinomial derajat 2. Berikut merupakan summary dari model regresi polinomial derajat 2:

poli2 = lm(mpg ~ poly(horsepower,2), data = Auto)
summary(poli2)

## 
## Call:
## lm(formula = mpg ~ poly(horsepower, 2), data = Auto)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -14.7135  -2.5943  -0.0859   2.2868  15.8961 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)            23.4459     0.2209  106.13   <2e-16 ***
## poly(horsepower, 2)1 -120.1377     4.3739  -27.47   <2e-16 ***
## poly(horsepower, 2)2   44.0895     4.3739   10.08   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.374 on 389 degrees of freedom
## Multiple R-squared:  0.6876, Adjusted R-squared:  0.686 
## F-statistic:   428 on 2 and 389 DF,  p-value: < 2.2e-16

Piecewise Constant

set.seed(018)
val_silang <- vfold_cv(Auto,v=10,strata = "mpg")

breakk <- 2:11 #knots 1-10

pemilihanf.tangga <- map_dfr(breakk, function(i){
  metric_ftangga <- map_dfr(val_silang$splits,
                          function(x){
                          datatrain <- Auto[x$in_id,]
                          datatrain$horsepower <- cut(datatrain$horsepower,i)
                          modelt <- lm(mpg ~ horsepower, data=datatrain)
                          labs_x <- levels(modelt$model[,2])
                          labs_x_b <- cbind(lower = as.numeric( sub("\\((.+),.*", "\\1", labs_x) ),
                                            upper = as.numeric( sub("[^,]*,([^]]*)\\]", "\\1", labs_x)))
                          datatesting <- Auto[-x$in_id,]
                          horsepower_new <- cut(datatesting$horsepower,c(labs_x_b[1,1],labs_x_b[,2]))
                          predik <- predict(modelt,   newdata=list(horsepower=horsepower_new))
                          truth <- datatesting$mpg
                          evaluasi <- na.omit(data.frame(truth,predik))
                          rmse <- mlr3measures::rmse(truth = evaluasi$truth,response = evaluasi$predik)
                          mae <- mlr3measures::mae(truth = evaluasi$truth,response = evaluasi$predik)
                          metric <- c(rmse,mae)
                          names(metric) <- c("RMSE","MAE")
                          return(metric)
                          }
                          )

metric_ftangga
mean_metricftangga <- colMeans(metric_ftangga)
mean_metricftangga
})
pemilihanf.tangga <- cbind(breakk=breakk,pemilihanf.tangga)

Perbandingan RMSE dan MAE

datatable(pemilihanf.tangga)

Plot RMSE dan MAE

pemilihanf.tangga$breakk <- as.factor(pemilihanf.tangga$breakk)
trmse<-ggplot(data=pemilihanf.tangga, aes(x=breakk, y=RMSE, group=1)) +
  geom_line(linetype = "dashed")+
  geom_point()+
  ggtitle("K-Fold Cross Validation") +
  xlab("Breaks") + 
  ylab("Root Mean Square Error")

pemilihanf.tangga$breakk <- as.factor(pemilihanf.tangga$breakk)
tmae<-ggplot(data=pemilihanf.tangga, aes(x=breakk, y=MAE, group=1)) +
  geom_line(linetype = "dashed")+
  geom_point()+
  ggtitle("K-Fold Cross Validation") +
  xlab("Breaks") + 
  ylab("Mean Absolute Error")

plot_grid(trmse,tmae)

Berdasarkan nilai RMSE model piecewise constant terbaik adalah piecewise constant dengan knots=7, sedangkan berdasarkan nilai MAE model terbaik adalah piecewise constant dengan knots=6.

Plot Piecewise Constant

ft7<- ggplot(Auto,aes(x=horsepower, y=mpg)) +
                 geom_point(alpha=0.55, color="red") +
  stat_smooth(method = "lm", 
              formula = y~cut(x,7), 
              lty = 1, col = "black",se = F)+
  theme_bw()+
  ggtitle("Piecewise Constant with Knots=6") +
  xlab("Horse Power") + ylab("mile per gallon")

ft8<- ggplot(Auto,aes(x=horsepower, y=mpg)) +
                 geom_point(alpha=0.55, color="red") +
  stat_smooth(method = "lm", 
              formula = y~cut(x,8), 
              lty = 1, col = "black",se = F)+
  theme_bw()+
  ggtitle("Piecewise Constant with Knots=7") +
  xlab("Horse Power") + ylab("mile per gallon")

plot_grid(ft7,ft8)

Apabila dilihat secara visual, plot dari piecewise constant dengan knots 7 memiliki kurva yang lebih mulus dibandingkan dengan knots 6, selain itu juga tidak terdapat pola yang tidak teratur pada plot dengan knots=7. Jadi, pada metode piecewise constant ini dipilih knots=7.

Berikut merupakan summary model piecewise constant dengan knots=6:

fungsitangga7 = lm(Auto$mpg ~ cut(Auto$horsepower,7)) #6 knots
summary(fungsitangga7)

## 
## Call:
## lm(formula = Auto$mpg ~ cut(Auto$horsepower, 7))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -17.5280  -2.5530  -0.3758   2.2881  16.7482 
## 
## Coefficients:
##                                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                         32.5280     0.5069   64.17   <2e-16 ***
## cut(Auto$horsepower, 7)(72.3,98.6]  -6.8162     0.6358  -10.72   <2e-16 ***
## cut(Auto$horsepower, 7)(98.6,125]  -12.1523     0.7591  -16.01   <2e-16 ***
## cut(Auto$horsepower, 7)(125,151]   -16.5763     0.7957  -20.83   <2e-16 ***
## cut(Auto$horsepower, 7)(151,177]   -18.5020     1.0831  -17.08   <2e-16 ***
## cut(Auto$horsepower, 7)(177,204]   -19.4447     1.4187  -13.71   <2e-16 ***
## cut(Auto$horsepower, 7)(204,230]   -19.6280     1.5375  -12.77   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.59 on 385 degrees of freedom
## Multiple R-squared:  0.6594, Adjusted R-squared:  0.6541 
## F-statistic: 124.2 on 6 and 385 DF,  p-value: < 2.2e-16

Natural Cubic Splines

set.seed(018)
val_silang <- vfold_cv(Auto,v=10,strata = "mpg")

df <- 2:6 #knots 1-5

pemilihan.ncs <- map_dfr(df, function(i) {
  metric_ncs<- map_dfr(val_silang$splits,
                          function(x) {
                          modelns <- lm(mpg ~ ns(horsepower,df=i), data=Auto[x$in_id,])
                          predik <- predict(modelns, newdata=Auto[-x$in_id,])
                          truth <- Auto[-x$in_id,]$mpg
                          rmse <- mlr3measures::rmse(truth = truth, response = predik)
                          mae <- mlr3measures::mae(truth = truth, response = predik)
                          metric <- c(rmse,mae)
                          names(metric) <- c("RMSE","MAE")
                          return(metric)
                          }
                          )
  metric_ncs

# menghitung rata-rata untuk 10 folds
  mean_metric_ncs <- colMeans(metric_ncs)
  mean_metric_ncs
  }
)

pemilihan.ncs <- cbind(df=df,pemilihan.ncs)

Perbandingan RMSE dan MAE

datatable(pemilihan.ncs)

Plot RMSE dan MAE

pemilihan.ncs$df <- as.factor(pemilihan.ncs$df)
nrmse<-ggplot(data=pemilihan.ncs, aes(x=df, y=RMSE, group=1)) +
  geom_line(linetype = "dashed")+
  geom_point()+
  ggtitle("K-Fold Cross Validation") +
  xlab("DF") + ylab("Root Mean Square Error")

nmae<-ggplot(data=pemilihan.ncs, aes(x=df, y=MAE, group=1)) +
  geom_line(linetype = "dashed")+
  geom_point()+
  ggtitle("K-Fold Cross Validation") +
  xlab("DF") + ylab("Mean Absolute Error")

plot_grid(nrmse,nmae)

Berdasarkan hasil di atas dapat disimpulkan bahwa Model Natural Cubic Splines terbaik adalah dengan df=6. Berikut merupakan knots dalam Natural Cubic Splines (df=6):

Knots

attr(ns(Auto$horsepower, df=6),"knots")

## 16.66667% 33.33333%       50% 66.66667% 83.33333% 
##      70.0      84.0      93.5     110.0     150.0

Plot Natural Cubic Splines

plotncs <- ggplot(Auto,aes(x=horsepower, y=mpg)) +
           geom_point(alpha=0.55, color="purple")+
           stat_smooth(method = "lm", 
           formula = y~ns(x, knots=c(70.0, 84.0, 93.5, 110.0,150.0)), 
           lty = 1,col = "black", se=F)+
           theme_bw()+
    ggtitle("Natural Cubic Splines with df=6") +
  xlab("Horse Power") + 
  ylab("mile per gallon") +
  geom_vline(xintercept = c(70.0, 84.0, 93.5, 110.0,150.0), col="grey50", lty=2)
plotncs

ncspline = lm(Auto$mpg ~ ns(Auto$horsepower, knots = c(70.0, 84.0, 93.5, 110.0,150.0)))
summary(ncspline)

## 
## Call:
## lm(formula = Auto$mpg ~ ns(Auto$horsepower, knots = c(70, 84, 
##     93.5, 110, 150)))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -15.9491  -2.6183  -0.1595   2.3508  15.1349 
## 
## Coefficients:
##                                                         Estimate Std. Error
## (Intercept)                                               34.738      1.509
## ns(Auto$horsepower, knots = c(70, 84, 93.5, 110, 150))1   -8.210      1.594
## ns(Auto$horsepower, knots = c(70, 84, 93.5, 110, 150))2  -13.046      1.835
## ns(Auto$horsepower, knots = c(70, 84, 93.5, 110, 150))3  -14.577      1.886
## ns(Auto$horsepower, knots = c(70, 84, 93.5, 110, 150))4  -22.802      1.624
## ns(Auto$horsepower, knots = c(70, 84, 93.5, 110, 150))5  -22.758      3.512
## ns(Auto$horsepower, knots = c(70, 84, 93.5, 110, 150))6  -20.849      1.742
##                                                         t value Pr(>|t|)    
## (Intercept)                                              23.021  < 2e-16 ***
## ns(Auto$horsepower, knots = c(70, 84, 93.5, 110, 150))1  -5.149 4.18e-07 ***
## ns(Auto$horsepower, knots = c(70, 84, 93.5, 110, 150))2  -7.108 5.76e-12 ***
## ns(Auto$horsepower, knots = c(70, 84, 93.5, 110, 150))3  -7.730 9.50e-14 ***
## ns(Auto$horsepower, knots = c(70, 84, 93.5, 110, 150))4 -14.039  < 2e-16 ***
## ns(Auto$horsepower, knots = c(70, 84, 93.5, 110, 150))5  -6.480 2.81e-10 ***
## ns(Auto$horsepower, knots = c(70, 84, 93.5, 110, 150))6 -11.967  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.302 on 385 degrees of freedom
## Multiple R-squared:  0.7009, Adjusted R-squared:  0.6962 
## F-statistic: 150.4 on 6 and 385 DF,  p-value: < 2.2e-16

Perbandingan Model

nilai_rmse <- rbind(4.32756231703042,
                    4.41347139979508,
                    4.2917765537155)

nilai_mae<-rbind(3.2639553356593,
                 3.41081122407494,
                 3.24670654206857)

Metode <- c("Regresi Polinomial (d=2)",
                "Piecewise Constant (knots=7)",
                "Natural Cubic Splines (df=6)")

perbandingan <- data.frame(Metode,nilai_rmse, nilai_mae)
datatable(perbandingan)

ggplot(Auto,aes(x=horsepower, y=mpg)) +
                 geom_point(alpha=0.55, color="black") +
  stat_smooth(method = "lm", 
              formula = y~poly(x,2,raw=T), 
              lty = 1, aes(col = "Regresi Polynomial"), col = "red",se = F)+  
  stat_smooth(method = "lm", 
              formula = y~cut(x,8), 
              lty = 1, aes(col = "Piecewise Constant"), col = "black",se = F)+
  stat_smooth(method = "lm", 
           formula = y~ns(x, df=6), 
           lty = 1,aes(col = "Natural Cubic Spline"), col = "green", se=F)+
  
  scale_color_manual(values = c("Regresi Polynomial"="red","Piecewise Constant"="black","Natural Cubic Spline"="green"))+theme_bw()

Berdasarkan nilai RMSE dan MAE dapat disimpulkan bahwa model terbaik untuk data Auto dengan peubah respon mpg dan peubah penjelas horsepower adalah Natural Cubic Splines dengan df=6.

Pemodelan mpg vs horsepower (Amerika)

Data

dataAmerika = Auto[Auto$origin==1,] %>%  select(mpg, horsepower, origin)
datatable(dataAmerika)

ggplot(dataAmerika,aes(x=horsepower, y=mpg)) + 
  geom_point(alpha=0.55, color="navy") +
  theme_bw() +
   labs(y="Miles per Gallon", x="Horse Power", title = "Scatter Plot mpg dan horsepower",
        subtitle = "Amerika")

Regresi Polinomial

set.seed(018)
val_silang <- vfold_cv(data = dataAmerika,v=10)

derajat <- 1:5

polinomial1 <- map_dfr(derajat, function(i) {
  metric_poli <- map_dfr(val_silang$splits,
                        function(x) {
                        mod <- lm(mpg ~ poly(horsepower,derajat=i), data=dataAmerika[x$in_id,])
                        predik <- predict(mod, newdata=dataAmerika[-x$in_id,])
                        truth <- dataAmerika[-x$in_id,]$mpg
                        rmse <- mlr3measures::rmse(truth = truth, response = predik)
                        mae <- mlr3measures::mae(truth = truth, response = predik)
                        metric <- c(rmse,mae)
                        names(metric) <- c("RMSE","MAE")
                        return(metric)
                              }
                            )
  metric_poli
  mean_metric_poli<- colMeans(metric_poli) # menghitung rata-rata untuk 10 folds
  mean_metric_poli
  }
)

polinomial1<- cbind(derajat=derajat,polinomial1)

Perbandingan RMSE dan MAE untuk setiap derajat Regresi Polinomial

datatable(polinomial1)

Plot RMSE dan MAE

polinomial1$derajat <- as.factor(polinomial1$derajat)
prmse1<-ggplot(data=polinomial1, aes(x=derajat, y=RMSE, group=1)) +
  geom_line(linetype = "dashed")+
  geom_point()+
  ggtitle("K-Fold Cross Validation") +
  xlab("Degree of Polynomial") + 
  ylab("Root Mean Square Error")

polinomial1$derajat <- as.factor(polinomial1$derajat)
pmae1<-ggplot(data=polinomial1, aes(x=derajat, y=MAE, group=1)) +
  geom_line(linetype = "dashed")+
  geom_point()+
  ggtitle("K-Fold Cross Validation") +
  xlab("Degree of Polynomial") + 
  ylab("Mean Absolute Error")

plot_grid(prmse1,pmae1)

Berdasarkan nilai RMSE dan MAE dapat disimpulkan bahwa regresi polinomial terbaik adalah dengan derajat 2.

Plot Regresi Polinomial Derajat 2

regpol12 <- ggplot(dataAmerika,aes(x=horsepower, y=mpg)) + 
  geom_point(alpha=0.55, color="purple") +
  stat_smooth(method = "lm", 
              formula = y~poly(x,2,raw=T), 
              lty = 1, col = "black",se = F)+
  theme_bw()+
  ggtitle("Regresi Polinomial Derajat 2") +
  xlab("Horse Power") + 
  ylab("mile per gallon")  

regpol12

poli12 = lm(mpg ~ poly(horsepower,2), data = dataAmerika)
summary(poli12)

## 
## Call:
## lm(formula = mpg ~ poly(horsepower, 2), data = dataAmerika)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -12.9497  -2.5745  -0.0895   2.1583  13.1866 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           20.0335     0.2442   82.04  < 2e-16 ***
## poly(horsepower, 2)1 -75.6095     3.8223  -19.78  < 2e-16 ***
## poly(horsepower, 2)2  29.4684     3.8223    7.71 3.26e-13 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.822 on 242 degrees of freedom
## Multiple R-squared:  0.6507, Adjusted R-squared:  0.6478 
## F-statistic: 225.4 on 2 and 242 DF,  p-value: < 2.2e-16

Piecewise Constant

set.seed(018)
val_silang <- vfold_cv(dataAmerika,v=10,strata = "mpg")

breakk <- 2:11 #knots 1-10

pemilihanf.tangga1 <- map_dfr(breakk, function(i){
  metric_ftangga <- map_dfr(val_silang$splits,
                          function(x){
                          datatrain <- dataAmerika[x$in_id,]
                          datatrain$horsepower <- cut(datatrain$horsepower,i)
                          modelt <- lm(mpg ~ horsepower, data=datatrain)
                          labs_x <- levels(modelt$model[,2])
                          labs_x_b <- cbind(lower = as.numeric( sub("\\((.+),.*", "\\1", labs_x) ),
                                            upper = as.numeric( sub("[^,]*,([^]]*)\\]", "\\1", labs_x)))
                          datatesting <- dataAmerika[-x$in_id,]
                          horsepower_new <- cut(datatesting$horsepower,c(labs_x_b[1,1],labs_x_b[,2]))
                          predik <- predict(modelt,   newdata=list(horsepower=horsepower_new))
                          truth <- datatesting$mpg
                          evaluasi <- na.omit(data.frame(truth,predik))
                          rmse <- mlr3measures::rmse(truth = evaluasi$truth,response = evaluasi$predik)
                          mae <- mlr3measures::mae(truth = evaluasi$truth,response = evaluasi$predik)
                          metric <- c(rmse,mae)
                          names(metric) <- c("RMSE","MAE")
                          return(metric)
                          }
                          )

metric_ftangga
mean_metricftangga <- colMeans(metric_ftangga)
mean_metricftangga
})
pemilihanfs.tangga1 <- cbind(breakk=breakk,pemilihanf.tangga1)

Perbandingan RMSE dan MAE

datatable(pemilihanfs.tangga1)

Plot RMSE dan MAE

pemilihanfs.tangga1$breakk <- as.factor(pemilihanfs.tangga1$breakk)
trmse1<-ggplot(data=pemilihanfs.tangga1, aes(x=breakk, y=RMSE, group=1)) +
  geom_line(linetype = "dashed")+
  geom_point()+
  ggtitle("K-Fold Cross Validation") +
  xlab("Breaks") + 
  ylab("Root Mean Square Error")

pemilihanfs.tangga1$breakk <- as.factor(pemilihanfs.tangga1$breakk)
tmae1<-ggplot(data=pemilihanfs.tangga1, aes(x=breakk, y=MAE, group=1)) +
  geom_line(linetype = "dashed")+
  geom_point()+
  ggtitle("K-Fold Cross Validation") +
  xlab("Breaks") + 
  ylab("Mean Absolute Error")

plot_grid(trmse1,tmae1)

Berdasarkan nilai RMSE dan MAE dapat disimpulkan bahwa model piecewise constant terbaik adalah dengan knots=8.

Plot Piecewise Constant

ft9<- ggplot(dataAmerika,aes(x=horsepower, y=mpg)) +
                 geom_point(alpha=0.55, color="red") +
  stat_smooth(method = "lm", 
              formula = y~cut(x,9), 
              lty = 1, col = "black",se = F)+
  theme_bw()+
  ggtitle("Piecewise Constant with Knots=8") +
  xlab("Horse Power") + ylab("mile per gallon")

ft9

fungsitangga9 = lm(dataAmerika$mpg ~ cut(dataAmerika$horsepower,9))
summary(fungsitangga9)

## 
## Call:
## lm(formula = dataAmerika$mpg ~ cut(dataAmerika$horsepower, 9))
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -9.946 -1.946 -0.375  2.077 13.054 
## 
## Coefficients:
##                                           Estimate Std. Error t value Pr(>|t|)
## (Intercept)                                32.5933     0.9521   34.23  < 2e-16
## cut(dataAmerika$horsepower, 9)(71.8,91.6]  -7.6477     1.0519   -7.27 5.24e-12
## cut(dataAmerika$horsepower, 9)(91.6,111]  -12.6916     1.0682  -11.88  < 2e-16
## cut(dataAmerika$horsepower, 9)(111,131]   -13.8267     1.3465  -10.27  < 2e-16
## cut(dataAmerika$horsepower, 9)(131,151]   -17.1706     1.1025  -15.57  < 2e-16
## cut(dataAmerika$horsepower, 9)(151,171]   -18.3933     1.2892  -14.27  < 2e-16
## cut(dataAmerika$horsepower, 9)(171,190]   -18.9010     1.3973  -13.53  < 2e-16
## cut(dataAmerika$horsepower, 9)(190,210]   -21.2600     1.7813  -11.94  < 2e-16
## cut(dataAmerika$horsepower, 9)(210,230]   -19.2183     1.6144  -11.90  < 2e-16
##                                              
## (Intercept)                               ***
## cut(dataAmerika$horsepower, 9)(71.8,91.6] ***
## cut(dataAmerika$horsepower, 9)(91.6,111]  ***
## cut(dataAmerika$horsepower, 9)(111,131]   ***
## cut(dataAmerika$horsepower, 9)(131,151]   ***
## cut(dataAmerika$horsepower, 9)(151,171]   ***
## cut(dataAmerika$horsepower, 9)(171,190]   ***
## cut(dataAmerika$horsepower, 9)(190,210]   ***
## cut(dataAmerika$horsepower, 9)(210,230]   ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.688 on 236 degrees of freedom
## Multiple R-squared:  0.6829, Adjusted R-squared:  0.6722 
## F-statistic: 63.53 on 8 and 236 DF,  p-value: < 2.2e-16

Natural Cubic Splines

set.seed(018)
val_silang <- vfold_cv(dataAmerika,v=10,strata = "mpg")

df <- 2:6 

pemilihan.ncs1 <- map_dfr(df, function(i) {
  metric_ncs<- map_dfr(val_silang$splits,
                          function(x) {
                          modelns <- lm(mpg ~ ns(horsepower,df=i), data=dataAmerika[x$in_id,])
                          predik <- predict(modelns, newdata=dataAmerika[-x$in_id,])
                          truth <- dataAmerika[-x$in_id,]$mpg
                          rmse <- mlr3measures::rmse(truth = truth, response = predik)
                          mae <- mlr3measures::mae(truth = truth, response = predik)
                          metric <- c(rmse,mae)
                          names(metric) <- c("RMSE","MAE")
                          return(metric)
                          }
                          )
  metric_ncs

# menghitung rata-rata untuk 10 folds
  mean_metric_ncs <- colMeans(metric_ncs)
  mean_metric_ncs
  }
)

pemilihan.ncs1 <- cbind(df=df,pemilihan.ncs1)

Perbandingan RMSE dan MAE

datatable(pemilihan.ncs1)

Plot RMSE

pemilihan.ncs1$df <- as.factor(pemilihan.ncs1$df)
nrmse1<-ggplot(data=pemilihan.ncs1, aes(x=df, y=RMSE, group=1)) +
  geom_line(linetype = "dashed")+
  geom_point()+
  ggtitle("K-Fold Cross Validation") +
  xlab("DF") + ylab("Root Mean Square Error")

pemilihan.ncs1$df <- as.factor(pemilihan.ncs1$df)
nmae1<-ggplot(data=pemilihan.ncs1, aes(x=df, y=MAE, group=1)) +
  geom_line(linetype = "dashed")+
  geom_point()+
  ggtitle("K-Fold Cross Validation") +
  xlab("DF") + ylab("Mean Absolute Error")

plot_grid(nrmse1,nmae1)

Berdasarkan nilai RMSE dan MAE dapat disimpulkan bahwa model natural cubic splines terbaik adalah dengan df=5.

Knots

attr(ns(dataAmerika$horsepower, df=5),"knots")

##   20%   40%   60%   80% 
##  85.0  99.2 122.0 150.0

Plot Natural Cubic Splines

plotncs1 <- ggplot(dataAmerika,aes(x=horsepower, y=mpg)) +
           geom_point(alpha=0.55, color="purple")+
           stat_smooth(method = "lm", 
           formula = y~ns(x, knots = c(72,88,100,140)), 
           lty = 1,col = "black", se=F)+
           theme_bw()+
    ggtitle("Natural Cubic Splines with df=5") +
  xlab("Horse Power") + 
  ylab("mile per gallon") +
  geom_vline(xintercept = c(72,88,100,140), col="grey50", lty=2)
plotncs1

ncspline51 = lm(dataAmerika$mpg ~ ns(dataAmerika$horsepower, knots = c(72,88,100,140)))
summary(ncspline51)

## 
## Call:
## lm(formula = dataAmerika$mpg ~ ns(dataAmerika$horsepower, knots = c(72, 
##     88, 100, 140)))
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -13.933  -2.229  -0.347   2.029  13.181 
## 
## Coefficients:
##                                                          Estimate Std. Error
## (Intercept)                                                32.702      2.898
## ns(dataAmerika$horsepower, knots = c(72, 88, 100, 140))1   -8.755      2.767
## ns(dataAmerika$horsepower, knots = c(72, 88, 100, 140))2  -13.890      3.190
## ns(dataAmerika$horsepower, knots = c(72, 88, 100, 140))3  -18.333      1.898
## ns(dataAmerika$horsepower, knots = c(72, 88, 100, 140))4  -22.082      6.399
## ns(dataAmerika$horsepower, knots = c(72, 88, 100, 140))5  -18.666      1.756
##                                                          t value Pr(>|t|)    
## (Intercept)                                               11.285  < 2e-16 ***
## ns(dataAmerika$horsepower, knots = c(72, 88, 100, 140))1  -3.164  0.00176 ** 
## ns(dataAmerika$horsepower, knots = c(72, 88, 100, 140))2  -4.353 1.99e-05 ***
## ns(dataAmerika$horsepower, knots = c(72, 88, 100, 140))3  -9.660  < 2e-16 ***
## ns(dataAmerika$horsepower, knots = c(72, 88, 100, 140))4  -3.451  0.00066 ***
## ns(dataAmerika$horsepower, knots = c(72, 88, 100, 140))5 -10.630  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.814 on 239 degrees of freedom
## Multiple R-squared:  0.6565, Adjusted R-squared:  0.6493 
## F-statistic: 91.35 on 5 and 239 DF,  p-value: < 2.2e-16

Perbandingan Model

nilai_rmse <- rbind(polinomial1 %>% select(-1) %>% slice_min(RMSE),
                      pemilihanfs.tangga1 %>% select(-1) %>% slice_min(RMSE),
                      pemilihan.ncs1 %>%  select(-1)%>% slice_min(RMSE)
                      )

Metode <- c("Regresi Polinomial (d=2)",
                "Piecewise Constant (knots=8)",
                "Natural Cubic Splines (df=5)")

perbandingan1 <- data.frame(Metode,nilai_rmse)
datatable(perbandingan1)

ggplot(dataAmerika,aes(x=horsepower, y=mpg)) +
                 geom_point(alpha=0.55, color="black") +
  stat_smooth(method = "lm", 
              formula = y~poly(x,2,raw=T), 
              lty = 1, aes(col = "Regresi Polynomial"), col = "red",se = F)+  
  stat_smooth(method = "lm", 
              formula = y~cut(x,9), 
              lty = 1, aes(col = "Piecewise Constant"), col = "black",se = F)+
  stat_smooth(method = "lm", 
           formula = y~ns(x, df=5), 
           lty = 1,aes(col = "Natural Cubic Spline"), col = "green", se=F)+
  
  scale_color_manual(values = c("Regresi Polynomial"="coral","Piecewise Constant"="navy","Natural Cubic Spline"="green"))+theme_bw()

Berdasarkan nilai RMSE dapat disimpulkan bahwa model terbaik untuk memodelkan Data Auto dengan peubah respon mpg dan peubah penjelas horsepower dari kendaraan yang berasal dari Amerika adalah model Piecewise Constant dengan knots =6, sedangkan berdasarkan nilai MAE model terbaiknya adalah Natural Cubic Splines dengan df=6.

Pemodelan mpg vs horsepower (Eropa)

Data

dataEropa = Auto[Auto$origin==2,] %>%  select(mpg, horsepower, origin)
datatable(dataEropa)

ggplot(dataEropa,aes(x=horsepower, y=mpg)) + 
  geom_point(alpha=0.55, color="red") +
  theme_bw() +
   labs(y="Miles per Gallon", x="Horse Power", title = "Scatter Plot mpg dan horsepower",
        subtitle = "Eropa")

Regresi Polinomial

set.seed(018)
val_silang <- vfold_cv(data = dataEropa,v=10,strata = "mpg")

## Warning: The number of observations in each quantile is below the recommended
## threshold of 20. Stratification will be done with 3 breaks instead.

derajat <- 1:5

polinomial2 <- map_dfr(derajat, function(i) {
  metric_poli <- map_dfr(val_silang$splits,
                        function(x) {
                        mod <- lm(mpg ~ poly(horsepower,derajat=i), data=dataEropa[x$in_id,])
                        predik <- predict(mod, newdata=dataEropa[-x$in_id,])
                        truth <- dataEropa[-x$in_id,]$mpg
                        rmse <- mlr3measures::rmse(truth = truth, response = predik)
                        mae <- mlr3measures::mae(truth = truth, response = predik)
                        metric <- c(rmse,mae)
                        names(metric) <- c("RMSE","MAE")
                        return(metric)
                              }
                            )
  metric_poli
  mean_metric_poli<- colMeans(metric_poli) # menghitung rata-rata untuk 10 folds
  mean_metric_poli
  }
)

polinomial2<- cbind(derajat=derajat,polinomial2)

Perbandingan RMSE dan MAE

datatable(polinomial2)

Plot RMSE dan MAE

polinomial2$derajat <- as.factor(polinomial2$derajat)
nrmse2<-ggplot(data=polinomial2, aes(x=derajat, y=RMSE, group=1)) +
  geom_line(linetype = "dashed")+
  geom_point()+
  ggtitle("K-Fold Cross Validation") +
  xlab("Degree of Polynomial") + 
  ylab("Root Mean Square Error")

polinomial2$derajat <- as.factor(polinomial2$derajat)
nmae2<-ggplot(data=polinomial2, aes(x=derajat, y=MAE, group=1)) +
  geom_line(linetype = "dashed")+
  geom_point()+
  ggtitle("K-Fold Cross Validation") +
  xlab("Degree of Polynomial") + 
  ylab("Mean Absolute Error")

plot_grid(nrmse2,nmae2)

Berdasarkan nilai RMSE dan MAE dapat disimpulkan bahwa model Regresi Polinomial terbaik adalah Regresi Polinomial derajat 1.

Plot Regresi Polinomial derajat 1

regpol12 <- ggplot(dataEropa,aes(x=horsepower, y=mpg)) + 
  geom_point(alpha=0.55, color="purple") +
  stat_smooth(method = "lm", 
              formula = y~poly(x,1,raw=T), 
              lty = 1, col = "black",se = F)+
  theme_bw()+
  ggtitle("Regresi Polinomial d=1") +
  xlab("Horse Power") + 
  ylab("mile per gallon")  

regpol12

poli1 = lm(mpg ~ poly(horsepower,1), data = dataEropa)
summary(poli1)

## 
## Call:
## lm(formula = mpg ~ poly(horsepower, 1), data = dataEropa)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -10.4946  -2.8387  -0.4171   2.2148  12.8858 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          27.6029     0.5898  46.800  < 2e-16 ***
## poly(horsepower, 1) -36.6027     4.8637  -7.526 1.87e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.864 on 66 degrees of freedom
## Multiple R-squared:  0.4618, Adjusted R-squared:  0.4537 
## F-statistic: 56.64 on 1 and 66 DF,  p-value: 1.869e-10

Piecewise Constant

set.seed(018)

val_silang <- vfold_cv(dataEropa,v=10, strata = "mpg")

## Warning: The number of observations in each quantile is below the recommended
## threshold of 20. Stratification will be done with 3 breaks instead.

breakk <- 2:9 #knots 1-8

pemilihanf.tangga2 <- map_dfr(breakk, function(i){
  metric_ftangga <- map_dfr(val_silang$splits,
                          function(x){
                          datatrain2 <- dataEropa[x$in_id,]
                          datatrain2$horsepower <- cut(datatrain2$horsepower,i)
                          modelt <- lm(mpg ~ horsepower, data=datatrain2)
                          labs_x <- levels(modelt$model[,2])
                          labs_x_b <- cbind(lower = as.numeric( sub("\\((.+),.*", "\\1", labs_x) ),
                                            upper = as.numeric( sub("[^,]*,([^]]*)\\]", "\\1", labs_x)))
                          datatesting2 <- dataEropa[-x$in_id,]
                          horsepower_new <- cut(datatesting2$horsepower,c(labs_x_b[1,1],labs_x_b[,2]))
                          predik <- predict(modelt,   newdata=list(horsepower=horsepower_new))
                          truth <- datatesting2$mpg
                          evaluasi <- na.omit(data.frame(truth,predik))
                          rmse <- mlr3measures::rmse(truth = evaluasi$truth,response = evaluasi$predik)
                          mae <- mlr3measures::mae(truth = evaluasi$truth,response = evaluasi$predik)
                          metric <- c(rmse,mae)
                          names(metric) <- c("RMSE","MAE")
                          return(metric)
                          }
                          )

metric_ftangga
mean_metricftangga <- colMeans(metric_ftangga)
mean_metricftangga
})

pemilihanfs.tangga2 <- cbind(breakk=breakk,pemilihanf.tangga2)

Perbandingan RMSE dan MAE

datatable(pemilihanfs.tangga2)

Plot RMSE dan MAE

pemilihanfs.tangga2$breakk <- as.factor(pemilihanfs.tangga2$breakk)
trmse2<-ggplot(data=pemilihanfs.tangga2, aes(x=breakk, y=RMSE, group=1)) +
  geom_line(linetype = "dashed")+
  geom_point()+
  ggtitle("K-Fold Cross Validation") +
  xlab("Breaks") + 
  ylab("Root Mean Square Error")

pemilihanfs.tangga2$breakk <- as.factor(pemilihanfs.tangga2$breakk)
tmae2<-ggplot(data=pemilihanfs.tangga2, aes(x=breakk, y=MAE, group=1)) +
  geom_line(linetype = "dashed")+
  geom_point()+
  ggtitle("K-Fold Cross Validation") +
  xlab("Breaks") + 
  ylab("Mean Absolute Error")

plot_grid(trmse2,tmae2)

Berdasarkan nilai RMSE model piecewise constant terbaik adalah piecewise constant dengan knots=1, sedangkan berdasarkan nilai MAE model terbaik adalah piecewise constant dengan knots=6.

Plot Piecewise Constant

ft12<- ggplot(dataEropa,aes(x=horsepower, y=mpg)) +
                 geom_point(alpha=0.55, color="red") +
  stat_smooth(method = "lm", 
              formula = y~cut(x,2), 
              lty = 1, col = "black",se = F)+
  theme_bw()+
  ggtitle("Piecewise Constant with Knots=1") +
  xlab("Horse Power") + ylab("mile per gallon")

ft62<- ggplot(dataEropa,aes(x=horsepower, y=mpg)) +
                 geom_point(alpha=0.55, color="red") +
  stat_smooth(method = "lm", 
              formula = y~cut(x,7), 
              lty = 1, col = "black",se = F)+
  theme_bw()+
  ggtitle("Piecewise Constant with Knots=6") +
  xlab("Horse Power") + ylab("mile per gallon")

plot_grid(ft12,ft62)

Apabila dilihat secara visual, plot dari piecewise constant dengan knots 6 memiliki kurva yang lebih mulus dibandingkan dengan knots 1, selain itu juga tidak terdapat pola yang tidak teratur pada plot dengan knots=6. Jadi, pada metode piecewise constant ini dipilih knots=6.

fungsitangga12 = lm(dataEropa$mpg ~ cut(dataEropa$horsepower,2))
summary(fungsitangga12)

## 
## Call:
## lm(formula = dataEropa$mpg ~ cut(dataEropa$horsepower, 2))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -10.8755  -3.8755  -0.8755   2.4745  14.4245 
## 
## Coefficients:
##                                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                             29.8755     0.7855  38.035  < 2e-16 ***
## cut(dataEropa$horsepower, 2)(89.5,133]  -8.1334     1.4860  -5.473 7.35e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.498 on 66 degrees of freedom
## Multiple R-squared:  0.3122, Adjusted R-squared:  0.3018 
## F-statistic: 29.96 on 1 and 66 DF,  p-value: 7.352e-07

Natural Cubic Splines

set.seed(018)
val_silang <- vfold_cv(dataEropa,v=10,strata = "mpg")

## Warning: The number of observations in each quantile is below the recommended
## threshold of 20. Stratification will be done with 3 breaks instead.

df <- 2:6

pemilihan.ncs2<- map_dfr(df, function(i) {
  metric_ncs<- map_dfr(val_silang$splits,
                          function(x) {
                          modelns <- lm(mpg ~ ns(horsepower,df=i), data=dataEropa[x$in_id,])
                          predik <- predict(modelns, newdata=dataEropa[-x$in_id,])
                          truth <- dataEropa[-x$in_id,]$mpg
                          rmse <- mlr3measures::rmse(truth = truth, response = predik)
                          mae <- mlr3measures::mae(truth = truth, response = predik)
                          metric <- c(rmse,mae)
                          names(metric) <- c("RMSE","MAE")
                          return(metric)
                          }
                          )
  metric_ncs

# menghitung rata-rata untuk 10 folds
  mean_metric_ncs <- colMeans(metric_ncs)
  mean_metric_ncs
  }
)

pemilihan.ncs2 <- cbind(df=df,pemilihan.ncs2)

Perbandingan RMSE dan MAE

datatable(pemilihan.ncs2)

Plot RMSE dan MAE

pemilihan.ncs2$df <- as.factor(pemilihan.ncs2$df)
nrmse2<-ggplot(data=pemilihan.ncs2, aes(x=df, y=RMSE, group=1)) +
  geom_line(linetype = "dashed")+
  geom_point()+
  ggtitle("K-Fold Cross Validation") +
  xlab("DF") + ylab("Root Mean Square Error")

pemilihan.ncs2$df <- as.factor(pemilihan.ncs2$df)
nmae2<-ggplot(data=pemilihan.ncs2, aes(x=df, y=MAE, group=1)) +
  geom_line(linetype = "dashed")+
  geom_point()+
  ggtitle("K-Fold Cross Validation") +
  xlab("DF") + ylab("Mean Absolute Error")

plot_grid(nrmse2,nmae2)

Berdasarkan nilai RMSE dan MAE model natural cubic splines terbaik adalah natural cubic splines dengan df=2.

Knots

attr(ns(dataEropa$horsepower, df=2),"knots")

##  50% 
## 76.5

Plot Natural Cubic Splines

plotncs2 <- ggplot(dataEropa,aes(x=horsepower, y=mpg)) +
           geom_point(alpha=0.55, color="purple")+
           stat_smooth(method = "lm", 
           formula = y~ns(x, knots = 76.5), 
           lty = 1,col = "black", se=F)+
           theme_bw()+
    ggtitle("Natural Cubic Splines with DF=2") +
  xlab("Horse Power") + 
  ylab("mile per gallon") +
  geom_vline(xintercept = c(76.5), col="grey50", lty=2)
plotncs2

ncspline22 = lm(dataEropa$mpg ~ ns(dataEropa$horsepower, knots = 76.5))
summary(ncspline22)

## 
## Call:
## lm(formula = dataEropa$mpg ~ ns(dataEropa$horsepower, knots = 76.5))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -10.6482  -2.7674  -0.3836   2.2852  12.9801 
## 
## Coefficients:
##                                         Estimate Std. Error t value Pr(>|t|)
## (Intercept)                               35.550      1.739  20.443  < 2e-16
## ns(dataEropa$horsepower, knots = 76.5)1  -21.269      3.807  -5.587 4.90e-07
## ns(dataEropa$horsepower, knots = 76.5)2  -15.549      2.720  -5.717 2.95e-07
##                                            
## (Intercept)                             ***
## ns(dataEropa$horsepower, knots = 76.5)1 ***
## ns(dataEropa$horsepower, knots = 76.5)2 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.899 on 65 degrees of freedom
## Multiple R-squared:  0.4622, Adjusted R-squared:  0.4457 
## F-statistic: 27.93 on 2 and 65 DF,  p-value: 1.756e-09

Perbandingan Model

nilai_rmse2 <- rbind(polinomial2 %>% select(-1) %>% slice_min(RMSE),
                      pemilihanfs.tangga2 %>% select(-1) %>% slice_min(MAE),
                      pemilihan.ncs2%>%  select(-1)%>% slice_min(RMSE)
                      )

Metode <- c("Regresi Polinomial (d=1)",
                "Piecewise Constant (Knots=6)",
                "Natural Cubic Splines (df=2)")

perbandingan2 <- data.frame(Metode,nilai_rmse2)
datatable(perbandingan2)

ggplot(dataEropa,aes(x=horsepower, y=mpg)) +
                 geom_point(alpha=0.55, color="black") +
  stat_smooth(method = "lm", 
              formula = y~poly(x,2,raw=T), 
              lty = 1, aes(col = "Regresi Polynomial"), col = "red",se = F)+  
  stat_smooth(method = "lm", 
              formula = y~cut(x,5), 
              lty = 1, aes(col = "Piecewise Constant"), col = "black",se = F)+
  stat_smooth(method = "lm", 
           formula = y~ns(x, df=2), 
           lty = 1,aes(col = "Natural Cubic Spline"), col = "green", se=F)+
  
  scale_color_manual(values = c("Regresi Polynomial"="red","Piecewise Constant"="black","Natural Cubic Spline"="green"))+theme_bw()

Berdasarkan nilai RMSE dan MAE model terbaik untuk memodelkan data Auto dengan peubah respon mpg dan peubah penjelas horsepower dari kendaraan yang berasal dari Eropa adalah model Regresi polinomial derajat 1.

Pemodelan mpg vs horsepower (Jepang)

Data

dataJepang = Auto[Auto$origin==3,] %>%  select(mpg, horsepower, origin)
datatable(dataJepang)

ggplot(dataJepang,aes(x=horsepower, y=mpg)) + 
  geom_point(alpha=0.55, color="purple") + 
  theme_bw() +
   labs(y="Miles per Gallon", x="Horse Power", title = "Scatter Plot mpg dan horsepower",
        subtitle = "Jepang")

Regresi Polinomial

set.seed(018)
val_silang <- vfold_cv(data = dataJepang,v=10,strata = "mpg")

## Warning: The number of observations in each quantile is below the recommended
## threshold of 20. Stratification will be done with 3 breaks instead.

derajat <- 1:5

polinomial3 <- map_dfr(derajat, function(i) {
  metric_poli <- map_dfr(val_silang$splits,
                        function(x) {
                        mod <- lm(mpg ~ poly(horsepower,derajat=i), data=dataJepang[x$in_id,])
                        predik <- predict(mod, newdata=dataJepang[-x$in_id,])
                        truth <- dataJepang[-x$in_id,]$mpg
                        rmse <- mlr3measures::rmse(truth = truth, response = predik)
                        mae <- mlr3measures::mae(truth = truth, response = predik)
                        metric <- c(rmse,mae)
                        names(metric) <- c("RMSE","MAE")
                        return(metric)
                              }
                            )
  metric_poli
  mean_metric_poli<- colMeans(metric_poli) # menghitung rata-rata untuk 10 folds
  mean_metric_poli
  }
)

polinomial3<- cbind(derajat=derajat,polinomial3)

Perbandingan RMSE dan MAE

datatable(polinomial3)

Plot RMSE dan MAE

polinomial3$derajat <- as.factor(polinomial3$derajat)
prmse3<-ggplot(data=polinomial3, aes(x=derajat, y=RMSE, group=1)) +
  geom_line(linetype = "dashed")+
  geom_point()+
  ggtitle("K-Fold Cross Validation") +
  xlab("Degree of Polynomial") + 
  ylab("Root Mean Square Error")

polinomial3$derajat <- as.factor(polinomial3$derajat)
pmae3<-ggplot(data=polinomial3, aes(x=derajat, y=MAE, group=1)) +
  geom_line(linetype = "dashed")+
  geom_point()+
  ggtitle("K-Fold Cross Validation") +
  xlab("Degree of Polynomial") + 
  ylab("Mean Absolute Error")

plot_grid(prmse3,pmae3)

Berdasarkan nilai RMSE dan MAE dapat disimpulkan bahwa model regresi polinomial terbaik adalah regresi poilinomial derajat 3.

Plot Regresi Polinomial derajat 3

regpol33 <- ggplot(dataJepang,aes(x=horsepower, y=mpg)) + 
  geom_point(alpha=0.55, color="purple") +
  stat_smooth(method = "lm", 
              formula = y~poly(x,3,raw=T), 
              lty = 1, col = "black",se = F)+
  theme_bw()+
  ggtitle("Regresi Polinomial d=3") +
  xlab("Horse Power") + 
  ylab("mile per gallon")  

regpol33

poli33 = lm(mpg ~ poly(horsepower,3), data = dataJepang)
summary(poli33)

## 
## Call:
## lm(formula = mpg ~ poly(horsepower, 3), data = dataJepang)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -8.8602 -2.7115 -0.5224  2.1985 11.6985 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           30.4506     0.4536  67.138  < 2e-16 ***
## poly(horsepower, 3)1 -36.2030     4.0312  -8.981 1.62e-13 ***
## poly(horsepower, 3)2   9.1966     4.0312   2.281   0.0254 *  
## poly(horsepower, 3)3  16.6991     4.0312   4.142 8.92e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.031 on 75 degrees of freedom
## Multiple R-squared:  0.5787, Adjusted R-squared:  0.5618 
## F-statistic: 34.34 on 3 and 75 DF,  p-value: 4.485e-14

Piecewise Constant

set.seed(018)
val_silang <- vfold_cv(dataJepang,v=10,strata = "mpg")

## Warning: The number of observations in each quantile is below the recommended
## threshold of 20. Stratification will be done with 3 breaks instead.

breakk <- 2:5

pemilihanf.tangga3 <- map_dfr(breakk, function(i){
  metric_ftangga <- map_dfr(val_silang$splits,
                          function(x){
                          datatrain <- dataJepang[x$in_id,]
                          datatrain$horsepower <- cut(datatrain$horsepower,i)
                          modelt <- lm(mpg ~ horsepower, data=datatrain)
                          labs_x <- levels(modelt$model[,2])
                          labs_x_b <- cbind(lower = as.numeric( sub("\\((.+),.*", "\\1", labs_x) ),
                                            upper = as.numeric( sub("[^,]*,([^]]*)\\]", "\\1", labs_x)))
                          datatesting <- dataJepang[-x$in_id,]
                          horsepower_new <- cut(datatesting$horsepower,c(labs_x_b[1,1],labs_x_b[,2]))
                          predik <- predict(modelt,   newdata=list(horsepower=horsepower_new))
                          truth <- datatesting$mpg
                          evaluasi <- na.omit(data.frame(truth,predik))
                          rmse <- mlr3measures::rmse(truth = evaluasi$truth,response = evaluasi$predik)
                          mae <- mlr3measures::mae(truth = evaluasi$truth,response = evaluasi$predik)
                          metric <- c(rmse,mae)
                          names(metric) <- c("RMSE","MAE")
                          return(metric)
                          }
                          )

metric_ftangga
mean_metricftangga <- colMeans(metric_ftangga)
mean_metricftangga
})
pemilihanfs.tangga3 <- cbind(breakk=breakk,pemilihanf.tangga3)

Perbandingan RMSE dan MAE

datatable(pemilihanfs.tangga3)

Plot RMSE dan MAE

pemilihanfs.tangga3$breakk <- as.factor(pemilihanfs.tangga3$breakk)
trmse3<-ggplot(data=pemilihanfs.tangga3, aes(x=breakk, y=RMSE, group=1)) +
  geom_line(linetype = "dashed")+
  geom_point()+
  ggtitle("K-Fold Cross Validation") +
  xlab("Breaks") + 
  ylab("Root Mean Square Error")

pemilihanfs.tangga3$breakk <- as.factor(pemilihanfs.tangga3$breakk)
tmae3<-ggplot(data=pemilihanfs.tangga3, aes(x=breakk, y=MAE, group=1)) +
  geom_line(linetype = "dashed")+
  geom_point()+
  ggtitle("K-Fold Cross Validation") +
  xlab("Breaks") + 
  ylab("Mean Absolute Error")

plot_grid(trmse3,tmae3)

Berdasarkan nilai RMSE dan MAE dapat disimpulkan bahwa model Piecewise Constant terbaik adalah Piecewise Constant dengan knots=2.

Plot Fungsi Tangga

ft33<- ggplot(dataJepang,aes(x=horsepower, y=mpg)) +
                 geom_point(alpha=0.55, color="red") +
  stat_smooth(method = "lm", 
              formula = y~cut(x,3), 
              lty = 1, col = "black",se = F)+
  theme_bw()+
  ggtitle("Piecewise Constant with Knots=2") +
  xlab("Horse Power") + ylab("mile per gallon")

ft33

fungsitangga33 = lm(dataJepang$mpg ~ cut(dataJepang$horsepower,3))
summary(fungsitangga33)

## 
## Call:
## lm(formula = dataJepang$mpg ~ cut(dataJepang$horsepower, 3))
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
##  -8.08  -2.53  -1.08   1.97  12.52 
## 
## Coefficients:
##                                         Estimate Std. Error t value Pr(>|t|)
## (Intercept)                              34.0804     0.6381  53.409  < 2e-16
## cut(dataJepang$horsepower, 3)(78.7,105]  -8.3360     1.0492  -7.945 1.40e-11
## cut(dataJepang$horsepower, 3)(105,132]  -10.2804     1.8785  -5.473 5.47e-07
##                                            
## (Intercept)                             ***
## cut(dataJepang$horsepower, 3)(78.7,105] ***
## cut(dataJepang$horsepower, 3)(105,132]  ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.328 on 76 degrees of freedom
## Multiple R-squared:  0.508,  Adjusted R-squared:  0.495 
## F-statistic: 39.23 on 2 and 76 DF,  p-value: 1.979e-12

Natural Cubic Splines

set.seed(018)
val_silang <- vfold_cv(dataJepang,v=10,strata = "mpg")

## Warning: The number of observations in each quantile is below the recommended
## threshold of 20. Stratification will be done with 3 breaks instead.

df <- 2:6

pemilihan.ncs3<- map_dfr(df, function(i) {
  metric_ncs<- map_dfr(val_silang$splits,
                          function(x) {
                          modelns <- lm(mpg ~ ns(horsepower,df=i), data=dataJepang[x$in_id,])
                          predik <- predict(modelns, newdata=dataJepang[-x$in_id,])
                          truth <- dataJepang[-x$in_id,]$mpg
                          rmse <- mlr3measures::rmse(truth = truth, response = predik)
                          mae <- mlr3measures::mae(truth = truth, response = predik)
                          metric <- c(rmse,mae)
                          names(metric) <- c("RMSE","MAE")
                          return(metric)
                          }
                          )
  metric_ncs

# menghitung rata-rata untuk 10 folds
  mean_metric_ncs <- colMeans(metric_ncs)
  mean_metric_ncs
  }
)

pemilihan.ncs3 <- cbind(df=df,pemilihan.ncs3)

Perbandingan RMSE dan MAE

datatable(pemilihan.ncs3)

Plot RMSE dan MAE

pemilihan.ncs3$df <- as.factor(pemilihan.ncs3$df)
nrmse3<-ggplot(data=pemilihan.ncs3, aes(x=df, y=RMSE, group=1)) +
  geom_line(linetype = "dashed")+
  geom_point()+
  ggtitle("K-Fold Cross Validation") +
  xlab("DF") + ylab("Root Mean Square Error")

pemilihan.ncs3$df <- as.factor(pemilihan.ncs3$df)
nmae3<-ggplot(data=pemilihan.ncs3, aes(x=df, y=MAE, group=1)) +
  geom_line(linetype = "dashed")+
  geom_point()+
  ggtitle("K-Fold Cross Validation") +
  xlab("DF") + ylab("Mean Absolute Error")

plot_grid(nrmse3,nmae3)

Berdasarkan nilai RMSE dapat disimpulkan bahwa model natural cubic splines terbaik adalah natural cubic splines dengan df=3, sedangkan berdasarkan nilai MAE model terbaik adalah natural cubic splines dengan df=5.

Knots

attr(ns(dataJepang$horsepower, df=3),"knots")

## 33.33333% 66.66667% 
##        68        90

attr(ns(dataJepang$horsepower, df=5),"knots")

##  20%  40%  60%  80% 
## 65.0 69.2 88.0 96.0

Plot Natural Cubic Splines with df 3 and 5

plotncs3 <- ggplot(dataJepang,aes(x=horsepower, y=mpg)) +
           geom_point(alpha=0.55, color="purple")+
           stat_smooth(method = "lm", 
           formula = y~ns(x, knots=c(68,90)), 
           lty = 1,col = "black", se=F)+
           theme_bw()+
    ggtitle("Natural Cubic Splines with DF=3") +
  xlab("Horse Power") + 
  ylab("mile per gallon") +
  geom_vline(xintercept = c(68,90), col="grey50", lty=2)

plotncs5 <- ggplot(dataJepang,aes(x=horsepower, y=mpg)) +
           geom_point(alpha=0.55, color="purple")+
           stat_smooth(method = "lm", 
           formula = y~ns(x, knots = c(65.0, 69.2, 88.0, 96.0)), 
           lty = 1,col = "black", se=F)+
           theme_bw()+
    ggtitle("Natural Cubic Splines with DF=5") +
  xlab("Horse Power") + 
  ylab("mile per gallon") +
  geom_vline(xintercept = c(65.0, 69.2, 88.0, 96.0), col="grey50", lty=2)

plot_grid(plotncs3,plotncs5)

ncspline33 = lm(dataJepang$mpg ~ ns(dataJepang$horsepower, knots=c(68,90)))
summary(ncspline33)

## 
## Call:
## lm(formula = dataJepang$mpg ~ ns(dataJepang$horsepower, knots = c(68, 
##     90)))
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -8.2021 -2.8354 -0.5684  2.3362 11.6571 
## 
## Coefficients:
##                                               Estimate Std. Error t value
## (Intercept)                                     34.058      1.825  18.658
## ns(dataJepang$horsepower, knots = c(68, 90))1  -14.970      1.898  -7.885
## ns(dataJepang$horsepower, knots = c(68, 90))2   -7.386      4.354  -1.696
## ns(dataJepang$horsepower, knots = c(68, 90))3   -7.729      2.859  -2.703
##                                               Pr(>|t|)    
## (Intercept)                                    < 2e-16 ***
## ns(dataJepang$horsepower, knots = c(68, 90))1 1.97e-11 ***
## ns(dataJepang$horsepower, knots = c(68, 90))2   0.0940 .  
## ns(dataJepang$horsepower, knots = c(68, 90))3   0.0085 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.113 on 75 degrees of freedom
## Multiple R-squared:  0.5615, Adjusted R-squared:  0.544 
## F-statistic: 32.02 on 3 and 75 DF,  p-value: 1.975e-13

Perbandingan Model

nilai_rmse3 <- rbind(polinomial3 %>% select(-1) %>% slice_min(RMSE),
                      pemilihanfs.tangga3 %>% select(-1) %>% slice_min(RMSE),
                      pemilihan.ncs3%>%  select(-1)%>% slice_min(RMSE)
                      )

Metode <- c("Regresi Polinomial (d=3)",
                "Piecewise Constant (knots=2)",
                "Natural Cubic Splines (df=3)")

perbandingan3 <- data.frame(Metode,nilai_rmse2)
datatable(perbandingan3)

ggplot(dataJepang,aes(x=horsepower, y=mpg)) +
                 geom_point(alpha=0.55, color="black") +
  stat_smooth(method = "lm", 
              formula = y~poly(x,3,raw=T), 
              lty = 1, aes(col = "Regresi Polynomial"), col = "red",se = F)+  
  stat_smooth(method = "lm", 
              formula = y~cut(x,3), 
              lty = 1, aes(col = "Piecewise Constant"), col = "black",se = F)+
  stat_smooth(method = "lm", 
           formula = y~ns(x, df=3), 
           lty = 1,aes(col = "Natural Cubic Spline"), col = "green", se=F)+
  
  scale_color_manual(values = c("Regresi Polynomial"="red","Piecewise Constant"="black","Natural Cubic Spline"="green"))+theme_bw()

Berdasarkan nilai RMSE dan MAE model terbaik untuk memodelkan data Auto dengan peubah respon mpg dan peubah penjelas horsepower dari kendaraan yang berasal dari Jepang adalah model Regresi polinomial derajat 3.

Visualisasi Model Terbaik

#Data Auto
plotncs6 <- ggplot(Auto,aes(x=horsepower, y=mpg)) +
           geom_point(alpha=0.55, color="purple")+
           stat_smooth(method = "lm", 
           formula = y~ns(x, knots=c(70.0, 84.0, 93.5, 110.0,150.0)), 
           lty = 1,col = "black", se=F)+
           theme_bw()+
    ggtitle("Natural Cubic Splines df=6") +
  xlab("Horse Power") + 
  ylab("mile per gallon") +
  labs(subtitle = "Data Auto")
  geom_vline(xintercept = c(70.0, 84.0, 93.5, 110.0,150.0), col="grey50", lty=2)

## mapping: xintercept = ~xintercept 
## geom_vline: na.rm = FALSE
## stat_identity: na.rm = FALSE
## position_identity

#Data Amerika
ft9<- ggplot(dataAmerika,aes(x=horsepower, y=mpg)) +
                 geom_point(alpha=0.55, color="red") +
  stat_smooth(method = "lm", 
              formula = y~cut(x,9), 
              lty = 1, col = "black",se = F)+
  theme_bw()+
  ggtitle("Piecewise Constant k=8") +
  labs(subtitle = "Data Amerika")+
  xlab("Horse Power") + ylab("mile per gallon")

plotncs5 <- ggplot(dataAmerika,aes(x=horsepower, y=mpg)) +
           geom_point(alpha=0.55, color="red")+
           stat_smooth(method = "lm", 
           formula = y~ns(x, knots = c(72,88,100,140)), 
           lty = 1,col = "black", se=F)+
           theme_bw()+
    ggtitle("Natural Cubic Splines df=5") +
  labs(subtitle = "Data Amerika")+
  xlab("Horse Power") + 
  ylab("mile per gallon") +
  geom_vline(xintercept = c(72,88,100,140), col="grey50", lty=2)

#Data Eropa
regpol1 <- ggplot(dataEropa,aes(x=horsepower, y=mpg)) + 
  geom_point(alpha=0.55, color="green") +
  stat_smooth(method = "lm", 
              formula = y~poly(x,1,raw=T), 
              lty = 1, col = "black",se = F)+
  theme_bw()+
  ggtitle("Regresi Polinomial d=1") +
  labs(subtitle = "Data Eropa")+
  xlab("Horse Power") + 
  ylab("mile per gallon")  

#Data Jepang
regpol3 <- ggplot(dataJepang,aes(x=horsepower, y=mpg)) + 
  geom_point(alpha=0.55, color="blue") +
  stat_smooth(method = "lm", 
              formula = y~poly(x,3,raw=T), 
              lty = 1, col = "black",se = F)+
  theme_bw()+
  ggtitle("Regresi Polinomial d=3") +
  labs(subtitle = "Data Jepang")+
  xlab("Horse Power") + 
  ylab("mile per gallon")  

plot_grid(plotncs6, ft9, plotncs5, regpol1, regpol3)

Referensi

Dito, G.A. (April 14, 2021). Regresi Polinomial, Regresi Fungsi Tangga dan Regresi Spline di R. Retrieved from https://gerrydito.github.io/Regresi-Polinomial,-Regresi-Fungsi-Tangga-dan-Regresi-Spline/

James, Gareth. et.al. (2013). An Introduction to Statistical Learning With Aplication in R. New York: Springer.

Sartono, B.. (November 1, 2020). Moving Beyond Linearity. Retrieved from https://rpubs.com/bagusco/taklinear