library(tidyverse)

## Warning: package 'tidyverse' was built under R version 4.0.5

## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --

## v ggplot2 3.3.5     v purrr   0.3.4
## v tibble  3.1.4     v dplyr   1.0.7
## v tidyr   1.1.3     v stringr 1.4.0
## v readr   1.4.0     v forcats 0.5.1

## Warning: package 'ggplot2' was built under R version 4.0.5

## Warning: package 'tibble' was built under R version 4.0.5

## Warning: package 'tidyr' was built under R version 4.0.5

## Warning: package 'readr' was built under R version 4.0.3

## Warning: package 'dplyr' was built under R version 4.0.5

## Warning: package 'stringr' was built under R version 4.0.5

## Warning: package 'forcats' was built under R version 4.0.5

## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

library(splines)
library(rsample)

## Warning: package 'rsample' was built under R version 4.0.5

library(mlr3measures)

## Warning: package 'mlr3measures' was built under R version 4.0.5

## In order to avoid name clashes, do not attach 'mlr3measures'. Instead, only load the namespace with `requireNamespace("mlrmeasures")` and access the measures directly via `::`, e.g. `mlr3measures::auc()`.

library(glmnet)

## Warning: package 'glmnet' was built under R version 4.0.5

## Loading required package: Matrix

## 
## Attaching package: 'Matrix'

## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack

## Loaded glmnet 4.1-2

library(gridExtra)

## Warning: package 'gridExtra' was built under R version 4.0.3

## 
## Attaching package: 'gridExtra'

## The following object is masked from 'package:dplyr':
## 
##     combine

library(caret)

## Warning: package 'caret' was built under R version 4.0.3

## Loading required package: lattice

## 
## Attaching package: 'caret'

## The following objects are masked from 'package:mlr3measures':
## 
##     precision, recall, sensitivity, specificity

## The following object is masked from 'package:purrr':
## 
##     lift

library(boot)

## Warning: package 'boot' was built under R version 4.0.3

## 
## Attaching package: 'boot'

## The following object is masked from 'package:lattice':
## 
##     melanoma

Data

Dataset yang akan digunakan yaitu data Auto yang berasal dari package ISLR. Beberapa variabel data yang terdapat di data Auto yaitu:

mpg : miles per gallon
cylinders : Number of cylinders between 4 and 8
displacement : Engine displacement (cu. inches)
horsepower : Engine horsepower
weight : Vehicle weight (lbs.)
acceleration : Time to accelerate from 0 to 60 mph (sec.)
year : Model year (modulo 100)
origin : Origin of car (1. American, 2. European, 3. Japanese)
name : Vehicle name

Data terdiri dari 392 observasi

library(ISLR)

## Warning: package 'ISLR' was built under R version 4.0.5

attach(Auto)

## The following object is masked from package:ggplot2:
## 
##     mpg

head(Auto)

##   mpg cylinders displacement horsepower weight acceleration year origin
## 1  18         8          307        130   3504         12.0   70      1
## 2  15         8          350        165   3693         11.5   70      1
## 3  18         8          318        150   3436         11.0   70      1
## 4  16         8          304        150   3433         12.0   70      1
## 5  17         8          302        140   3449         10.5   70      1
## 6  15         8          429        198   4341         10.0   70      1
##                        name
## 1 chevrolet chevelle malibu
## 2         buick skylark 320
## 3        plymouth satellite
## 4             amc rebel sst
## 5               ford torino
## 6          ford galaxie 500

Dari data Auto akan dimodelkan mpg vs horsepower, dimana dua variabel ini jika digambarkan dalam bentuk scatterplot diperoleh:

ggplot(Auto,aes(x=horsepower, y=mpg)) +
                 geom_point(alpha=0.55, color="coral") +
  theme_bw()+
  ggtitle("scatterplot mpg vs horsepower")

Soal No 1

Lakukan cross validation untuk menghasilkan pemodelan mpg vs horsepower optimal menggunakan 3 metode berikut, kemudian bandingkan hasilnya

Regresi polinomial
Piecewice constant
Natural cubic splines
Perbandingan ketiga model

a. Regresi polinomial

Dari data Auto yang akan dimodelkan dimana variabel responnya yaitu mpg (bahan bakar) dan variabel prediktornya yaitu horsepower. Karena hasil dari pemodelan regresi polinomial bergantung dengan derajat polinomialnya. Jika \(d\) yang digunakan semakin besar dan kurvanya menjadi liar maka untuk menjawab bagian a ini akan dicari terlebih dahulu derajat terbaiknya.

set.seed(1)
deltas <- numeric(10)
model <- list()

for (i in 1:10) {
  fit <- glm(mpg ~ poly(horsepower, i), data = Auto)
  deltas[i] <- cv.glm(Auto, fit, K = 10)$delta[1]
  model[[i]] <- fit
}

plot(1:10, deltas, xlab = "Degree", ylab = "Test MSE", type = "l")
d.min <- which.min(deltas)
d.min

## [1] 7

model_best <- model[[d.min]]
summary(model_best)

## 
## Call:
## glm(formula = mpg ~ poly(horsepower, i), data = Auto)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -15.5171   -2.5682   -0.2082    2.1757   15.2986  
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)            23.446      0.217 108.058  < 2e-16 ***
## poly(horsepower, i)1 -120.138      4.296 -27.966  < 2e-16 ***
## poly(horsepower, i)2   44.090      4.296  10.263  < 2e-16 ***
## poly(horsepower, i)3   -3.949      4.296  -0.919  0.35856    
## poly(horsepower, i)4   -5.188      4.296  -1.208  0.22794    
## poly(horsepower, i)5   13.272      4.296   3.089  0.00215 ** 
## poly(horsepower, i)6   -8.546      4.296  -1.989  0.04737 *  
## poly(horsepower, i)7    7.981      4.296   1.858  0.06397 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 18.4548)
## 
##     Null deviance: 23819.0  on 391  degrees of freedom
## Residual deviance:  7086.6  on 384  degrees of freedom
## AIC: 2265.2
## 
## Number of Fisher Scoring iterations: 2

points(which.min(deltas), deltas[which.min(deltas)], col = "red", cex = 2, pch = 20)

set.seed(1)

cross_val <- vfold_cv(Auto, v=10, strata = "mpg")
derajat <- 1:10

polynom <- map_dfr(derajat, function(i){
  metric_polynom <- map_dfr(cross_val$splits, function(x){
    mod <- lm(mpg ~ poly(horsepower, derajat=i, raw = T), data=Auto[x$in_id,])
    pred <- predict(mod, newdata=Auto[-x$in_id,])
    truth <- Auto[-x$in_id,]$mpg
    rmse <- mlr3measures::rmse(truth = truth, response = pred)
    mae <- mlr3measures::mae(truth = truth, response = pred)
    metric <- c(rmse, mae)
    names(metric) <- c("rmse","mae")
    return(metric)
    }
  )
  metric_polynom
  }
  )
polynom <- cbind(derajat = derajat, polynom)
tibble(polynom)

## # A tibble: 100 x 3
##    derajat  rmse   mae
##      <int> <dbl> <dbl>
##  1       1  5.62  4.39
##  2       2  4.46  3.50
##  3       3  4.75  4.10
##  4       4  4.84  3.74
##  5       5  4.40  3.33
##  6       6  4.64  3.73
##  7       7  6.38  5.11
##  8       8  4.85  3.61
##  9       9  4.48  3.48
## 10      10  4.44  3.45
## # ... with 90 more rows

#berdasarkan rmse
polynom %>% slice_min(rmse)

##   derajat     rmse      mae
## 1       5 3.594957 2.684384

#berdasarkan mae
polynom %>% slice_min(mae)

##   derajat     rmse      mae
## 1       5 3.600731 2.657939

#menghitung rata-rata untuk 10 folds
mean_polynom <- colMeans(polynom)
mean_polynom

##  derajat     rmse      mae 
## 5.500000 4.384655 3.314488

polinomial5 <- ggplot(Auto, aes(x=horsepower, y=mpg)) + 
  geom_point(alpha=0.55, color="coral") +
  stat_smooth(method = "lm", formula = y~poly(x, 5, raw=T), lty = 1, col = "blue",se = F) +
  theme_bw()+
  ggtitle("Regresi polinomial d=5")

polinomial7 <- ggplot(Auto, aes(x=horsepower, y=mpg)) + 
  geom_point(alpha=0.55, color="coral") +
  stat_smooth(method = "lm", formula = y~poly(x, 7, raw=T), lty = 1, col = "blue",se = F) +
  theme_bw()+
  ggtitle("Regresi polinomial d=7")

grid.arrange(polinomial5, polinomial7, ncol=2)

mod_polinomial5 = lm(mpg ~ poly(horsepower,5),data=Auto)
summary(mod_polinomial5)

## 
## Call:
## lm(formula = mpg ~ poly(horsepower, 5), data = Auto)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -15.4326  -2.5285  -0.2925   2.1750  15.9730 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)            23.4459     0.2185 107.308  < 2e-16 ***
## poly(horsepower, 5)1 -120.1377     4.3259 -27.772  < 2e-16 ***
## poly(horsepower, 5)2   44.0895     4.3259  10.192  < 2e-16 ***
## poly(horsepower, 5)3   -3.9488     4.3259  -0.913  0.36190    
## poly(horsepower, 5)4   -5.1878     4.3259  -1.199  0.23117    
## poly(horsepower, 5)5   13.2722     4.3259   3.068  0.00231 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.326 on 386 degrees of freedom
## Multiple R-squared:  0.6967, Adjusted R-squared:  0.6928 
## F-statistic: 177.4 on 5 and 386 DF,  p-value: < 2.2e-16

b. Piecewise constant

set.seed(1)
cross_val <- vfold_cv(Auto,v=10,strata = "mpg")
nbreak <- 2:8 #atau knotsnya 1-7
best_tangga <- map_dfr(nbreak, function(i){
metric_tangga <- map_dfr(cross_val$splits, function(x){
      training <- Auto[x$in_id,]
      training$horsepower <- cut(training$horsepower,i)
      mod <- lm(mpg ~ horsepower, data=training)
      labs_x <- levels(mod$model[,2])
      labs_x_breaks <- cbind(
        lower = as.numeric(sub("\\((.+),.*", "\\1", labs_x)),
        upper = as.numeric(sub("[^,]*,([^]]*)\\]", "\\1", labs_x)))
      testing <- Auto[-x$in_id,]
      horsepower_new <- cut(testing$horsepower, c(labs_x_breaks[1,1], labs_x_breaks[,2]))
      pred <- predict(mod, newdata=list(horsepower=horsepower_new))
      truth <- testing$mpg
      data_eval <- na.omit(data.frame(truth,pred))
      rmse <- mlr3measures::rmse(truth = data_eval$truth, response = data_eval$pred)
      mae <- mlr3measures::mae(truth = data_eval$truth, response = data_eval$pred)
      metric <- c(rmse,mae)
      names(metric) <- c("rmse","mae")
      return(metric)
      }
      )
metric_tangga
# menghitung rata-rata untuk 10 folds
mean_metric_tangga <- colMeans(metric_tangga)
mean_metric_tangga
}
)

best_tangga <- cbind(breaks=nbreak,best_tangga)
# menampilkan hasil all breaks
best_tangga

##   breaks     rmse      mae
## 1      2 6.142665 4.792166
## 2      3 5.757634 4.523240
## 3      4 4.975044 3.786801
## 4      5 4.672002 3.564542
## 5      6 4.610925 3.514587
## 6      7 4.441881 3.330109
## 7      8 4.412751 3.386265

#berdasarkan rmse
best_tangga %>% slice_min(rmse)

##   breaks     rmse      mae
## 1      8 4.412751 3.386265

#berdasarkan mae
best_tangga %>% slice_min(mae)

##   breaks     rmse      mae
## 1      7 4.441881 3.330109

karena dari nilai RMSE dan MAE nilai yang terendah terdapat dua breaks yang terbaik yaitu 7 dan 8 maka akan divisualisasikan untuk mengetahui breaks terbaik berdasarkan plot yang dihasilkan:

Piecewise7 <- ggplot(Auto,aes(x=horsepower, y=mpg)) +
                 geom_point(alpha=0.55, color="coral") +
  stat_smooth(method = "lm",
               formula = y~cut(x,7), 
               lty = 1, col = "blue",se = F)+
  theme_bw()+
ggtitle("Piecewise constant (Knots=6)")

Piecewise8 <- ggplot(Auto,aes(x=horsepower, y=mpg)) +
                 geom_point(alpha=0.55, color="coral") +
  stat_smooth(method = "lm",
               formula = y~cut(x,8), 
               lty = 1, col = "blue",se = F)+
  theme_bw()+
ggtitle("Piecewise constant (Knots=7)")

grid.arrange(Piecewise7, Piecewise8, ncol=2)

Dari plot diperoleh bahwa dengan breks=8 lebih mendekati data sehingga dipilih breaks = 8.

Hasil pemodelan:

mod_tangga8 = lm(mpg ~ cut(horsepower,8),data=Auto)
summary(mod_tangga8) #knots = 7

## 
## Call:
## lm(formula = mpg ~ cut(horsepower, 8), data = Auto)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -11.9471  -2.6757  -0.1533   2.4015  14.5529 
## 
## Coefficients:
##                             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                  33.9085     0.5771   58.76   <2e-16 ***
## cut(horsepower, 8)(69,92]    -6.9614     0.6910  -10.07   <2e-16 ***
## cut(horsepower, 8)(92,115]  -12.7551     0.7425  -17.18   <2e-16 ***
## cut(horsepower, 8)(115,138] -15.6656     1.1264  -13.91   <2e-16 ***
## cut(horsepower, 8)(138,161] -18.7799     0.8568  -21.92   <2e-16 ***
## cut(horsepower, 8)(161,184] -19.9735     1.1470  -17.41   <2e-16 ***
## cut(horsepower, 8)(184,207] -21.1228     1.7720  -11.92   <2e-16 ***
## cut(horsepower, 8)(207,230] -21.0085     1.5159  -13.86   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.433 on 384 degrees of freedom
## Multiple R-squared:  0.6832, Adjusted R-squared:  0.6774 
## F-statistic: 118.3 on 7 and 384 DF,  p-value: < 2.2e-16

c. Natural cubic splines

Fungsi spline kubik diketahui bergantung pada knot, tetapi kita tahu jika knots yang dipilih terlalu banyak akan menghasilkan model yang overfit. Sehingga, dipilih knot dari 1-7 untuk dicari knot terbaik yang sesuai dengan data yang akan dimodelkan.

set.seed(1)
cross_val <- vfold_cv(Auto,v=10,strata = "mpg")

df <- 2:8 #knots 1-7

best_spline <- map_dfr(df, function(i){
  metric_spline <- map_dfr(cross_val$splits,
    function(x){
      mod <- lm(mpg ~ ns(horsepower,df=i), data=Auto[x$in_id,])
      pred <- predict(mod, newdata=Auto[-x$in_id,])
      truth <- Auto[-x$in_id,]$mpg
      rmse <- mlr3measures::rmse(truth = truth, response = pred)
      mae <- mlr3measures::mae(truth = truth, response = pred)
      metric <- c(rmse,mae)
      names(metric) <- c("rmse","mae")
      return(metric)
      }
)
  metric_spline
}
)
best_spline <- cbind(df=df, best_spline)
tibble(best_spline)

## # A tibble: 70 x 3
##       df  rmse   mae
##    <int> <dbl> <dbl>
##  1     2  4.73  3.44
##  2     3  4.45  3.35
##  3     4  4.12  3.38
##  4     5  4.17  3.03
##  5     6  3.76  2.83
##  6     7  4.13  3.18
##  7     8  5.80  4.32
##  8     2  3.94  2.78
##  9     3  3.79  2.81
## 10     4  4.34  3.42
## # ... with 60 more rows

Dari perhitungan berdasarkan nilai RMSE dan MAE terkecil diperoleh df=3

#berdasarkan rmse
best_spline %>% slice_min(rmse)

##   df     rmse      mae
## 1  3 3.522074 2.614708

#berdasarkan mae
best_spline %>% slice_min(mae)

##   df     rmse      mae
## 1  3 3.522074 2.614708

menghitung rata-rata untuk 10 folds

mean_best_spline <- colMeans(best_spline)
mean_best_spline

##       df     rmse      mae 
## 5.000000 4.311911 3.242990

dim(ns(Auto$horsepower, knots = c(100, 200)))

## [1] 392   3

atau bisa juga dengan fungsi df

dim(ns(Auto$horsepower, df=3))

## [1] 392   3

#nilai knots yang ditentukan oleh komputer
attr(ns(Auto$horsepower, df=3),"knots")

## 33.33333% 66.66667% 
##        84       110

ggplot(Auto,aes(x=horsepower, y=mpg)) +
                 geom_point(alpha=0.55, color="coral")+
    stat_smooth(method = "lm", 
               formula = y~ns(x, knots = c(60, 97, 134, 171, 208)), 
               lty = 1,se=F)+
  theme_bw()+
ggtitle("Natural cubic spline df=3 (knots = 100, 200)")

mod_spline3 = lm(mpg ~ ns(horsepower, knots = c(100, 200)), data=Auto)
summary(mod_spline3)

## 
## Call:
## lm(formula = mpg ~ ns(horsepower, knots = c(100, 200)), data = Auto)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -14.8663  -2.4810  -0.1824   2.2352  15.9235 
## 
## Coefficients:
##                                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                           38.2682     0.7899  48.446  < 2e-16 ***
## ns(horsepower, knots = c(100, 200))1 -21.6528     1.2771 -16.955  < 2e-16 ***
## ns(horsepower, knots = c(100, 200))2 -43.5820     2.5991 -16.768  < 2e-16 ***
## ns(horsepower, knots = c(100, 200))3 -10.6246     1.6747  -6.344 6.24e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.359 on 388 degrees of freedom
## Multiple R-squared:  0.6905, Adjusted R-squared:  0.6881 
## F-statistic: 288.5 on 3 and 388 DF,  p-value: < 2.2e-16

d. Perbandingan ketiga model

nilai_rmse <- rbind(3.594957,
                    4.412751,
                    3.522074
                     )
nilai_mae <- rbind(2.657939,
                   3.386265,
                   2.614708
                     )
nama_model <- c("Regresi polinomial (d=5)",
               "Piecewise constant (breaks=8)",
               "Natural cubic spline(knots=100, 200)"
)

tibble(hasil <- data.frame(nama_model, nilai_rmse, nilai_mae))

## # A tibble: 3 x 3
##   nama_model                           nilai_rmse nilai_mae
##   <chr>                                     <dbl>     <dbl>
## 1 Regresi polinomial (d=5)                   3.59      2.66
## 2 Piecewise constant (breaks=8)              4.41      3.39
## 3 Natural cubic spline(knots=100, 200)       3.52      2.61

polinomial5 <- ggplot(Auto, aes(x=horsepower, y=mpg)) + 
  geom_point(alpha=0.55, color="coral") +
  stat_smooth(method = "lm", formula = y~poly(x, 5, raw=T), lty = 1, col = "blue",se = F) +
  theme_bw()+
  ggtitle("Regpol (d=5)")

Piecewise8 <- ggplot(Auto,aes(x=horsepower, y=mpg)) +
                 geom_point(alpha=0.55, color="coral") +
  stat_smooth(method = "lm",
               formula = y~cut(x,8), 
               lty = 1, col = "blue",se = F)+
  theme_bw()+
ggtitle("FT (Knots=7)")

ncs3 <- ggplot(Auto,aes(x=horsepower, y=mpg)) +
                 geom_point(alpha=0.55, color="coral")+
    stat_smooth(method = "lm", 
               formula = y~ns(x, knots = c(60, 97, 134, 171, 208)), 
               lty = 1,se=F)+
  theme_bw()+
ggtitle("NCS (df=3)")

grid.arrange(polinomial5, Piecewise8, ncs3, ncol=3)

Jika berdasarkan nilai RMSE dan MAE yang terkecil maka model terbaik yang diperoleh adalah Natural Cubic Spline dengan df=3 dimana kurvanya lebih mulus.

Soal No 2

Lakukan hal di atas untuk masing-masing sub-set data berdasarkan asal negara produsennya (Amerika, Eropa, Jepang). Plot model terbaik dalam satu frame. Berikan ulasan terhadap hasil Anda.

a. Amerika

Auto_US <- Auto[Auto$origin == 1, ] %>% select(mpg, horsepower, origin)
tibble(Auto_US)

## # A tibble: 245 x 3
##      mpg horsepower origin
##    <dbl>      <dbl>  <dbl>
##  1    18        130      1
##  2    15        165      1
##  3    18        150      1
##  4    16        150      1
##  5    17        140      1
##  6    15        198      1
##  7    14        220      1
##  8    14        215      1
##  9    14        225      1
## 10    15        190      1
## # ... with 235 more rows

Dari data Auto akan dimodelkan mpg vs horsepower yang berasal dari Amerika (Auto_US) saja atau sebanyak 245 observasi, dimana dua variabel ini jika digambarkan dalam bentuk scatterplot yaitu:

ggplot(Auto_US,aes(x=horsepower, y=mpg)) +
                 geom_point(alpha=0.55, color="coral") + 
                 theme_bw()+
ggtitle("mpg vs horsepower, Amerika")

a.1 Regresi polinomial

set.seed(1)
deltas <- numeric(10)
model <- list()

for (i in 1:10) {
  fit <- glm(mpg ~ poly(horsepower, i), data = Auto_US)
  deltas[i] <- cv.glm(Auto_US, fit, K = 10)$delta[1]
  model[[i]] <- fit
}

plot(1:10, deltas, xlab = "Degree", ylab = "Test MSE", type = "l")
d.min <- which.min(deltas)
d.min

## [1] 7

model_best <- model[[d.min]]
summary(model_best)

## 
## Call:
## glm(formula = mpg ~ poly(horsepower, i), data = Auto_US)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -14.2229   -2.1009   -0.3205    2.0260   13.3482  
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           20.0335     0.2426  82.568  < 2e-16 ***
## poly(horsepower, i)1 -75.6095     3.7977 -19.909  < 2e-16 ***
## poly(horsepower, i)2  29.4684     3.7977   7.759 2.53e-13 ***
## poly(horsepower, i)3  -5.8086     3.7977  -1.529    0.127    
## poly(horsepower, i)4   1.8002     3.7977   0.474    0.636    
## poly(horsepower, i)5   6.0818     3.7977   1.601    0.111    
## poly(horsepower, i)6  -3.0887     3.7977  -0.813    0.417    
## poly(horsepower, i)7   5.8186     3.7977   1.532    0.127    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 14.42288)
## 
##     Null deviance: 10120.8  on 244  degrees of freedom
## Residual deviance:  3418.2  on 237  degrees of freedom
## AIC: 1359
## 
## Number of Fisher Scoring iterations: 2

points(which.min(deltas), deltas[which.min(deltas)], col = "red", cex = 2, pch = 20)

set.seed(1)

cross_val <- vfold_cv(Auto_US, v=10, strata = "mpg")
derajat <- 1:10

polynomUS <- map_dfr(derajat, function(i){
  metric_polynomUS <- map_dfr(cross_val$splits, function(x){
    mod <- lm(mpg ~ poly(horsepower, derajat=i), data=Auto_US[x$in_id,])
    pred <- predict(mod, newdata=Auto_US[-x$in_id,])
    truth <- Auto_US[-x$in_id,]$mpg
    rmse <- mlr3measures::rmse(truth = truth, response = pred)
    mae <- mlr3measures::mae(truth = truth, response = pred)
    metric <- c(rmse, mae)
    names(metric) <- c("rmse","mae")
    return(metric)
    }
  )
  metric_polynomUS
  }
  )
polynomUS <- cbind(derajat = derajat, polynomUS)
tibble(polynomUS)

## # A tibble: 100 x 3
##    derajat  rmse   mae
##      <int> <dbl> <dbl>
##  1       1  4.39  3.49
##  2       2  3.25  2.90
##  3       3  3.76  3.14
##  4       4  3.95  3.32
##  5       5  4.25  2.97
##  6       6  3.92  2.90
##  7       7  5.17  3.76
##  8       8  4.16  2.99
##  9       9  4.43  3.70
## 10      10  5.21  4.36
## # ... with 90 more rows

#berdasarkan rmse
polynomUS %>% slice_min(rmse)

##   derajat     rmse      mae
## 1       3 2.667124 2.237141

#berdasarkan mae
polynomUS %>% slice_min(mae)

##   derajat    rmse      mae
## 1       6 2.67477 2.071569

#menghitung rata-rata untuk 10 folds
mean_polynomUS <- colMeans(polynomUS)
mean_polynomUS

##  derajat     rmse      mae 
## 5.500000 4.268948 3.074561

polinomialUS3 <- ggplot(Auto_US, aes(x=horsepower, y=mpg)) + 
  geom_point(alpha=0.55, color="coral") +
  stat_smooth(method = "lm", formula = y~poly(x, 3), lty = 1, col = "blue",se = F) +
  theme_bw()+
ggtitle("Regpol (d=3)")

polinomialUS6 <- ggplot(Auto_US, aes(x=horsepower, y=mpg)) + 
  geom_point(alpha=0.55, color="coral") +
  stat_smooth(method = "lm", formula = y~poly(x, 6), lty = 1, col = "blue",se = F) +
  theme_bw()+
ggtitle("Regpol (d=6)")

polinomialUS7 <- ggplot(Auto_US, aes(x=horsepower, y=mpg)) + 
  geom_point(alpha=0.55, color="coral") +
  stat_smooth(method = "lm", formula = y~poly(x, 7), lty = 1, col = "blue",se = F) +
  theme_bw()+
ggtitle("Regpol (d=7)")

grid.arrange(polinomialUS3, polinomialUS6, polinomialUS7, ncol=3)

mod_polinomialUS3 = lm(mpg ~ poly(horsepower,3),data=Auto_US)
summary(mod_polinomialUS3)

## 
## Call:
## lm(formula = mpg ~ poly(horsepower, 3), data = Auto_US)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -13.3092  -2.3199  -0.2081   2.0794  13.2928 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           20.0335     0.2435  82.262  < 2e-16 ***
## poly(horsepower, 3)1 -75.6095     3.8119 -19.835  < 2e-16 ***
## poly(horsepower, 3)2  29.4684     3.8119   7.731 2.89e-13 ***
## poly(horsepower, 3)3  -5.8086     3.8119  -1.524    0.129    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.812 on 241 degrees of freedom
## Multiple R-squared:  0.654,  Adjusted R-squared:  0.6497 
## F-statistic: 151.8 on 3 and 241 DF,  p-value: < 2.2e-16

a.2 Piecewice constant

set.seed(1)
cross_val <- vfold_cv(Auto_US,v=10,strata = "mpg")
breaks <- 2:8 #atau knotsnya 1-7
best_tanggaUS <- map_dfr(breaks, function(i){
metric_tanggaUS <- map_dfr(cross_val$splits, function(x){
      training <- Auto_US[x$in_id,]
      training$horsepower <- cut(training$horsepower,i)
      mod <- lm(mpg ~ horsepower, data=training)
      labs_x <- levels(mod$model[,2])
      labs_x_breaks <- cbind(
        lower = as.numeric(sub("\\((.+),.*", "\\1", labs_x)),
        upper = as.numeric(sub("[^,]*,([^]]*)\\]", "\\1", labs_x)))
      testing <- Auto_US[-x$in_id,]
      horsepower_new <- cut(testing$horsepower, c(labs_x_breaks[1,1], labs_x_breaks[,2]))
      pred <- predict(mod, newdata=list(horsepower=horsepower_new))
      truth <- testing$mpg
      data_eval <- na.omit(data.frame(truth,pred))
      rmse <- mlr3measures::rmse(truth = data_eval$truth, response = data_eval$pred)
      mae <- mlr3measures::mae(truth = data_eval$truth, response = data_eval$pred)
      metric <- c(rmse,mae)
      names(metric) <- c("rmse","mae")
      return(metric)
      }
      )
metric_tanggaUS
# menghitung rata-rata untuk 10 folds
mean_metric_tanggaUS <- colMeans(metric_tanggaUS)
mean_metric_tanggaUS
}
)

best_tanggaUS <- cbind(breaks=breaks,best_tanggaUS)
# menampilkan hasil all breaks
tibble(best_tanggaUS)

## # A tibble: 7 x 3
##   breaks  rmse   mae
##    <int> <dbl> <dbl>
## 1      2  5.05  3.87
## 2      3  4.78  3.69
## 3      4  4.14  3.11
## 4      5  4.05  3.16
## 5      6  4.27  3.27
## 6      7  4.17  3.18
## 7      8  3.95  2.98

#berdasarkan rmse
best_tanggaUS %>% slice_min(rmse)

##   breaks     rmse     mae
## 1      8 3.954823 2.97661

#berdasarkan mae
best_tanggaUS %>% slice_min(mae)

##   breaks     rmse     mae
## 1      8 3.954823 2.97661

karena dari nilai RMSE dan MAE nilai yang terkecil terdapat break yang terbaik yaitu 8 maka:

ggplot(Auto_US ,aes(x=horsepower, y=mpg)) +
                 geom_point(alpha=0.55, color="coral") +
  stat_smooth(method = "lm",
               formula = y~cut(x,8), 
               lty = 1, col = "blue",se = F)+
  theme_bw()+
ggtitle("Piecewise constant Knots=7")

mod_tangga8 = lm(mpg ~ cut(horsepower,8),data=Auto_US)
summary(mod_tangga8)

## 
## Call:
## lm(formula = mpg ~ cut(horsepower, 8), data = Auto_US)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -14.420  -1.900   0.065   2.159  13.159 
## 
## Coefficients:
##                               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                    29.4200     0.8729  33.706  < 2e-16 ***
## cut(horsepower, 8)(74.2,96.5]  -4.5787     0.9824  -4.661 5.26e-06 ***
## cut(horsepower, 8)(96.5,119]   -9.6780     1.0328  -9.371  < 2e-16 ***
## cut(horsepower, 8)(119,141]   -12.7330     1.1935 -10.669  < 2e-16 ***
## cut(horsepower, 8)(141,163]   -14.7050     1.0690 -13.756  < 2e-16 ***
## cut(horsepower, 8)(163,186]   -15.4850     1.2344 -12.545  < 2e-16 ***
## cut(horsepower, 8)(186,208]   -16.6343     1.7143  -9.704  < 2e-16 ***
## cut(horsepower, 8)(208,230]   -16.5200     1.5118 -10.927  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.904 on 237 degrees of freedom
## Multiple R-squared:  0.6432, Adjusted R-squared:  0.6326 
## F-statistic: 61.03 on 7 and 237 DF,  p-value: < 2.2e-16

a.3 Natural cubic splines

set.seed(1)
cross_val <- vfold_cv(Auto_US,v=10,strata = "mpg")

df <- 2:8 #knots 1-7

best_splineUS <- map_dfr(df, function(i){
  metric_splineUS <- map_dfr(cross_val$splits,
    function(x){
      mod <- lm(mpg ~ ns(horsepower,df=i), data=Auto_US[x$in_id,])
      pred <- predict(mod, newdata=Auto_US[-x$in_id,])
      truth <- Auto_US[-x$in_id,]$mpg
      rmse <- mlr3measures::rmse(truth = truth, response = pred)
      mae <- mlr3measures::mae(truth = truth, response = pred)
      metric <- c(rmse,mae)
      names(metric) <- c("rmse","mae")
      return(metric)
      }
)
  metric_splineUS
}
)
best_splineUS <- cbind(df=df, best_splineUS)
tibble(best_splineUS)

## # A tibble: 70 x 3
##       df  rmse   mae
##    <int> <dbl> <dbl>
##  1     2  4.45  3.70
##  2     3  2.92  2.53
##  3     4  2.82  2.37
##  4     5  3.49  2.83
##  5     6  4.00  2.53
##  6     7  2.90  2.21
##  7     8  4.72  3.34
##  8     2  3.83  2.63
##  9     3  3.57  2.66
## 10     4  5.07  4.21
## # ... with 60 more rows

#berdasarkan rmse
best_splineUS %>% slice_min(rmse)

##   df     rmse      mae
## 1  8 2.373059 1.974183

#berdasarkan mae
best_splineUS %>% slice_min(mae)

##   df     rmse      mae
## 1  5 2.373138 1.937187

menghitung rata-rata untuk 10 folds

mean_best_splineUS <- colMeans(best_splineUS)
mean_best_splineUS

##       df     rmse      mae 
## 5.000000 3.800996 2.894464

dim(ns(Auto_US$horsepower, knots = c(60, 105, 150, 195)))

## [1] 245   5

dim(ns(Auto_US$horsepower, knots = c(60, 85, 110, 135, 160, 185, 210)))

## [1] 245   8

NCSUS5 <- ggplot(Auto_US,aes(x=horsepower, y=mpg)) +
                 geom_point(alpha=0.55, color="coral")+
    stat_smooth(method = "lm", 
               formula = y~ns(x, knots = c(60, 105, 150, 195)), 
               lty = 1,se=F)+
  theme_bw()+
ggtitle("NCS (df=5)")

NCSUS8 <- ggplot(Auto_US,aes(x=horsepower, y=mpg)) +
                 geom_point(alpha=0.55, color="coral")+
    stat_smooth(method = "lm", 
               formula = y~ns(x, knots = c(60, 85, 110, 135, 160, 185, 210)), 
               lty = 1,se=F)+
  theme_bw()+
ggtitle("NCS (df=8)")

grid.arrange(NCSUS5, NCSUS8, ncol=2)

mod_spline5 = lm(mpg ~ ns(horsepower, knots = c(60, 105, 150, 195)),data=Auto_US)
summary(mod_spline5) #knots = 5

## 
## Call:
## lm(formula = mpg ~ ns(horsepower, knots = c(60, 105, 150, 195)), 
##     data = Auto_US)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -13.6161  -2.2149  -0.3081   2.0518  13.0756 
## 
## Coefficients:
##                                               Estimate Std. Error t value
## (Intercept)                                     33.438      2.467  13.556
## ns(horsepower, knots = c(60, 105, 150, 195))1  -15.866      2.232  -7.108
## ns(horsepower, knots = c(60, 105, 150, 195))2  -17.755      3.009  -5.901
## ns(horsepower, knots = c(60, 105, 150, 195))3  -21.007      2.296  -9.151
## ns(horsepower, knots = c(60, 105, 150, 195))4  -23.779      5.358  -4.438
## ns(horsepower, knots = c(60, 105, 150, 195))5  -17.260      1.955  -8.828
##                                               Pr(>|t|)    
## (Intercept)                                    < 2e-16 ***
## ns(horsepower, knots = c(60, 105, 150, 195))1 1.35e-11 ***
## ns(horsepower, knots = c(60, 105, 150, 195))2 1.23e-08 ***
## ns(horsepower, knots = c(60, 105, 150, 195))3  < 2e-16 ***
## ns(horsepower, knots = c(60, 105, 150, 195))4 1.39e-05 ***
## ns(horsepower, knots = c(60, 105, 150, 195))5 2.32e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.812 on 239 degrees of freedom
## Multiple R-squared:  0.6569, Adjusted R-squared:  0.6497 
## F-statistic: 91.53 on 5 and 239 DF,  p-value: < 2.2e-16

a.4 Perbandingan ketiga model

nilai_rmse <- rbind(2.667124,
                    3.954823,
                    2.373138
                     )
nilai_mae <- rbind(2.237141,
                   2.97661,
                   1.937187
                     )
nama_model <- c("Regresi polinomial 'US' (d=5)",
               "Piecewise constant 'US' (breaks=8)",
               "Natural cubic spline 'US' (knots=60, 105, 150, 195)"
)

tibble(hasil <- data.frame(nama_model, nilai_rmse, nilai_mae))

## # A tibble: 3 x 3
##   nama_model                                          nilai_rmse nilai_mae
##   <chr>                                                    <dbl>     <dbl>
## 1 Regresi polinomial 'US' (d=5)                             2.67      2.24
## 2 Piecewise constant 'US' (breaks=8)                        3.95      2.98
## 3 Natural cubic spline 'US' (knots=60, 105, 150, 195)       2.37      1.94

polinomialUS3 <- ggplot(Auto_US, aes(x=horsepower, y=mpg)) + 
  geom_point(alpha=0.55, color="coral") +
  stat_smooth(method = "lm", formula = y~poly(x, 3), lty = 1, col = "blue",se = F) +
  theme_bw()+
ggtitle("Regpol 'US' (d=3)")

PiecewiseUS8 <- ggplot(Auto_US ,aes(x=horsepower, y=mpg)) +
                 geom_point(alpha=0.55, color="coral") +
  stat_smooth(method = "lm",
               formula = y~cut(x,8), 
               lty = 1, col = "blue",se = F)+
  theme_bw()+
ggtitle("FT 'US' Knots=7")

NCSUS5 <- ggplot(Auto_US,aes(x=horsepower, y=mpg)) +
                 geom_point(alpha=0.55, color="coral")+
    stat_smooth(method = "lm", 
               formula = y~ns(x, knots = c(60, 105, 150, 195)), 
               lty = 1,se=F)+
  theme_bw()+
ggtitle("NCS 'US' (df=5)")

grid.arrange(polinomialUS3, PiecewiseUS8, NCSUS5, ncol=3)

Berdasarkan nilai RMSE dan MAE terkecil diperoleh bahwa model yang terbaik yaitu Natural Cubic Spline df=5.

b. European

Auto_EU <- Auto[Auto$origin == 2, ] %>% select(mpg, horsepower, origin)
tibble(Auto_EU)

## # A tibble: 68 x 3
##      mpg horsepower origin
##    <dbl>      <dbl>  <dbl>
##  1    26         46      2
##  2    25         87      2
##  3    24         90      2
##  4    25         95      2
##  5    26        113      2
##  6    28         90      2
##  7    30         70      2
##  8    30         76      2
##  9    27         60      2
## 10    23         54      2
## # ... with 58 more rows

Dari data Auto akan dimodelkan mpg vs horsepower yang berasal dari Eropa (Auto_EU) saja atau sebanyak 68 observasi, dimana dua variabel ini jika digambarkan dalam bentuk scatterplot yaitu:

ggplot(Auto_EU,aes(x=horsepower, y=mpg)) +
                 geom_point(alpha=0.55, color="coral") + 
                 theme_bw()+
ggtitle("mpg vs horsepower, Eropa")

b.1 Regresi polinomial

set.seed(1)
deltas <- numeric(10)
model <- list()

# metode polinomial
for (i in 1:10) {
  fit <- glm(mpg ~ poly(horsepower, i), data = Auto_EU)
  deltas[i] <- cv.glm(Auto_EU, fit, K = 10)$delta[1]
  model[[i]] <- fit
}

plot(1:10, deltas, xlab = "Degree", ylab = "Test MSE", type = "l")
d.min <- which.min(deltas)
d.min

## [1] 1

model_best <- model[[d.min]]
summary(model_best)

## 
## Call:
## glm(formula = mpg ~ poly(horsepower, i), data = Auto_EU)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -10.4946   -2.8387   -0.4171    2.2148   12.8858  
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          27.6029     0.5898  46.800  < 2e-16 ***
## poly(horsepower, i) -36.6027     4.8637  -7.526 1.87e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 23.65553)
## 
##     Null deviance: 2901.0  on 67  degrees of freedom
## Residual deviance: 1561.3  on 66  degrees of freedom
## AIC: 412.07
## 
## Number of Fisher Scoring iterations: 2

points(which.min(deltas), deltas[which.min(deltas)], col = "red", cex = 2, pch = 20)

set.seed(1)

cross_val <- vfold_cv(Auto_EU, v=10, strata = "mpg")

## Warning: The number of observations in each quantile is below the recommended
## threshold of 20. Stratification will be done with 3 breaks instead.

derajat <- 1:10

polynomEU <- map_dfr(derajat, function(i){
  metric_polynomEU <- map_dfr(cross_val$splits, function(x){
    mod <- lm(mpg ~ poly(horsepower, derajat=i), data=Auto_EU[x$in_id,])
    pred <- predict(mod, newdata=Auto_EU[-x$in_id,])
    truth <- Auto_EU[-x$in_id,]$mpg
    rmse <- mlr3measures::rmse(truth = truth, response = pred)
    mae <- mlr3measures::mae(truth = truth, response = pred)
    metric <- c(rmse, mae)
    names(metric) <- c("rmse","mae")
    return(metric)
    }
  )
  metric_polynomEU
  }
  )
polynomEU <- cbind(derajat = derajat, polynomEU)
tibble(polynomEU)

## # A tibble: 100 x 3
##    derajat  rmse   mae
##      <int> <dbl> <dbl>
##  1       1  7.54  6.35
##  2       2  4.21  3.38
##  3       3  4.93  4.09
##  4       4  2.85  2.56
##  5       5  5.45  4.18
##  6       6  4.55  3.85
##  7       7  2.49  1.88
##  8       8  5.21  3.79
##  9       9  2.33  2.11
## 10      10  6.26  5.08
## # ... with 90 more rows

#berdasarkan rmse
polynomEU %>% slice_min(rmse)

##   derajat     rmse      mae
## 1       9 1.998743 1.641463

#berdasarkan mae
polynomEU %>% slice_min(mae)

##   derajat     rmse      mae
## 1       9 1.998743 1.641463

#menghitung rata-rata untuk 10 folds
mean_polynomEU <- colMeans(polynomEU)
mean_polynomEU

##  derajat     rmse      mae 
## 5.500000 6.507944 4.678497

polinomialEU1 <- ggplot(Auto_EU, aes(x=horsepower, y=mpg)) + 
  geom_point(alpha=0.55, color="coral") +
  stat_smooth(method = "lm", formula = y~poly(x, 1, raw=T), lty = 1, col = "blue",se = F) +
  theme_bw()+
  ggtitle("Regresi polinomial d=1")

polinomialEU9 <- ggplot(Auto_EU, aes(x=horsepower, y=mpg)) + 
  geom_point(alpha=0.55, color="coral") +
  stat_smooth(method = "lm", formula = y~poly(x, 9, raw=T), lty = 1, col = "blue",se = F) +
  theme_bw()+
  ggtitle("Regresi polinomial d=9")

grid.arrange(polinomialEU1, polinomialEU9, ncol=2)

mod_polinomialEU1 = lm(mpg ~ poly(horsepower,1),data=Auto_EU)
summary(mod_polinomialEU1)

## 
## Call:
## lm(formula = mpg ~ poly(horsepower, 1), data = Auto_EU)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -10.4946  -2.8387  -0.4171   2.2148  12.8858 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          27.6029     0.5898  46.800  < 2e-16 ***
## poly(horsepower, 1) -36.6027     4.8637  -7.526 1.87e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.864 on 66 degrees of freedom
## Multiple R-squared:  0.4618, Adjusted R-squared:  0.4537 
## F-statistic: 56.64 on 1 and 66 DF,  p-value: 1.869e-10

b.2 Piecewice constant

set.seed(1)
cross_val <- vfold_cv(Auto_EU ,v=10,strata = "mpg")

## Warning: The number of observations in each quantile is below the recommended
## threshold of 20. Stratification will be done with 3 breaks instead.

breaks <- 2:8 #atau knotsnya 1-7
best_tanggaEU <- map_dfr(breaks, function(i){
metric_tanggaEU <- map_dfr(cross_val$splits, function(x){
      training <- Auto_EU[x$in_id,]
      training$horsepower <- cut(training$horsepower,i)
      mod <- lm(mpg ~ horsepower, data=training)
      labs_x <- levels(mod$model[,2])
      labs_x_breaks <- cbind(
        lower = as.numeric(sub("\\((.+),.*", "\\1", labs_x)),
        upper = as.numeric(sub("[^,]*,([^]]*)\\]", "\\1", labs_x)))
      testing <- Auto_EU[-x$in_id,]
      horsepower_new <- cut(testing$horsepower, c(labs_x_breaks[1,1], labs_x_breaks[,2]))
      pred <- predict(mod, newdata=list(horsepower=horsepower_new))
      truth <- testing$mpg
      data_eval <- na.omit(data.frame(truth,pred))
      rmse <- mlr3measures::rmse(truth = data_eval$truth, response = data_eval$pred)
      mae <- mlr3measures::mae(truth = data_eval$truth, response = data_eval$pred)
      metric <- c(rmse,mae)
      names(metric) <- c("rmse","mae")
      return(metric)
      }
      )
metric_tanggaEU
# menghitung rata-rata untuk 10 folds
mean_metric_tanggaEU <- colMeans(metric_tanggaEU)
mean_metric_tanggaEU
}
)

best_tanggaEU <- cbind(breaks=breaks,best_tanggaEU)
# menampilkan hasil all breaks
best_tanggaEU

##   breaks     rmse      mae
## 1      2 5.216476 4.164261
## 2      3 5.247791 4.313735
## 3      4 4.980629 4.177602
## 4      5 4.733983 3.847658
## 5      6 4.890634 3.975050
## 6      7 4.676543 3.812823
## 7      8 4.896469 4.024421

#berdasarkan rmse
best_tanggaEU %>% slice_min(rmse)

##   breaks     rmse      mae
## 1      7 4.676543 3.812823

#berdasarkan mae
best_tanggaEU %>% slice_min(mae)

##   breaks     rmse      mae
## 1      7 4.676543 3.812823

karena dari nilai RMSE dan MAE nilai yang terendah terdapat break yang terbaik yaitu 7 maka:

ggplot(Auto_EU ,aes(x=horsepower, y=mpg)) +
                 geom_point(alpha=0.55, color="coral") +
  stat_smooth(method = "lm",
               formula = y~cut(x,7), 
               lty = 1, col = "blue",se = F)+
  theme_bw()+
ggtitle("Piecewise constant Knots=6")

mod_tangga7 = lm(mpg ~ cut(horsepower,7),data=Auto_EU)
summary(mod_tangga7)

## 
## Call:
## lm(formula = mpg ~ cut(horsepower, 7), data = Auto_EU)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -11.9778  -3.2148  -0.1364   2.5255  12.5174 
## 
## Coefficients:
##                               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                     34.978      1.661  21.061  < 2e-16 ***
## cut(horsepower, 7)(58.4,70.9]   -4.841      2.239  -2.162  0.03456 *  
## cut(horsepower, 7)(70.9,83.3]   -5.995      1.959  -3.060  0.00328 ** 
## cut(horsepower, 7)(83.3,95.7]  -10.228      2.197  -4.655 1.79e-05 ***
## cut(horsepower, 7)(95.7,108]   -14.211      3.322  -4.278 6.75e-05 ***
## cut(horsepower, 7)(108,121]    -13.528      2.421  -5.588 5.69e-07 ***
## cut(horsepower, 7)(121,133]    -18.378      3.895  -4.718 1.43e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.982 on 61 degrees of freedom
## Multiple R-squared:  0.478,  Adjusted R-squared:  0.4267 
## F-statistic:  9.31 on 6 and 61 DF,  p-value: 3.066e-07

b.3 Natural cubic splines

set.seed(1)
cross_val <- vfold_cv(Auto_EU,v=10,strata = "mpg")

## Warning: The number of observations in each quantile is below the recommended
## threshold of 20. Stratification will be done with 3 breaks instead.

df <- 2:8 #knots 1-7

best_splineEU <- map_dfr(df, function(i){
  metric_splineEU <- map_dfr(cross_val$splits,
    function(x){
      mod <- lm(mpg ~ ns(horsepower,df=i), data=Auto_EU[x$in_id,])
      pred <- predict(mod, newdata=Auto_EU[-x$in_id,])
      truth <- Auto_EU[-x$in_id,]$mpg
      rmse <- mlr3measures::rmse(truth = truth, response = pred)
      mae <- mlr3measures::mae(truth = truth, response = pred)
      metric <- c(rmse,mae)
      names(metric) <- c("rmse","mae")
      return(metric)
      }
)
  metric_splineEU
}
)
best_splineEU <- cbind(df=df, best_splineEU)
tibble(best_splineEU)

## # A tibble: 70 x 3
##       df  rmse   mae
##    <int> <dbl> <dbl>
##  1     2  7.54  6.34
##  2     3  4.58  3.55
##  3     4  4.96  4.11
##  4     5  2.85  2.56
##  5     6  5.46  4.18
##  6     7  4.58  3.91
##  7     8  2.47  1.87
##  8     2  5.29  3.79
##  9     3  2.31  2.12
## 10     4  6.67  5.36
## # ... with 60 more rows

#berdasarkan rmse
best_splineEU %>% slice_min(rmse)

##   df     rmse      mae
## 1  7 2.160783 1.958299

#berdasarkan mae
best_splineEU %>% slice_min(mae)

##   df     rmse      mae
## 1  8 2.470629 1.865665

menghitung rata-rata untuk 10 folds

mean_best_splineEU <- colMeans(best_splineEU)
mean_best_splineEU

##       df     rmse      mae 
## 5.000000 4.831375 3.899562

dim(ns(Auto_EU$horsepower, knots = c(60, 65, 70, 75, 80, 85)))

## [1] 68  7

dim(ns(Auto_EU$horsepower, knots = c(60, 65, 70, 75, 80, 85, 90)))

## [1] 68  8

splineEU7 <- ggplot(Auto_EU,aes(x=horsepower, y=mpg)) +
                 geom_point(alpha=0.55, color="coral")+
    stat_smooth(method = "lm", 
               formula = y~ns(x, knots = c(60, 65, 70, 75, 80, 85)), 
               lty = 1,se=F)+
  theme_bw()+
ggtitle("NCS (df=7)")

splineEU8 <- ggplot(Auto_EU,aes(x=horsepower, y=mpg)) +
                 geom_point(alpha=0.55, color="coral")+
    stat_smooth(method = "lm", 
               formula = y~ns(x, knots = c(60, 65, 70, 75, 80, 85, 90)), 
               lty = 1,se=F)+
  theme_bw()+
ggtitle("NCS (df=8)")

grid.arrange(splineEU7, splineEU8, ncol=2)

mod_spline7 = lm(mpg ~ ns(horsepower, knots = c(60, 65, 70, 75, 80, 85)),data=Auto_EU)
summary(mod_spline7)

## 
## Call:
## lm(formula = mpg ~ ns(horsepower, knots = c(60, 65, 70, 75, 80, 
##     85)), data = Auto_EU)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -11.1859  -2.8266  -0.6488   2.0932  12.3173 
## 
## Coefficients:
##                                                    Estimate Std. Error t value
## (Intercept)                                          35.019      2.582  13.562
## ns(horsepower, knots = c(60, 65, 70, 75, 80, 85))1   -6.072      5.219  -1.164
## ns(horsepower, knots = c(60, 65, 70, 75, 80, 85))2   -4.586      4.540  -1.010
## ns(horsepower, knots = c(60, 65, 70, 75, 80, 85))3   -5.289      3.828  -1.382
## ns(horsepower, knots = c(60, 65, 70, 75, 80, 85))4   -7.422      3.640  -2.039
## ns(horsepower, knots = c(60, 65, 70, 75, 80, 85))5  -12.595      4.183  -3.011
## ns(horsepower, knots = c(60, 65, 70, 75, 80, 85))6  -15.858      8.277  -1.916
## ns(horsepower, knots = c(60, 65, 70, 75, 80, 85))7  -17.319      4.220  -4.104
##                                                    Pr(>|t|)    
## (Intercept)                                         < 2e-16 ***
## ns(horsepower, knots = c(60, 65, 70, 75, 80, 85))1 0.249230    
## ns(horsepower, knots = c(60, 65, 70, 75, 80, 85))2 0.316514    
## ns(horsepower, knots = c(60, 65, 70, 75, 80, 85))3 0.172190    
## ns(horsepower, knots = c(60, 65, 70, 75, 80, 85))4 0.045877 *  
## ns(horsepower, knots = c(60, 65, 70, 75, 80, 85))5 0.003807 ** 
## ns(horsepower, knots = c(60, 65, 70, 75, 80, 85))6 0.060164 .  
## ns(horsepower, knots = c(60, 65, 70, 75, 80, 85))7 0.000125 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.072 on 60 degrees of freedom
## Multiple R-squared:  0.4679, Adjusted R-squared:  0.4058 
## F-statistic: 7.537 on 7 and 60 DF,  p-value: 1.691e-06

b.4 Perbandingan ketiga model

nilai_rmse <- rbind(7.536994,
                    4.676543,
                    2.160783
                     )
nilai_mae <- rbind(6.339356,
                   3.812823,
                   1.958299
                     )
nama_model <- c("Regresi polinomial 'EU' (d=5)",
               "Piecewise constant 'EU' (breaks=8)",
               "Natural cubic spline 'EU' (knots=60, 105, 150, 195)"
)

tibble(hasil <- data.frame(nama_model, nilai_rmse, nilai_mae))

## # A tibble: 3 x 3
##   nama_model                                          nilai_rmse nilai_mae
##   <chr>                                                    <dbl>     <dbl>
## 1 Regresi polinomial 'EU' (d=5)                             7.54      6.34
## 2 Piecewise constant 'EU' (breaks=8)                        4.68      3.81
## 3 Natural cubic spline 'EU' (knots=60, 105, 150, 195)       2.16      1.96

polinomialEU1 <- ggplot(Auto_EU, aes(x=horsepower, y=mpg)) + 
  geom_point(alpha=0.55, color="coral") +
  stat_smooth(method = "lm", formula = y~poly(x, 1, raw=T), lty = 1, col = "blue",se = F) +
  theme_bw()+
ggtitle("Regpol 'EU' (d=1)")

PiecewiseEU7 <- ggplot(Auto_EU ,aes(x=horsepower, y=mpg)) +
                 geom_point(alpha=0.55, color="coral") +
  stat_smooth(method = "lm",
               formula = y~cut(x,7), 
               lty = 1, col = "blue",se = F)+
  theme_bw()+
ggtitle("FT 'EU' Knots=6")

splineEU7 <- ggplot(Auto_EU,aes(x=horsepower, y=mpg)) +
                 geom_point(alpha=0.55, color="coral")+
    stat_smooth(method = "lm", 
               formula = y~ns(x, knots = c(60, 65, 70, 75, 80, 85)), 
               lty = 1,se=F)+
  theme_bw()+
ggtitle("NCS'EU'  (df=7)")

grid.arrange(polinomialEU1, PiecewiseEU7, splineEU7, ncol=3)

Berdasarkan nilai RMSE dan MAE terkecil diperoleh bahwa model yang terbaik yaitu Piecewise Constant breaks 7.

c. Japan

Auto_J <- Auto[Auto$origin == 3, ] %>% select(mpg, horsepower, origin)
tibble(Auto_J)

## # A tibble: 79 x 3
##      mpg horsepower origin
##    <dbl>      <dbl>  <dbl>
##  1    24         95      3
##  2    27         88      3
##  3    27         88      3
##  4    25         95      3
##  5    31         65      3
##  6    35         69      3
##  7    24         95      3
##  8    19         97      3
##  9    28         92      3
## 10    23         97      3
## # ... with 69 more rows

Dari data Auto akan dimodelkan mpg vs horsepower yang berasal dari Jepang (Auto_J) saja atau sebanyak 79 observasi, dimana dua variabel ini jika digambarkan dalam bentuk scatterplot yaitu:

ggplot(Auto_J,aes(x=horsepower, y=mpg)) +
                 geom_point(alpha=0.55, color="coral") + 
                 theme_bw()+
ggtitle("mpg vs horsepower, Jepang")

c.1 Regresi polinomial

set.seed(1)
deltas <- numeric(10)
model <- list()

# metode polinomial
for (i in 1:10) {
  fit <- glm(mpg ~ poly(horsepower, i), data = Auto_J)
  deltas[i] <- cv.glm(Auto_J, fit, K = 10)$delta[1]
  model[[i]] <- fit
}

plot(1:10, deltas, xlab = "Degree", ylab = "Test MSE", type = "l")
d.min <- which.min(deltas)
d.min

## [1] 5

model_best <- model[[d.min]]
summary(model_best)

## 
## Call:
## glm(formula = mpg ~ poly(horsepower, i), data = Auto_J)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -8.3880  -2.6375  -0.3287   1.8352  11.0450  
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)            30.451      0.450  67.667  < 2e-16 ***
## poly(horsepower, i)1  -36.203      4.000  -9.051 1.49e-13 ***
## poly(horsepower, i)2    9.197      4.000   2.299   0.0244 *  
## poly(horsepower, i)3   16.699      4.000   4.175 8.14e-05 ***
## poly(horsepower, i)4   -1.574      4.000  -0.394   0.6951    
## poly(horsepower, i)5    6.963      4.000   1.741   0.0859 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 15.99807)
## 
##     Null deviance: 2892.9  on 78  degrees of freedom
## Residual deviance: 1167.9  on 73  degrees of freedom
## AIC: 450.98
## 
## Number of Fisher Scoring iterations: 2

points(which.min(deltas), deltas[which.min(deltas)], col = "red", cex = 2, pch = 20)

set.seed(1)

cross_val <- vfold_cv(Auto_J, v=10, strata = "mpg")

## Warning: The number of observations in each quantile is below the recommended
## threshold of 20. Stratification will be done with 3 breaks instead.

derajat <- 1:10

polynomJ <- map_dfr(derajat, function(i){
  metric_polynomJ <- map_dfr(cross_val$splits, function(x){
    mod <- lm(mpg ~ poly(horsepower, derajat=i, raw = T), data=Auto_J[x$in_id,])
    pred <- predict(mod, newdata=Auto_J[-x$in_id,])
    truth <- Auto_J[-x$in_id,]$mpg
    rmse <- mlr3measures::rmse(truth = truth, response = pred)
    mae <- mlr3measures::mae(truth = truth, response = pred)
    metric <- c(rmse, mae)
    names(metric) <- c("rmse","mae")
    return(metric)
    }
  )
  metric_polynomJ
  }
  )

## Warning in predict.lm(mod, newdata = Auto_J[-x$in_id, ]): prediction from a
## rank-deficient fit may be misleading

## Warning in predict.lm(mod, newdata = Auto_J[-x$in_id, ]): prediction from a
## rank-deficient fit may be misleading

## Warning in predict.lm(mod, newdata = Auto_J[-x$in_id, ]): prediction from a
## rank-deficient fit may be misleading

## Warning in predict.lm(mod, newdata = Auto_J[-x$in_id, ]): prediction from a
## rank-deficient fit may be misleading

## Warning in predict.lm(mod, newdata = Auto_J[-x$in_id, ]): prediction from a
## rank-deficient fit may be misleading

## Warning in predict.lm(mod, newdata = Auto_J[-x$in_id, ]): prediction from a
## rank-deficient fit may be misleading

## Warning in predict.lm(mod, newdata = Auto_J[-x$in_id, ]): prediction from a
## rank-deficient fit may be misleading

## Warning in predict.lm(mod, newdata = Auto_J[-x$in_id, ]): prediction from a
## rank-deficient fit may be misleading

## Warning in predict.lm(mod, newdata = Auto_J[-x$in_id, ]): prediction from a
## rank-deficient fit may be misleading

## Warning in predict.lm(mod, newdata = Auto_J[-x$in_id, ]): prediction from a
## rank-deficient fit may be misleading

polynomJ <- cbind(derajat = derajat, polynomJ)
tibble(polynomJ)

## # A tibble: 100 x 3
##    derajat  rmse   mae
##      <int> <dbl> <dbl>
##  1       1  4.26  3.93
##  2       2  5.48  4.61
##  3       3  3.60  3.03
##  4       4  3.11  2.71
##  5       5  3.26  2.38
##  6       6  2.73  2.11
##  7       7  5.15  3.14
##  8       8  7.72  5.56
##  9       9  4.99  4.01
## 10      10  5.90  4.70
## # ... with 90 more rows

#berdasarkan rmse
polynomJ %>% slice_min(rmse)

##   derajat     rmse      mae
## 1       4 2.427555 2.023453

#berdasarkan mae
polynomJ %>% slice_min(mae)

##   derajat     rmse      mae
## 1       4 2.435324 1.828376

#menghitung rata-rata untuk 10 folds
mean_polynomJ <- colMeans(polynomJ)
mean_polynomJ

##  derajat     rmse      mae 
## 5.500000 9.652879 5.487174

polinomialJ4 <- ggplot(Auto_J, aes(x=horsepower, y=mpg)) + 
  geom_point(alpha=0.55, color="coral") +
  stat_smooth(method = "lm", formula = y~poly(x, 4), lty = 1, col = "blue",se = F) +
  theme_bw()+
  ggtitle("Regresi polinomial d=4")

polinomialJ5 <- ggplot(Auto_J, aes(x=horsepower, y=mpg)) + 
  geom_point(alpha=0.55, color="coral") +
  stat_smooth(method = "lm", formula = y~poly(x, 5), lty = 1, col = "blue",se = F) +
  theme_bw()+
  ggtitle("Regresi polinomial d=5")

grid.arrange(polinomialJ4, polinomialJ5, ncol=2)

mod_polinomialJ4 = lm(mpg ~ poly(horsepower,4),data=Auto_J)
summary(mod_polinomialJ4)

## 
## Call:
## lm(formula = mpg ~ poly(horsepower, 4), data = Auto_J)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -8.6872 -2.5927 -0.4482  2.1121 11.5842 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           30.4506     0.4561  66.757  < 2e-16 ***
## poly(horsepower, 4)1 -36.2030     4.0543  -8.930 2.26e-13 ***
## poly(horsepower, 4)2   9.1966     4.0543   2.268   0.0262 *  
## poly(horsepower, 4)3  16.6991     4.0543   4.119 9.81e-05 ***
## poly(horsepower, 4)4  -1.5741     4.0543  -0.388   0.6989    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.054 on 74 degrees of freedom
## Multiple R-squared:  0.5795, Adjusted R-squared:  0.5568 
## F-statistic:  25.5 on 4 and 74 DF,  p-value: 2.684e-13

c.2 Piecewice constant

set.seed(1)
cross_val <- vfold_cv(Auto_J ,v=10,strata = "mpg")

## Warning: The number of observations in each quantile is below the recommended
## threshold of 20. Stratification will be done with 3 breaks instead.

breaks <- 2:5 #atau knotsnya 1-4
best_tanggaJ <- map_dfr(breaks, function(i){
metric_tanggaJ <- map_dfr(cross_val$splits, function(x){
      training <- Auto_J[x$in_id,]
      training$horsepower <- cut(training$horsepower,i)
      mod <- lm(mpg ~ horsepower, data=training)
      labs_x <- levels(mod$model[,2])
      labs_x_breaks <- cbind(
        lower = as.numeric(sub("\\((.+),.*", "\\1", labs_x)),
        upper = as.numeric(sub("[^,]*,([^]]*)\\]", "\\1", labs_x)))
      testing <- Auto_J[-x$in_id,]
      horsepower_new <- cut(testing$horsepower, c(labs_x_breaks[1,1], labs_x_breaks[,2]))
      pred <- predict(mod, newdata=list(horsepower=horsepower_new))
      truth <- testing$mpg
      data_eval <- na.omit(data.frame(truth,pred))
      rmse <- mlr3measures::rmse(truth = data_eval$truth, response = data_eval$pred)
      mae <- mlr3measures::mae(truth = data_eval$truth, response = data_eval$pred)
      metric <- c(rmse,mae)
      names(metric) <- c("rmse","mae")
      return(metric)
      }
      )
metric_tanggaJ
# menghitung rata-rata untuk 10 folds
mean_metric_tanggaJ <- colMeans(metric_tanggaJ)
mean_metric_tanggaJ
}
)

best_tanggaJ <- cbind(breaks=breaks,best_tanggaJ)
# menampilkan hasil all breaks
best_tanggaJ

##   breaks     rmse      mae
## 1      2 4.367104 3.284685
## 2      3 4.119029 3.335941
## 3      4 4.197867 3.273378
## 4      5 4.203681 3.339308

#berdasarkan rmse
best_tanggaJ %>% slice_min(rmse)

##   breaks     rmse      mae
## 1      3 4.119029 3.335941

#berdasarkan mae
best_tanggaJ %>% slice_min(mae)

##   breaks     rmse      mae
## 1      4 4.197867 3.273378

karena dari nilai RMSE dan MAE nilai yang terecil terdapat break yang terbaik yaitu 3 dan 4 maka:

PiecewiseJ3 <- ggplot(Auto_J ,aes(x=horsepower, y=mpg)) +
                 geom_point(alpha=0.55, color="coral") +
  stat_smooth(method = "lm",
               formula = y~cut(x,3), 
               lty = 1, col = "blue",se = F)+
  theme_bw()+
ggtitle("Piecewise constant Knots=2")

PiecewiseJ4 <- ggplot(Auto_J ,aes(x=horsepower, y=mpg)) +
                 geom_point(alpha=0.55, color="coral") +
  stat_smooth(method = "lm",
               formula = y~cut(x,4), 
               lty = 1, col = "blue",se = F)+
  theme_bw()+
ggtitle("Piecewise constant Knots=3")

grid.arrange(PiecewiseJ3, PiecewiseJ4, ncol=2)

mod_tangga4 = lm(mpg ~ cut(horsepower,4),data=Auto_J)
summary(mod_tangga4)

## 
## Call:
## lm(formula = mpg ~ cut(horsepower, 4), data = Auto_J)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -11.444  -2.644  -0.590   2.579  11.803 
## 
## Coefficients:
##                             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                  34.7973     0.6901  50.425  < 2e-16 ***
## cut(horsepower, 4)(72,92]    -5.3529     1.2063  -4.438 3.07e-05 ***
## cut(horsepower, 4)(92,112]  -10.5073     1.1650  -9.019 1.37e-13 ***
## cut(horsepower, 4)(112,132]  -9.2223     2.2093  -4.174 7.97e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.198 on 75 degrees of freedom
## Multiple R-squared:  0.5432, Adjusted R-squared:  0.5249 
## F-statistic: 29.73 on 3 and 75 DF,  p-value: 9.028e-13

c.3 Natural cubic splines

set.seed(1)
cross_val <- vfold_cv(Auto_J,v=10,strata = "mpg")

## Warning: The number of observations in each quantile is below the recommended
## threshold of 20. Stratification will be done with 3 breaks instead.

df <- 2:8 # knots 1-7

best_splineJ <- map_dfr(df, function(i){
  metric_splineJ <- map_dfr(cross_val$splits,
    function(x){
      mod <- lm(mpg ~ ns(horsepower,df=i), data=Auto_J[x$in_id,])
      pred <- predict(mod, newdata=Auto_J[-x$in_id,])
      truth <- Auto_J[-x$in_id,]$mpg
      rmse <- mlr3measures::rmse(truth = truth, response = pred)
      mae <- mlr3measures::mae(truth = truth, response = pred)
      metric <- c(rmse,mae)
      names(metric) <- c("rmse","mae")
      return(metric)
      }
)
  metric_splineJ
}
)
best_splineJ <- cbind(df=df, best_splineJ)
tibble(best_splineJ)

## # A tibble: 70 x 3
##       df  rmse   mae
##    <int> <dbl> <dbl>
##  1     2  4.08  3.67
##  2     3  5.49  4.58
##  3     4  3.96  3.07
##  4     5  3.17  2.69
##  5     6  3.25  2.39
##  6     7  3.77  3.19
##  7     8  5.08  3.03
##  8     2  8.25  5.88
##  9     3  4.94  3.95
## 10     4  5.95  4.85
## # ... with 60 more rows

#berdasarkan rmse
best_splineJ %>% slice_min(rmse)

##   df     rmse      mae
## 1  8 2.440677 2.025495

#berdasarkan mae
best_splineJ %>% slice_min(mae)

##   df     rmse     mae
## 1  6 2.534937 1.84849

menghitung rata-rata untuk 10 folds

mean_best_splineJ <- colMeans(best_splineJ)
mean_best_splineJ

##       df     rmse      mae 
## 5.000000 4.350359 3.311150

dim(ns(Auto_J$horsepower, knots = c(60, 70, 80, 90, 100)))

## [1] 79  6

dim(ns(Auto_J$horsepower, knots = c(60, 65, 70, 75, 80, 85, 90)))

## [1] 79  8

splineJ6 <- ggplot(Auto_J,aes(x=horsepower, y=mpg)) +
                 geom_point(alpha=0.55, color="coral")+
    stat_smooth(method = "lm", 
               formula = y~ns(x, knots = c(60, 70, 80, 90, 100)), 
               lty = 1,se=F)+
  theme_bw()+
ggtitle("NCS (df=6)")

splineJ8 <- ggplot(Auto_J,aes(x=horsepower, y=mpg)) +
                 geom_point(alpha=0.55, color="coral")+
    stat_smooth(method = "lm", 
               formula = y~ns(x, knots = c(60, 65, 70, 75, 80, 85, 90)), 
               lty = 1,se=F)+
  theme_bw()+
ggtitle("NCS (df=8)")

grid.arrange(splineJ6, splineJ8, ncol=2)

mod_spline6 = lm(mpg ~ ns(horsepower, knots = c(60, 70, 80, 90, 100)),data=Auto_J)
summary(mod_spline6)

## 
## Call:
## lm(formula = mpg ~ ns(horsepower, knots = c(60, 70, 80, 90, 100)), 
##     data = Auto_J)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -9.3101 -2.5469 -0.3148  1.5988 10.7935 
## 
## Coefficients:
##                                                 Estimate Std. Error t value
## (Intercept)                                       32.295      2.170  14.885
## ns(horsepower, knots = c(60, 70, 80, 90, 100))1    1.311      3.063   0.428
## ns(horsepower, knots = c(60, 70, 80, 90, 100))2   -4.295      4.826  -0.890
## ns(horsepower, knots = c(60, 70, 80, 90, 100))3   -3.902      2.849  -1.369
## ns(horsepower, knots = c(60, 70, 80, 90, 100))4  -16.676      3.668  -4.546
## ns(horsepower, knots = c(60, 70, 80, 90, 100))5   -2.550      5.587  -0.456
## ns(horsepower, knots = c(60, 70, 80, 90, 100))6   -5.274      3.772  -1.398
##                                                 Pr(>|t|)    
## (Intercept)                                      < 2e-16 ***
## ns(horsepower, knots = c(60, 70, 80, 90, 100))1    0.670    
## ns(horsepower, knots = c(60, 70, 80, 90, 100))2    0.376    
## ns(horsepower, knots = c(60, 70, 80, 90, 100))3    0.175    
## ns(horsepower, knots = c(60, 70, 80, 90, 100))4 2.16e-05 ***
## ns(horsepower, knots = c(60, 70, 80, 90, 100))5    0.650    
## ns(horsepower, knots = c(60, 70, 80, 90, 100))6    0.166    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.064 on 72 degrees of freedom
## Multiple R-squared:  0.589,  Adjusted R-squared:  0.5548 
## F-statistic:  17.2 on 6 and 72 DF,  p-value: 3.165e-12

c.4 Perbandingan ketiga model

nilai_rmse <- rbind(2.427555,
                    4.197867,
                    2.534937
                     )
nilai_mae <- rbind(2.023453,
                   3.273378,
                   1.84849
                     )
nama_model <- c("Regresi polinomial 'J' (d=5)",
               "Piecewise constant 'J' (breaks=4)",
               "Natural cubic spline 'J' (knots=60, 70, 80, 90, 100)"
)

tibble(hasil <- data.frame(nama_model, nilai_rmse, nilai_mae))

## # A tibble: 3 x 3
##   nama_model                                           nilai_rmse nilai_mae
##   <chr>                                                     <dbl>     <dbl>
## 1 Regresi polinomial 'J' (d=5)                               2.43      2.02
## 2 Piecewise constant 'J' (breaks=4)                          4.20      3.27
## 3 Natural cubic spline 'J' (knots=60, 70, 80, 90, 100)       2.53      1.85

polinomialJ4 <- ggplot(Auto_J, aes(x=horsepower, y=mpg)) + 
  geom_point(alpha=0.55, color="coral") +
  stat_smooth(method = "lm", formula = y~poly(x, 4), lty = 1, col = "blue",se = F) +
  theme_bw()+
ggtitle("Regpol 'J' (d=4)")

PiecewiseJ4 <- ggplot(Auto_J ,aes(x=horsepower, y=mpg)) +
                 geom_point(alpha=0.55, color="coral") +
  stat_smooth(method = "lm",
               formula = y~cut(x,4), 
               lty = 1, col = "blue",se = F)+
  theme_bw()+
ggtitle("FT 'J' Knots=3")

splineJ7 <- ggplot(Auto_J,aes(x=horsepower, y=mpg)) +
                 geom_point(alpha=0.55, color="coral")+
    stat_smooth(method = "lm", 
               formula = y~ns(x, knots = c(60, 70, 80, 90, 100)), 
               lty = 1,se=F)+
  theme_bw()+
ggtitle("NCS 'J' (df=7)")

grid.arrange(polinomialJ4, PiecewiseJ4, splineJ7, ncol=3)

Berdasarkan nilai RMSE dan MAE terkecil model terbaik yang diperoleh adalah Naturral Cubic Spline tetapi jika dilihat dari kurva yang tidak mulus maka diambil nilai RMSE dan MAE terkecil kedua yaitu Piecewise Constant dengan knots= 3.

Referensi

Sartono, B. (November 1, 2020). Moving Beyond Linearity. Retrieved from https://rpubs.com/bagusco/taklinear

Dito, G.A. (April 14, 2021). Regresi Polinomial, Regresi Fungsi Tangga dan Regresi Spline di R. Retrieved from https://gerrydito.github.io/Regresi-Polinomial,-Regresi-Fungsi-Tangga-dan-Regresi-Spline/

TERIMAKASIH

MOVING BEYOND LINEARITY

TUGAS 8 STA581

Khusnia Nurul Khikmah

10/24/2021

Data

Soal No 1

a. Regresi polinomial

b. Piecewise constant

c. Natural cubic splines

d. Perbandingan ketiga model

Soal No 2

a. Amerika

a.1 Regresi polinomial

a.2 Piecewice constant

a.3 Natural cubic splines

a.4 Perbandingan ketiga model

b. European

b.1 Regresi polinomial

b.2 Piecewice constant

b.3 Natural cubic splines

b.4 Perbandingan ketiga model

c. Japan

c.1 Regresi polinomial

c.2 Piecewice constant

c.3 Natural cubic splines

c.4 Perbandingan ketiga model

Referensi