#Library

#install.packages("MultiKink") # DATA TRICEPS
library(ISLR)
library(dplyr)
library(MultiKink)
library(tidyverse)
library(ggplot2)
library(dplyr)
library(purrr)
library(rsample)
library(splines)
library(MatrixModels)

Data yang digunakan berasal dari library ISLR yaitu Data set Auto

Data terdiri dari 3 Kolom yaitu mpg, horsepower, dan origin.

Berikut adalah penjelasannya pada masing-masing kolom:

- mpg : miles per gallon

- hoursepower : Engine horsepower

- origin : Origin of car (1. American, 2. European, 3. Japanese)

Input Data

data <- Auto %>% select(mpg, horsepower, origin)

tibble(data)
## # A tibble: 392 × 3
##      mpg horsepower origin
##    <dbl>      <dbl>  <dbl>
##  1    18        130      1
##  2    15        165      1
##  3    18        150      1
##  4    16        150      1
##  5    17        140      1
##  6    15        198      1
##  7    14        220      1
##  8    14        215      1
##  9    14        225      1
## 10    15        190      1
## # … with 382 more rows

PERTEMUAN 8

Regresi Linier

mod_linear <- lm(data$horsepower ~ data$mpg)
summary(mod_linear)
## 
## Call:
## lm(formula = data$horsepower ~ data$mpg)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -64.892 -15.716  -2.094  13.108  96.947 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 194.4756     3.8732   50.21   <2e-16 ***
## data$mpg     -3.8389     0.1568  -24.49   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 24.19 on 390 degrees of freedom
## Multiple R-squared:  0.6059, Adjusted R-squared:  0.6049 
## F-statistic: 599.7 on 1 and 390 DF,  p-value: < 2.2e-16

R-square didapatkan cukup besar yaitu sebesar 60,59 %

ggplot(data,aes(x=mpg, y=horsepower)) +
                 geom_point(alpha=0.55, color="black") + 
                 theme_bw() 

ggplot(data,aes(x=mpg, y=horsepower)) +
                 geom_point(alpha=0.55, color="black") +
   stat_smooth(method = "lm", 
               formula = y~x,lty = 1,
               col = "blue",se = F)+
  theme_bw()

Regresi Polinomial Derajat 2 (ordo 2)

mod_polinomial2 = lm(horsepower ~ poly(mpg,2,raw = T), #2 = ordo,raw ga ngaruh T/F
                     data=data)
summary(mod_polinomial2)
## 
## Call:
## lm(formula = horsepower ~ poly(mpg, 2, raw = T), data = data)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -72.52 -11.24  -0.11   9.47  92.98 
## 
## Coefficients:
##                         Estimate Std. Error t value Pr(>|t|)    
## (Intercept)            302.06824    9.25689   32.63   <2e-16 ***
## poly(mpg, 2, raw = T)1 -13.32406    0.77456  -17.20   <2e-16 ***
## poly(mpg, 2, raw = T)2   0.18804    0.01513   12.43   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 20.49 on 389 degrees of freedom
## Multiple R-squared:  0.718,  Adjusted R-squared:  0.7165 
## F-statistic: 495.1 on 2 and 389 DF,  p-value: < 2.2e-16
ggplot(data,aes(x=mpg, y=horsepower)) + 
  geom_point(alpha=0.55, color="black") + stat_smooth(method = "lm", formula =y~poly(x,2,raw=T),lty = 1, col = "blue",se = F)+theme_bw()

ggplot(data,aes(x=mpg, y=horsepower)) + 
  geom_point(alpha=0.55, color="black") +
  stat_smooth(method = "lm", 
              formula = y~poly(x,2,raw=T), 
              lty = 1, col = "blue",se = T)+
  theme_bw()

Regresi Polinomial Derajat 3 (ordo 3)

mod_polinomial3 = lm(horsepower ~ poly(mpg,3,raw = T),data=data) #3 = ordo (x^1,x^2,x^3)
summary(mod_polinomial3)
## 
## Call:
## lm(formula = horsepower ~ poly(mpg, 3, raw = T), data = data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -71.935 -11.251  -1.338   9.324  95.459 
## 
## Coefficients:
##                          Estimate Std. Error t value Pr(>|t|)    
## (Intercept)            429.581461  23.823018  18.032  < 2e-16 ***
## poly(mpg, 3, raw = T)1 -30.131244   3.006538 -10.022  < 2e-16 ***
## poly(mpg, 3, raw = T)2   0.866793   0.118533   7.313 1.51e-12 ***
## poly(mpg, 3, raw = T)3  -0.008506   0.001474  -5.770 1.62e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 19.69 on 388 degrees of freedom
## Multiple R-squared:  0.7403, Adjusted R-squared:  0.7382 
## F-statistic: 368.6 on 3 and 388 DF,  p-value: < 2.2e-16
ringkasan_linear <- summary(mod_linear)
ringkasan_linear$r.squared
## [1] 0.6059483
AIC(mod_linear)
## [1] 3614.324
AIC(mod_polinomial2)
## [1] 3485.217
AIC(mod_polinomial3)
## [1] 3454.949
ggplot(data,aes(x=mpg, y=horsepower)) + 
  geom_point(alpha=0.55, color="black") +
  stat_smooth(method = "lm", 
              formula = y~poly(x,3,raw=T), 
              lty = 1, col = "blue",se = T)+
  theme_bw() #visualisasi ordo 3 lebih baik

AIC(mod_linear)
## [1] 3614.324
AIC(mod_polinomial2)
## [1] 3485.217
AIC(mod_polinomial3)
## [1] 3454.949
AIC <- cbind(AIC(mod_linear),AIC(mod_polinomial2),AIC(mod_polinomial3))
# Regresi Fungsi Tangga (5)------------
mod_tangga5 = lm(horsepower ~ cut(mpg,5),data=data) #cut=break brrrti ada 5x5 cut nya (tangga)
summary(mod_tangga5)
## 
## Call:
## lm(formula = horsepower ~ cut(mpg, 5), data = data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -86.242 -11.378  -3.000   8.832  71.758 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)             158.242      2.260   70.03   <2e-16 ***
## cut(mpg, 5)(16.5,24]    -55.242      2.942  -18.78   <2e-16 ***
## cut(mpg, 5)(24,31.6]    -77.073      3.116  -24.74   <2e-16 ***
## cut(mpg, 5)(31.6,39.1]  -85.971      3.603  -23.86   <2e-16 ***
## cut(mpg, 5)(39.1,46.6]  -98.542      7.182  -13.72   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 21.56 on 387 degrees of freedom
## Multiple R-squared:  0.6896, Adjusted R-squared:  0.6863 
## F-statistic: 214.9 on 4 and 387 DF,  p-value: < 2.2e-16
ggplot(data,aes(x=mpg, y=horsepower)) +
  geom_point(alpha=0.55, color="black") +
  stat_smooth(method = "lm", 
              formula = y~cut(x,5), 
              lty = 1, col = "blue",se = F)+
  theme_bw()

Regresi Fungsi Tangga (7)———

mod_tangga7 = lm(horsepower ~ cut(mpg,7),data=data)
summary(mod_tangga7)
## 
## Call:
## lm(formula = horsepower ~ cut(mpg, 7), data = data)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -52.48 -12.96  -2.25  10.31 105.52 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)             169.692      2.960   57.32   <2e-16 ***
## cut(mpg, 7)(14.4,19.7]  -45.208      3.669  -12.32   <2e-16 ***
## cut(mpg, 7)(19.7,25.1]  -74.908      3.734  -20.06   <2e-16 ***
## cut(mpg, 7)(25.1,30.5]  -88.317      3.885  -22.73   <2e-16 ***
## cut(mpg, 7)(30.5,35.9]  -96.865      4.186  -23.14   <2e-16 ***
## cut(mpg, 7)(35.9,41.2] -100.442      5.268  -19.07   <2e-16 ***
## cut(mpg, 7)(41.2,46.6] -111.978      8.594  -13.03   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 21.35 on 385 degrees of freedom
## Multiple R-squared:  0.6972, Adjusted R-squared:  0.6924 
## F-statistic: 147.7 on 6 and 385 DF,  p-value: < 2.2e-16
ggplot(data,aes(x=mpg, y=horsepower)) +
  geom_point(alpha=0.55, color="black") +
  stat_smooth(method = "lm", 
              formula = y~cut(x,7), 
              lty = 1, col = "blue",se = F)+
  theme_bw()

Perbandingan Model———–

MSE = function(pred,actual){
  mean((pred-actual)^2)
}
#Membandingkan model#
nilai_MSE <- rbind(MSE(predict(mod_linear),data$horsepower),
                   MSE(predict(mod_polinomial2),data$horsepower),
                   MSE(predict(mod_polinomial3),data$horsepower),
                   MSE(predict(mod_tangga5),data$horsepower),
                   MSE(predict(mod_tangga7),data$horsepower))
nama_model <- c("Linear","Poly (ordo=2)", "Poly (ordo=3)",
                "Tangga (breaks=5)", "Tangga (breaks=7)")
data.frame(nama_model,nilai_MSE)
##          nama_model nilai_MSE
## 1            Linear  582.3257
## 2     Poly (ordo=2)  416.7871
## 3     Poly (ordo=3)  383.8523
## 4 Tangga (breaks=5)  458.7770
## 5 Tangga (breaks=7)  447.5318
#mse terkecil berada pada tangga (breaks=7) maka itu adalah model terbaik

Evaluasi Model Menggunakan Cross Validation = u/ lebih meyakinkan lg di data training & data testing

Regresi Linear———–

set.seed(123)
cross_val <- vfold_cv(data,v=10,strata = "horsepower") #10 = folds
metric_linear <- map_dfr(cross_val$splits, #
                         function(x){
                           mod <- lm(horsepower ~ mpg,data=data[x$in_id,])
                           pred <- predict(mod,newdata=data[-x$in_id,])
                           truth <- data[-x$in_id,]$horsepower
                           rmse <- mlr3measures::rmse(truth = truth,response = pred)
                           mae <- mlr3measures::mae(truth = truth,response = pred)
                           metric <- c(rmse,mae)
                           names(metric) <- c("rmse","mae")
                           return(metric)
                         }
)
metric_linear
## # A tibble: 10 × 2
##     rmse   mae
##    <dbl> <dbl>
##  1  25.3  20.3
##  2  29.3  22.2
##  3  22.4  16.9
##  4  26.8  19.9
##  5  23.4  18.9
##  6  24.9  20.6
##  7  18.1  15.0
##  8  20.3  13.6
##  9  22.6  18.2
## 10  26.9  18.3

menghitung rata-rata untuk 10 folds - kenpaa 10? bebas berapa aja, yg cocok untuk lipatan

mean_metric_linear <- colMeans(metric_linear)
mean_metric_linear
##     rmse      mae 
## 24.00403 18.39118

Polynomial Derajat 2 (ordo 2)

set.seed(123)
cross_val <- vfold_cv(data,v=10,strata = "horsepower")
metric_poly2 <- map_dfr(cross_val$splits, 
                        function(x){
                          mod <- lm(horsepower ~ poly(mpg,2,raw = T),data=data[x$in_id,])
                          pred <- predict(mod,newdata=data[-x$in_id,])
                          truth <- data[-x$in_id,]$horsepower
                          rmse <- mlr3measures::rmse(truth = truth,response = pred)
                          mae <- mlr3measures::mae(truth = truth,response = pred)
                          metric <- c(rmse,mae)
                          names(metric) <- c("rmse","mae")
                          return(metric)
                        }
)
metric_poly2
## # A tibble: 10 × 2
##     rmse   mae
##    <dbl> <dbl>
##  1  23.1  18.9
##  2  25.1  17.2
##  3  18.4  14.3
##  4  22.1  15.1
##  5  21.5  15.6
##  6  19.7  14.8
##  7  15.6  12.2
##  8  16.5  12.3
##  9  18.9  14.6
## 10  22.5  13.8
# menghitung rata-rata untuk 10 folds
mean_metric_poly2 <- colMeans(metric_poly2)
mean_metric_poly2
##     rmse      mae 
## 20.35171 14.88587
## Polynomial Derajat 3 (ordo 3)
set.seed(123)
cross_val <- vfold_cv(data,v=10,strata = "horsepower")

metric_poly3 <- map_dfr(cross_val$splits,
                        function(x){
                          mod <- lm(horsepower ~ poly(mpg,3,raw = T),data=data[x$in_id,])
                          pred <- predict(mod,newdata=data[-x$in_id,])
                          truth <- data[-x$in_id,]$horsepower
                          rmse <- mlr3measures::rmse(truth = truth,response = pred)
                          mae <- mlr3measures::mae(truth = truth,response = pred)
                          metric <- c(rmse,mae)
                          names(metric) <- c("rmse","mae")
                          return(metric)
                        }
)
metric_poly3
## # A tibble: 10 × 2
##     rmse   mae
##    <dbl> <dbl>
##  1  20.9  17.0
##  2  24.1  15.8
##  3  17.2  13.2
##  4  22.5  15.2
##  5  21.3  15.6
##  6  18.6  14.7
##  7  15.8  12.0
##  8  16.2  11.9
##  9  17.4  14.2
## 10  22.0  13.5
# menghitung rata-rata untuk 10 folds
mean_metric_poly3 <- colMeans(metric_poly3)
mean_metric_poly3
##     rmse      mae 
## 19.59566 14.30442
# Fungsi Tangga ---------
set.seed(123)
cross_val <- vfold_cv(data,v=10,strata = "horsepower")
breaks <- 3:10
best_tangga <- map_dfr(breaks, function(i){
    metric_tangga <- map_dfr(cross_val$splits,
    function(x){
        training <- data[x$in_id,]
        training$mpg <- cut(training$mpg,i)
        mod <- lm(horsepower ~ mpg,data=training)
        labs_x <- levels(mod$model[,2])
        labs_x_breaks <- cbind(lower = as.numeric( sub("\\((.+),.*", "\\1", labs_x) ),
                  upper = as.numeric( sub("[^,]*,([^]]*)\\]", "\\1", labs_x) ))
        testing <- data[-x$in_id,]
        mpg_new <- cut(testing$mpg,c(labs_x_breaks[1,1],labs_x_breaks[,2]))
        pred <- predict(mod,newdata=list(mpg=mpg_new))
        truth <- testing$horsepower
        data_eval <- na.omit(data.frame(truth,pred))
        rmse <- mlr3measures::rmse(truth = data_eval$truth,response = data_eval$pred)
        mae <- mlr3measures::mae(truth = data_eval$truth,response = data_eval$pred)
        metric <- c(rmse,mae)
        names(metric) <- c("rmse","mae")
        return(metric)
      }
    )
  metric_tangga
  # menghitung rata-rata untuk 10 folds
  mean_metric_tangga <- colMeans(metric_tangga)
  mean_metric_tangga
  }
)

best_tangga <- cbind(breaks=breaks,best_tangga)
# menampilkan hasil all breaks
best_tangga
##   breaks     rmse      mae
## 1      3 27.50270 20.98377
## 2      4 23.86719 16.45644
## 3      5 21.39386 15.39006
## 4      6 21.95923 15.93805
## 5      7 21.33342 15.95446
## 6      8 22.02001 15.73704
## 7      9 20.49678 14.26506
## 8     10 20.01487 14.22925
#berdasarkan rmse
best_tangga %>% slice_min(rmse)
##   breaks     rmse      mae
## 1     10 20.01487 14.22925
#berdasarkan mae
best_tangga %>% slice_min(mae)
##   breaks     rmse      mae
## 1     10 20.01487 14.22925
## Perbandingan Model
nilai_metric <- rbind(mean_metric_linear,
                      mean_metric_poly2,
                      mean_metric_poly3,
                      best_tangga %>% select(-1) %>% slice_min(mae))

nama_model <- c("Linear","Poly2","Poly3","Tangga (breaks=9)")
data.frame(nama_model,nilai_metric)
##          nama_model     rmse      mae
## 1            Linear 24.00403 18.39118
## 2             Poly2 20.35171 14.88587
## 3             Poly3 19.59566 14.30442
## 4 Tangga (breaks=9) 20.01487 14.22925

PERTEMUAN 9

Jika kita gambarkan dalam bentuk scatterplot

ggplot(data,aes(x=mpg, y=horsepower)) +
                 geom_point(alpha=0.55, color="black") + 
                 theme_bw() 

Berdasarkan pola hubungan yang terlihat pada scatterplot, kita akan mencoba untuk mencari model yang bisa merepresentasikan pola hubungan tersebut dengan baik.

Regresi Spline

#Menentukan banyaknya fungsi basis dan knots
dim(bs(data$mpg, knots = c(10, 20, 40)))
## [1] 392   6
#atau
dim(bs(data$mpg, df=6))
## [1] 392   6
#nilai knots yang ditentukan oleh komputer
attr(bs(data$mpg, df=6),"knots")
##   25%   50%   75% 
## 17.00 22.75 29.00

b-spline Menggunakan knot yang ditentukan manual

knot_value_manual_3 = c(10, 20,40)
mod_spline_1 = lm(horsepower ~ bs(mpg, knots =knot_value_manual_3 ),data=data)
summary(mod_spline_1)
## 
## Call:
## lm(formula = horsepower ~ bs(mpg, knots = knot_value_manual_3), 
##     data = data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -72.797 -10.978  -1.346   8.752  94.958 
## 
## Coefficients:
##                                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                             193.00      19.66   9.818  < 2e-16 ***
## bs(mpg, knots = knot_value_manual_3)1    23.45      22.31   1.051    0.294    
## bs(mpg, knots = knot_value_manual_3)2   -31.46      20.38  -1.544    0.123    
## bs(mpg, knots = knot_value_manual_3)3  -136.64      21.02  -6.501 2.48e-10 ***
## bs(mpg, knots = knot_value_manual_3)4  -102.70      21.50  -4.777 2.53e-06 ***
## bs(mpg, knots = knot_value_manual_3)5  -149.01      22.67  -6.573 1.60e-10 ***
## bs(mpg, knots = knot_value_manual_3)6  -126.65      26.93  -4.702 3.59e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 19.66 on 385 degrees of freedom
## Multiple R-squared:  0.7432, Adjusted R-squared:  0.7392 
## F-statistic: 185.7 on 6 and 385 DF,  p-value: < 2.2e-16
ggplot(data,aes(x=mpg, y=horsepower)) +
                 geom_point(alpha=0.55, color="black") +
  stat_smooth(method = "lm", 
               formula = y~bs(x, knots = knot_value_manual_3), 
               lty = 1,se = F)

Menggunakan knot yang ditentukan fungsi komputer

knot_value_pc_df_6 = attr(bs(data$mpg, df=6),"knots")
mod_spline_1 = lm(horsepower ~ bs(mpg, knots =knot_value_pc_df_6 ),data=data)
summary(mod_spline_1)
## 
## Call:
## lm(formula = horsepower ~ bs(mpg, knots = knot_value_pc_df_6), 
##     data = data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -74.043 -10.103  -0.818   8.075  95.778 
## 
## Coefficients:
##                                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                           195.012     13.181  14.795  < 2e-16 ***
## bs(mpg, knots = knot_value_pc_df_6)1    2.792     18.879   0.148    0.882    
## bs(mpg, knots = knot_value_pc_df_6)2  -80.488     12.649  -6.363 5.62e-10 ***
## bs(mpg, knots = knot_value_pc_df_6)3 -103.774     14.645  -7.086 6.62e-12 ***
## bs(mpg, knots = knot_value_pc_df_6)4 -126.115     14.600  -8.638  < 2e-16 ***
## bs(mpg, knots = knot_value_pc_df_6)5 -127.143     18.836  -6.750 5.45e-11 ***
## bs(mpg, knots = knot_value_pc_df_6)6 -141.413     18.226  -7.759 7.77e-14 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 19.56 on 385 degrees of freedom
## Multiple R-squared:  0.7457, Adjusted R-squared:  0.7417 
## F-statistic: 188.1 on 6 and 385 DF,  p-value: < 2.2e-16
ggplot(data,aes(x=mpg, y=horsepower)) +
                 geom_point(alpha=0.55, color="black") +
  stat_smooth(method = "lm", 
               formula = y~bs(x, knots = knot_value_pc_df_6), 
               lty = 1,se = F)

Natural Spline

knot_value_manual_3 = c(10, 20,40)
mod_spline3ns = lm(horsepower ~ ns(mpg, knots = knot_value_manual_3),data=data)
summary(mod_spline3ns)
## 
## Call:
## lm(formula = horsepower ~ ns(mpg, knots = knot_value_manual_3), 
##     data = data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -72.883 -10.919  -2.038   9.498  94.553 
## 
## Coefficients:
##                                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                            215.774     10.064  21.441   <2e-16 ***
## ns(mpg, knots = knot_value_manual_3)1 -157.655      9.363 -16.838   <2e-16 ***
## ns(mpg, knots = knot_value_manual_3)2 -122.005     11.837 -10.307   <2e-16 ***
## ns(mpg, knots = knot_value_manual_3)3 -203.251     20.752  -9.794   <2e-16 ***
## ns(mpg, knots = knot_value_manual_3)4 -136.070     10.659 -12.766   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 19.69 on 387 degrees of freedom
## Multiple R-squared:  0.741,  Adjusted R-squared:  0.7383 
## F-statistic: 276.8 on 4 and 387 DF,  p-value: < 2.2e-16
ggplot(data,aes(x=mpg, y=horsepower)) +
                 geom_point(alpha=0.55, color="black")+
      stat_smooth(method = "lm", 
                 formula = y~ns(x, knots = knot_value_manual_3), 
                 lty = 1,se=F)

Smoothing Splin Fungsi smooth.spline() dapat digunakan untuk melakukan smoothing spline pada R

model_sms <- with(data = data,smooth.spline(mpg,horsepower))
model_sms
## Call:
## smooth.spline(x = mpg, y = horsepower)
## 
## Smoothing Parameter  spar= 0.8528728  lambda= 0.003422279 (12 iterations)
## Equivalent Degrees of Freedom (Df): 6.989153
## Penalized Criterion (RSS): 35962.53
## GCV: 386.3516

Fungsi smooth.spline secara otomatis memilih paramter lambda terbaik dengan menggunakan Cross-Validation (CV) dan Generalized Cross-Validation (GCV). Perbedaan utama antara keduanya adalah pada ukuran kebaikan model yang digunakan. Kalau CV menggunakan MSE sebagai ukuran kebaikan model, sedangkan GCV menggunakan Weighted-MSE. Banyaknya folds (lipatan) pada CV dan GCV ini adalah sejumlah obserbasinya.

pred_data <- broom::augment(model_sms)

ggplot(pred_data,aes(x=x,y=y))+
  geom_point(alpha=0.55, color="black")+
  geom_line(aes(y=.fitted),col="blue",
            lty=1)+
  xlab("age")+
  ylab("triceps")+
  theme_bw()

Pengaruh parameter lambda terhadap tingkat smoothness kurva bisa dilihat dari ilustrasi berikut:

model_sms_lambda <- data.frame(lambda=seq(0,5,by=0.5)) %>% 
  group_by(lambda) %>% 
  do(broom::augment(with(data = data,smooth.spline(mpg,horsepower,lambda = .$lambda))))

p <- ggplot(model_sms_lambda,
       aes(x=x,y=y))+
  geom_line(aes(y=.fitted),
            col="blue",
            lty=1
            )+
  facet_wrap(~lambda)
p

Jika kita tentukan df=7, maka hasil kurva model smooth.spline akan lebih merepresentasikan data

model_sms <- with(data = data,smooth.spline(mpg,horsepower,df=7))
model_sms
## Call:
## smooth.spline(x = mpg, y = horsepower, df = 7)
## 
## Smoothing Parameter  spar= 0.8524981  lambda= 0.003401012 (13 iterations)
## Equivalent Degrees of Freedom (Df): 6.998735
## Penalized Criterion (RSS): 35955.27
## GCV: 386.3517
pred_data <- broom::augment(model_sms)

ggplot(pred_data,aes(x=x,y=y))+
  geom_point(alpha=0.55, color="black")+
  geom_line(aes(y=.fitted),col="blue",
            lty=1)+
  xlab("mpg")+
  ylab("horsepower")+
  theme_bw()

LOESS Masih menggunakan data triceps seperti pada ilustrasi sebelumnya, kali ini kita akan mencoba melakukan pendekatan local regression dengan fungsi loess().

model_loess <- loess(horsepower~mpg,
                     data = data)
summary(model_loess)
## Call:
## loess(formula = horsepower ~ mpg, data = data)
## 
## Number of Observations: 392 
## Equivalent Number of Parameters: 4.8 
## Residual Standard Error: 19.58 
## Trace of smoother matrix: 5.24  (exact)
## 
## Control settings:
##   span     :  0.75 
##   degree   :  2 
##   family   :  gaussian
##   surface  :  interpolate      cell = 0.2
##   normalize:  TRUE
##  parametric:  FALSE
## drop.square:  FALSE

Pengaruh parameter span terhadap tingkat smoothness kurva bisa dilihat dari ilustrasi berikut:

model_loess_span <- data.frame(span=seq(0.1,5,by=0.5)) %>% 
  group_by(span) %>% 
  do(broom::augment(loess(horsepower~mpg,
                     data = data,span=.$span)))
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : pseudoinverse used at 14
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : neighborhood radius 1
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : reciprocal condition number 6.6352e-16
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : There are other near singularities as well. 1
p2 <- ggplot(model_loess_span,
       aes(x=mpg,y=horsepower))+
  geom_line(aes(y=.fitted),
            col="blue",
            lty=1
            )+
  facet_wrap(~span)
p2

Pendekatan ini juga dapat dilakukan dengan fungsi stat_smooth() pada package ggplot2.

library(ggplot2)
ggplot(data, aes(mpg,horsepower)) +
  geom_point(alpha=0.5,color="black") +
  stat_smooth(method='loess',
               formula=y~x,
              span = 0.75,
              col="blue",
              lty=1,
              se=F)

Tuning span dapat dilakukan dengan menggunakan cross-validation:

set.seed(123)
cross_val <- vfold_cv(data,v=10,strata = "horsepower")

span <- seq(0.1,1,length.out=50)

best_loess <- map_dfr(span, function(i){
metric_loess <- map_dfr(cross_val$splits,
    function(x){
mod <- loess(horsepower ~ mpg,span = i,
         data=data[x$in_id,])
pred <- predict(mod,
               newdata=data[-x$in_id,])
truth <- data[-x$in_id,]$horsepower

data_eval <- na.omit(data.frame(pred=pred,
                                truth=truth))


rmse <- mlr3measures::rmse(truth = data_eval$truth,
                           response = data_eval$pred
                           )
mae <- mlr3measures::mae(truth = data_eval$truth,
                           response = data_eval$pred
                           )
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)

head(metric_loess,20)

# menghitung rata-rata untuk 10 folds
mean_metric_loess <- colMeans(metric_loess)

mean_metric_loess
}
)
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : pseudoinverse used at 14
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : neighborhood radius 1
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : reciprocal condition number 5.5634e-16
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : There are other near singularities as well. 1
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : pseudoinverse used at 14
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : neighborhood radius 1
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : reciprocal condition number 5.6179e-17
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : There are other near singularities as well. 1
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : pseudoinverse used at 14
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : neighborhood radius 1
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : reciprocal condition number 1.6592e-16
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : There are other near singularities as well. 1
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : pseudoinverse used at 14
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : neighborhood radius 1
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : reciprocal condition number 1.099e-16
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : There are other near singularities as well. 1
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : pseudoinverse used at 14
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : neighborhood radius 1
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : reciprocal condition number 1.6453e-16
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : There are other near singularities as well. 1
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : pseudoinverse used at 14
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : neighborhood radius 1
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : reciprocal condition number 5.5809e-17
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : There are other near singularities as well. 1
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : pseudoinverse used at 14
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : neighborhood radius 1
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : reciprocal condition number 1.1298e-16
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : There are other near singularities as well. 1
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : pseudoinverse used at 14
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : neighborhood radius 1
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : reciprocal condition number 0
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : There are other near singularities as well. 1
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : pseudoinverse used at 14
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : neighborhood radius 1
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : reciprocal condition number 1.0975e-16
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : There are other near singularities as well. 1
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : pseudoinverse used at 14
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : neighborhood radius 1
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : reciprocal condition number 3.8829e-16
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : There are other near singularities as well. 1
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : pseudoinverse used at 14
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : neighborhood radius 1
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : reciprocal condition number 1.1001e-16
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : pseudoinverse used at 14
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : neighborhood radius 1
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : reciprocal condition number 5.5972e-17
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : pseudoinverse used at 14
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : neighborhood radius 1
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : reciprocal condition number 5.5634e-16
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : There are other near singularities as well. 1
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : pseudoinverse used at 14
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : neighborhood radius 1
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : reciprocal condition number 1.099e-16
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : There are other near singularities as well. 1
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : pseudoinverse used at 14
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : neighborhood radius 1
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : reciprocal condition number 4.971e-16
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : There are other near singularities as well. 1
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : pseudoinverse used at 14
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : neighborhood radius 1
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : reciprocal condition number 0
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : pseudoinverse used at 14
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : neighborhood radius 1
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : reciprocal condition number 1.1005e-16
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : There are other near singularities as well. 1
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : pseudoinverse used at 14
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : neighborhood radius 1
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : reciprocal condition number 0
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : There are other near singularities as well. 1
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : pseudoinverse used at 14
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : neighborhood radius 1
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : reciprocal condition number 0
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : pseudoinverse used at 14
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : neighborhood radius 1
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : reciprocal condition number 1.1001e-16
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : pseudoinverse used at 14
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : neighborhood radius 1
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : reciprocal condition number 1.0996e-16
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : pseudoinverse used at 14
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : neighborhood radius 1
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : reciprocal condition number 1.098e-16
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : pseudoinverse used at 14
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : neighborhood radius 1
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : reciprocal condition number 1.099e-16
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : pseudoinverse used at 14
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : neighborhood radius 1
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : reciprocal condition number 1.6592e-16
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : pseudoinverse used at 14
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : neighborhood radius 1
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : reciprocal condition number 6.6309e-16
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : pseudoinverse used at 14
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : neighborhood radius 1
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : reciprocal condition number 5.5809e-17
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : pseudoinverse used at 14
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : neighborhood radius 1
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : reciprocal condition number 1.1298e-16
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : pseudoinverse used at 14
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : neighborhood radius 1
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : reciprocal condition number 0
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : There are other near singularities as well. 1
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : pseudoinverse used at 14
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : neighborhood radius 1
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : reciprocal condition number 5.6179e-17
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : pseudoinverse used at 14
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : neighborhood radius 1
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : reciprocal condition number 1.099e-16
best_loess <- cbind(span=span,best_loess)
# menampilkan hasil all breaks
best_loess
##         span     rmse      mae
## 1  0.1000000 20.00105 14.28375
## 2  0.1183673 19.96338 14.22833
## 3  0.1367347 19.71640 14.12976
## 4  0.1551020 19.67018 14.09036
## 5  0.1734694 19.57790 14.00560
## 6  0.1918367 19.57762 14.02209
## 7  0.2102041 19.56415 13.94293
## 8  0.2285714 19.46750 13.71398
## 9  0.2469388 19.45020 13.72098
## 10 0.2653061 19.42651 13.69578
## 11 0.2836735 19.42832 13.69567
## 12 0.3020408 19.39041 13.65689
## 13 0.3204082 19.39218 13.66939
## 14 0.3387755 19.38693 13.68450
## 15 0.3571429 19.39189 13.68922
## 16 0.3755102 19.37590 13.68882
## 17 0.3938776 19.37627 13.69517
## 18 0.4122449 19.37478 13.69975
## 19 0.4306122 19.35870 13.70665
## 20 0.4489796 19.35740 13.70765
## 21 0.4673469 19.34883 13.71625
## 22 0.4857143 19.35160 13.71946
## 23 0.5040816 19.34070 13.72932
## 24 0.5224490 19.35629 13.73985
## 25 0.5408163 19.35799 13.74649
## 26 0.5591837 19.36481 13.75812
## 27 0.5775510 19.37346 13.77230
## 28 0.5959184 19.38235 13.79317
## 29 0.6142857 19.38671 13.80130
## 30 0.6326531 19.39340 13.81919
## 31 0.6510204 19.39811 13.83825
## 32 0.6693878 19.41165 13.86356
## 33 0.6877551 19.41740 13.88092
## 34 0.7061224 19.42807 13.90341
## 35 0.7244898 19.43267 13.91519
## 36 0.7428571 19.44687 13.94586
## 37 0.7612245 19.45099 13.95664
## 38 0.7795918 19.46082 13.97140
## 39 0.7979592 19.46725 13.98390
## 40 0.8163265 19.47076 13.99522
## 41 0.8346939 19.48650 14.02348
## 42 0.8530612 19.48605 14.02414
## 43 0.8714286 19.49624 14.03422
## 44 0.8897959 19.51399 14.06612
## 45 0.9081633 19.51819 14.07623
## 46 0.9265306 19.53973 14.09993
## 47 0.9448980 19.54760 14.11187
## 48 0.9632653 19.57661 14.15542
## 49 0.9816327 19.62677 14.21747
## 50 1.0000000 19.78647 14.36500
#berdasarkan rmse
best_loess %>% slice_min(rmse)
##        span    rmse      mae
## 1 0.5040816 19.3407 13.72932
#berdasarkan mae
best_loess %>% slice_min(mae)
##        span     rmse      mae
## 1 0.3020408 19.39041 13.65689
library(ggplot2)
ggplot(data, aes(mpg,horsepower)) +
  geom_point(alpha=0.5,color="black") +
  stat_smooth(method='loess',
              formula=y~x,
              span = 0.4306122,
              col="blue",
              lty=1,
              se=F)