Responsi Report I (P08 and P09)

Faadiyah Ramadhani (G1401201009)

2022-11-04

PRACTICUM 08 MATERIAL

Library

library(MultiKink)
library(tidyverse)
library(ggplot2)
library(dplyr)
library(purrr)
library(rsample)
library(PerformanceAnalytics)

SIMULATED DATA

Data

set.seed(123)
data.x <- rnorm(1000,1,1)
err <- rnorm(1000)
y <- 5+2*data.x+3*data.x^2+err # order-2 polynomial
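
In equation form, the simulated data follow

$$y_i = 5 + 2x_i + 3x_i^2 + \varepsilon_i, \qquad x_i \sim N(1,1), \quad \varepsilon_i \sim N(0,1),$$

so the true mean function is an order-2 polynomial in data.x.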

Data Exploration

Correlation

cor(data.x,y)
## [1] 0.892582

The correlation between data.x and y is 0.892582. Pearson correlation captures only the linear part of the relationship, which is why it is below 1 even though y is an exact quadratic function of data.x plus noise.

Plot

plot(data.x,y,xlim=c(-2,5), ylim=c(-10,70))

Linear Regression

lin.mod <- lm(y~data.x)
plot(data.x,y,xlim=c(-2,5), ylim=c(-10,70))
lines(data.x,lin.mod$fitted.values,col="red") # plot x against the fitted values (yhat)

summary(lin.mod)
## 
## Call:
## lm(formula = y ~ data.x)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -5.686 -2.574 -1.428  1.195 27.185 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   4.6056     0.1902   24.22   <2e-16 ***
## data.x        8.3790     0.1340   62.54   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.2 on 998 degrees of freedom
## Multiple R-squared:  0.7967, Adjusted R-squared:  0.7965 
## F-statistic:  3911 on 1 and 998 DF,  p-value: < 2.2e-16

Polynomial Regression

poly.mod<- lm(y~data.x+I(data.x^2)) 
ix <- sort( data.x,index.return=T)$ix # sort by x and keep the ordering indices (which observation goes where)
plot(data.x,y,xlim=c(-2,5), ylim=c(-10,70))
lines(data.x[ix], poly.mod$fitted.values[ix],col="blue", cex=2)

summary(poly.mod)
## 
## Call:
## lm(formula = y ~ data.x + I(data.x^2))
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.0319 -0.6942  0.0049  0.7116  3.2855 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  4.95193    0.04568  108.41   <2e-16 ***
## data.x       2.10732    0.05861   35.95   <2e-16 ***
## I(data.x^2)  2.99081    0.02338  127.93   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.007 on 997 degrees of freedom
## Multiple R-squared:  0.9883, Adjusted R-squared:  0.9883 
## F-statistic: 4.221e+04 on 2 and 997 DF,  p-value: < 2.2e-16
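
Note that the fitted coefficients essentially recover the generating model:

$$\hat{y} = 4.95 + 2.11x + 2.99x^2 \approx 5 + 2x + 3x^2.$$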

Step Function Regression

# step function regression
range(data.x) # minimum and maximum values
## [1] -1.809775  4.241040

The minimum of data.x is -1.809775 and the maximum is 4.241040.

# construct the step-function dummy variables manually
c1 <- as.factor(ifelse(data.x<=0,1,0))
c2 <- as.factor(ifelse(data.x<=2 & data.x>0,1,0))
c3 <- as.factor(ifelse(data.x>2,1,0))
head(data.frame(y,c1,c2,c3))
##           y c1 c2 c3
## 1  5.462795  0  1  0
## 2  7.277570  0  1  0
## 3 29.740401  0  0  1
## 4 10.446806  0  1  0
## 5  8.535105  0  1  0
## 6 33.585437  0  0  1
step.mod <- lm(y~c1+c2+c3)
plot(data.x,y,xlim=c(-2,5), ylim=c(-10,70))
lines(data.x,lin.mod$fitted.values,col="red")
lines(data.x[ix], step.mod$fitted.values[ix],col="green")

summary(step.mod)
## 
## Call:
## lm(formula = y ~ c1 + c2 + c3)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -10.395  -3.534  -0.530   2.527  36.876 
## 
## Coefficients: (1 not defined because of singularities)
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  30.4499     0.4087   74.50   <2e-16 ***
## c11         -25.2395     0.5710  -44.21   <2e-16 ***
## c21         -19.4184     0.4536  -42.81   <2e-16 ***
## c31               NA         NA      NA       NA    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.121 on 997 degrees of freedom
## Multiple R-squared:  0.698,  Adjusted R-squared:  0.6974 
## F-statistic:  1152 on 2 and 997 DF,  p-value: < 2.2e-16
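
The note "(1 not defined because of singularities)" appears because c1 + c2 + c3 always sums to one and is therefore collinear with the intercept, so R drops c3. A minimal sketch (assuming the same breakpoints 0 and 2) that sidesteps this by encoding the bins as a single factor:

# Sketch: let R build a full-rank set of dummies from one factor, which
# avoids the intercept/c1+c2+c3 collinearity that made c31 inestimable.
bins <- cut(data.x, breaks = c(-Inf, 0, 2, Inf))
step.mod.alt <- lm(y ~ bins)
# step.mod.alt reproduces the same fitted step function as step.mod;
# only the parameterization of the levels differs.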

Model Comparison

AIC Values

perbandinganAIC <- data.frame("Model" = c("Linear", "Polynomial (order=2)", "Step (breaks=3)"), "AIC" = c(AIC(lin.mod),AIC(poly.mod),AIC(step.mod)))
dplyr::arrange(.data=perbandinganAIC, AIC)
##                  Model      AIC
## 1 Polynomial (order=2) 2856.470
## 2               Linear 5711.836
## 3      Step (breaks=3) 6109.609
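
For reference (standard theory, not part of the practicum output): for a Gaussian linear model, AIC can be written, up to an additive constant, as

$$\mathrm{AIC} = n \ln\!\left(\frac{\mathrm{RSS}}{n}\right) + 2k,$$

where RSS is the residual sum of squares and k is the number of estimated parameters; smaller values indicate a better fit-complexity trade-off.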

MSE Values

MSE = function(pred,actual){
  mean((pred-actual)^2)
}
perbandinganMSE <- data.frame("Model" = c("Linear", "Polynomial (order=2)", "Step (breaks=3)"), "MSE" = c(MSE(predict(lin.mod),y),MSE(predict(poly.mod),y),MSE(predict(step.mod),y)))
dplyr::arrange(.data=perbandinganMSE, MSE)
##                  Model       MSE
## 1 Polynomial (order=2)  1.010649
## 2               Linear 17.601059
## 3      Step (breaks=3) 26.146934

Based on the results above, the order-2 polynomial regression model has the smallest AIC and MSE. This is as expected, since the data were generated from an order-2 polynomial; by both criteria it is the best of the three models.

DATA TRICEPS

The data used for this illustration come from an anthropometric study of 892 females under 50 years of age in three Gambian villages in West Africa. The data consist of 3 columns, namely age, lntriceps, and triceps. Each column is described below:

  1. age : age of the respondent

  2. lntriceps : natural logarithm of the triceps skinfold thickness

  3. triceps : triceps skinfold thickness

The triceps skinfold is needed to calculate the upper-arm muscle circumference. Its thickness provides information about body fat reserves, while the calculated muscle mass provides information about protein reserves.

Data

data("triceps", package="MultiKink")
dt.triceps <- data.frame("age"=triceps$age,
                         "lntriceps"=triceps$lntriceps,
                         "triceps"=triceps$triceps)
head(dt.triceps)
##     age lntriceps triceps
## 1 12.05  1.223776     3.4
## 2  9.91  1.386294     4.0
## 3 10.04  1.435084     4.2
## 4 11.49  1.435084     4.2
## 5 10.12  1.481605     4.4
## 6 11.87  1.481605     4.4

Data Exploration

Correlation

cor(dt.triceps)
##                 age lntriceps   triceps
## age       1.0000000 0.5776443 0.5806972
## lntriceps 0.5776443 1.0000000 0.9612043
## triceps   0.5806972 0.9612043 1.0000000

Correlation Plot

chart.Correlation(dt.triceps)

Based on the correlation plot above, the correlation between age and lntriceps is positive, at 0.58. The correlation between age and triceps is likewise positive, at 0.58. The correlation between lntriceps and triceps is positive, at 0.96.

Scatterplot

ggplot(dt.triceps,aes(x=age, y=triceps)) +
                 geom_point(alpha=0.55, color="black") + 
                 theme_bw()

Based on the pattern visible in the scatterplot, we will try to find a model that represents the relationship well.

Linear Regression

mod_linear = lm(triceps~age,data=dt.triceps)
summary(mod_linear)
## 
## Call:
## lm(formula = triceps ~ age, data = dt.triceps)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -12.9512  -2.3965  -0.5154   1.5822  25.1233 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  6.19717    0.21244   29.17   <2e-16 ***
## age          0.21584    0.01014   21.28   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.007 on 890 degrees of freedom
## Multiple R-squared:  0.3372, Adjusted R-squared:  0.3365 
## F-statistic: 452.8 on 1 and 890 DF,  p-value: < 2.2e-16
ringkasan_linear <- summary(mod_linear)
ringkasan_linear$r.squared
## [1] 0.3372092
AIC(mod_linear)
## [1] 5011.515
tabel_data <- rbind(data.frame("y_aktual"=dt.triceps$triceps,
                               "y_pred"=predict(mod_linear),
                               "residual"=residuals(mod_linear)))
head(tabel_data)
##   y_aktual   y_pred  residual
## 1      3.4 8.798074 -5.398074
## 2      4.0 8.336171 -4.336171
## 3      4.2 8.364231 -4.164231
## 4      4.2 8.677202 -4.477202
## 5      4.4 8.381498 -3.981498
## 6      4.4 8.759222 -4.359222
ggplot(dt.triceps,aes(x=age, y=triceps)) +
                 geom_point(alpha=0.55, color="black") +
   stat_smooth(method = "lm", 
               formula = y~x,lty = 1,
               col = "blue",se = F)+
  theme_bw()

Polynomial Regression, Degree 2 (order=2)

mod_polinomial2 = lm(triceps ~ poly(age,2,raw = T),
                     data=dt.triceps)
summary(mod_polinomial2)
## 
## Call:
## lm(formula = triceps ~ poly(age, 2, raw = T), data = dt.triceps)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -12.5677  -2.4401  -0.4587   1.6368  24.9961 
## 
## Coefficients:
##                          Estimate Std. Error t value Pr(>|t|)    
## (Intercept)             6.0229191  0.3063806  19.658  < 2e-16 ***
## poly(age, 2, raw = T)1  0.2434733  0.0364403   6.681 4.17e-11 ***
## poly(age, 2, raw = T)2 -0.0006257  0.0007926  -0.789     0.43    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.008 on 889 degrees of freedom
## Multiple R-squared:  0.3377, Adjusted R-squared:  0.3362 
## F-statistic: 226.6 on 2 and 889 DF,  p-value: < 2.2e-16
ggplot(dt.triceps,aes(x=age, y=triceps)) + 
  geom_point(alpha=0.55, color="black") +
  stat_smooth(method = "lm", 
               formula = y~poly(x,2,raw=T), 
               lty = 1, col = "blue",se = F)+
  theme_bw()

ggplot(dt.triceps,aes(x=age, y=triceps)) + 
  geom_point(alpha=0.55, color="black") +
  stat_smooth(method = "lm", 
               formula = y~poly(x,2,raw=T), 
               lty = 1, col = "blue",se = T) + theme_bw()  # with the standard-error band shown

Polynomial Regression, Degree 3 (order=3)

mod_polinomial3 = lm(triceps ~ poly(age,3,raw = T),data=dt.triceps)
summary(mod_polinomial3)
## 
## Call:
## lm(formula = triceps ~ poly(age, 3, raw = T), data = dt.triceps)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -11.5832  -1.9284  -0.5415   1.3283  24.4440 
## 
## Coefficients:
##                          Estimate Std. Error t value Pr(>|t|)    
## (Intercept)             8.004e+00  3.831e-01  20.893  < 2e-16 ***
## poly(age, 3, raw = T)1 -3.157e-01  7.721e-02  -4.089 4.73e-05 ***
## poly(age, 3, raw = T)2  3.101e-02  3.964e-03   7.824 1.45e-14 ***
## poly(age, 3, raw = T)3 -4.566e-04  5.612e-05  -8.135 1.38e-15 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.868 on 888 degrees of freedom
## Multiple R-squared:  0.3836, Adjusted R-squared:  0.3815 
## F-statistic: 184.2 on 3 and 888 DF,  p-value: < 2.2e-16

Based on the results above, the coefficients of X^1, X^2, and X^3 are all significant, and the adjusted R-squared rises to 0.3815 (from 0.3365 for the linear model).

ggplot(dt.triceps,aes(x=age, y=triceps)) + 
  geom_point(alpha=0.55, color="black") +
  stat_smooth(method = "lm", 
               formula = y~poly(x,3,raw=T), 
               lty = 1, col = "blue",se = T) + theme_bw()

Model Comparison

AIC Values

perbandinganAIC2 <- data.frame("Model" = c("Linear Model", "Polynomial Model (order=2)", "Polynomial Model (order=3)"), "AIC" = c(AIC(mod_linear),AIC(mod_polinomial2), AIC(mod_polinomial3)))
dplyr::arrange(.data=perbandinganAIC2, AIC)
##                        Model      AIC
## 1 Polynomial Model (order=3) 4950.774
## 2               Linear Model 5011.515
## 3 Polynomial Model (order=2) 5012.890

MSE Values

perbandinganMSE2 <- data.frame("Model" = c("Linear Model", "Polynomial Model (order=2)", "Polynomial Model (order=3)"), "MSE" = c(MSE(predict(mod_linear),dt.triceps$triceps),
  MSE(predict(mod_polinomial2),dt.triceps$triceps),
  MSE(predict(mod_polinomial3),dt.triceps$triceps)))
dplyr::arrange(.data=perbandinganMSE2, MSE)
##                        Model      MSE
## 1 Polynomial Model (order=3) 14.89621
## 2 Polynomial Model (order=2) 16.00636
## 3               Linear Model 16.01758

Based on the results above, the order-3 polynomial regression model has the smallest AIC and MSE, so by both criteria it is the best of these models.

Step Function Regression (breaks=5)

mod_tangga5 = lm(triceps ~ cut(age,5),data=dt.triceps)
summary(mod_tangga5)
## 
## Call:
## lm(formula = triceps ~ cut(age, 5), data = dt.triceps)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -10.5474  -2.0318  -0.4465   1.3682  23.3759 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              7.2318     0.1994  36.260  < 2e-16 ***
## cut(age, 5)(10.6,20.9]   1.6294     0.3244   5.023 6.16e-07 ***
## cut(age, 5)(20.9,31.2]   5.9923     0.4222  14.192  < 2e-16 ***
## cut(age, 5)(31.2,41.5]   7.5156     0.4506  16.678  < 2e-16 ***
## cut(age, 5)(41.5,51.8]   7.4561     0.5543  13.452  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.939 on 887 degrees of freedom
## Multiple R-squared:  0.3617, Adjusted R-squared:  0.3588 
## F-statistic: 125.7 on 4 and 887 DF,  p-value: < 2.2e-16
ggplot(dt.triceps,aes(x=age, y=triceps)) +
       geom_point(alpha=0.55, color="black") +
       stat_smooth(method = "lm", 
               formula = y~cut(x,5), 
               lty = 1, col = "blue",se = F)+
  theme_bw()

Step Function Regression (breaks=7)

mod_tangga7 = lm(triceps ~ cut(age,7),data=dt.triceps)
summary(mod_tangga7)
## 
## Call:
## lm(formula = triceps ~ cut(age, 7), data = dt.triceps)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -10.8063  -1.7592  -0.4366   1.2894  23.1461 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              7.5592     0.2219  34.060  < 2e-16 ***
## cut(age, 7)(7.62,15]    -0.6486     0.3326  -1.950   0.0515 .  
## cut(age, 7)(15,22.3]     3.4534     0.4239   8.146 1.27e-15 ***
## cut(age, 7)(22.3,29.7]   5.8947     0.4604  12.804  < 2e-16 ***
## cut(age, 7)(29.7,37]     7.8471     0.5249  14.949  < 2e-16 ***
## cut(age, 7)(37,44.4]     6.9191     0.5391  12.835  < 2e-16 ***
## cut(age, 7)(44.4,51.8]   6.3013     0.6560   9.606  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.805 on 885 degrees of freedom
## Multiple R-squared:  0.4055, Adjusted R-squared:  0.4014 
## F-statistic: 100.6 on 6 and 885 DF,  p-value: < 2.2e-16
ggplot(dt.triceps,aes(x=age, y=triceps)) +
       geom_point(alpha=0.55, color="black") +
       stat_smooth(method = "lm", 
               formula = y~cut(x,7), 
               lty = 1, col = "blue",se = F)+
  theme_bw()

Model Comparison

MSE Values

MSE = function(pred,actual){
  mean((pred-actual)^2)
}
# store the results in a separate object so the MSE() function above
# is not overwritten
mse_values <- rbind(MSE(predict(mod_linear),dt.triceps$triceps),
              MSE(predict(mod_polinomial2),dt.triceps$triceps),
              MSE(predict(mod_polinomial3),dt.triceps$triceps),
              MSE(predict(mod_tangga5),dt.triceps$triceps),
              MSE(predict(mod_tangga7),dt.triceps$triceps))
Model <- c("Linear","Polynomial (order=2)", 
           "Polynomial (order=3)","Step (breaks=5)", 
           "Step (breaks=7)")
data.frame(Model, MSE = mse_values)
##                  Model      MSE
## 1               Linear 16.01758
## 2 Polynomial (order=2) 16.00636
## 3 Polynomial (order=3) 14.89621
## 4      Step (breaks=5) 15.42602
## 5      Step (breaks=7) 14.36779

Based on the results above, the step-function model (breaks=7) has the smallest MSE, so by this criterion it is the best of the five models. Note, however, that these MSEs are computed on the training data, which favors the more flexible models; the next section therefore re-evaluates the models with cross-validation.

Cross Validation Evaluation

Linear Regression

set.seed(123)
cross_val <- vfold_cv(dt.triceps,v=10,strata = "triceps") # v is usually chosen as 5 or 10
metric_linear <- map_dfr(cross_val$splits,
    function(x){
    mod <- lm(triceps ~ age,data=dt.triceps[x$in_id,])
    pred <- predict(mod,newdata=dt.triceps[-x$in_id,])
    truth <- dt.triceps[-x$in_id,]$triceps
    rmse <- mlr3measures::rmse(truth = truth,response = pred)
    mae <- mlr3measures::mae(truth = truth,response = pred)
    metric <- c(rmse,mae)
    names(metric) <- c("rmse","mae")
    return(metric)
  }
)
metric_linear
## # A tibble: 10 x 2
##     rmse   mae
##    <dbl> <dbl>
##  1  3.65  2.82
##  2  4.62  3.22
##  3  4.38  3.00
##  4  3.85  2.80
##  5  3.08  2.36
##  6  3.83  2.81
##  7  3.59  2.78
##  8  4.66  3.06
##  9  3.50  2.56
## 10  4.59  2.93
# compute the average over the 10 folds
mean_metric_linear <- colMeans(metric_linear)
mean_metric_linear
##     rmse      mae 
## 3.973249 2.833886

Polynomial Regression, Degree 2 (order=2)

set.seed(123)
cross_val <- vfold_cv(dt.triceps,v=10,strata = "triceps") 
metric_poly2 <- map_dfr(cross_val$splits,
function(x){
  mod <- lm(triceps ~ poly(age,2,raw = T),data=dt.triceps[x$in_id,])
  pred <- predict(mod,newdata=dt.triceps[-x$in_id,])
  truth <- dt.triceps[-x$in_id,]$triceps
  rmse <- mlr3measures::rmse(truth = truth,response = pred)
  mae <- mlr3measures::mae(truth = truth,response = pred)
  metric <- c(rmse,mae)
  names(metric) <- c("rmse","mae")
  return(metric)
  }
)
metric_poly2
## # A tibble: 10 x 2
##     rmse   mae
##    <dbl> <dbl>
##  1  3.64  2.82
##  2  4.62  3.26
##  3  4.39  3.02
##  4  3.85  2.81
##  5  3.10  2.38
##  6  3.82  2.81
##  7  3.62  2.83
##  8  4.65  3.07
##  9  3.50  2.57
## 10  4.59  2.94
# compute the average over the 10 folds
mean_metric_poly2 <- colMeans(metric_poly2)
mean_metric_poly2
##     rmse      mae 
## 3.977777 2.851787

Polynomial Regression, Degree 3 (order=3)

set.seed(123)
cross_val <- vfold_cv(dt.triceps,v=10,strata = "triceps")

metric_poly3 <- map_dfr(cross_val$splits,
function(x){
  mod <- lm(triceps ~ poly(age,3,raw = T),data=dt.triceps[x$in_id,])
  pred <- predict(mod,newdata=dt.triceps[-x$in_id,])
  truth <- dt.triceps[-x$in_id,]$triceps
  rmse <- mlr3measures::rmse(truth = truth,response = pred)
  mae <- mlr3measures::mae(truth = truth,response = pred)
  metric <- c(rmse,mae)
  names(metric) <- c("rmse","mae")
  return(metric)
  }
)
metric_poly3
## # A tibble: 10 x 2
##     rmse   mae
##    <dbl> <dbl>
##  1  3.49  2.60
##  2  4.48  2.99
##  3  4.21  2.85
##  4  4.02  2.75
##  5  3.03  2.09
##  6  3.63  2.59
##  7  3.53  2.52
##  8  4.54  2.91
##  9  3.27  2.33
## 10  4.27  2.68
# compute the average over the 10 folds
mean_metric_poly3 <- colMeans(metric_poly3)
mean_metric_poly3
##     rmse      mae 
## 3.845976 2.632125

Step Function Regression

set.seed(123)
cross_val <- vfold_cv(dt.triceps,v=10,strata = "triceps")
breaks <- 3:10 # candidate numbers of bins, 3 to 10 (search for the best one)
best_tangga <- map_dfr(breaks, function(i){
    metric_tangga <- map_dfr(cross_val$splits,
    function(x){
        training <- dt.triceps[x$in_id,]
        training$age <- cut(training$age,i)
        mod <- lm(triceps ~ age,data=training)
        # recover the numeric bin edges from the factor labels "(a,b]"
        labs_x <- levels(mod$model[,2])
        labs_x_breaks <- cbind(lower = as.numeric( sub("\\((.+),.*", "\\1", labs_x) ),
        upper = as.numeric( sub("[^,]*,([^]]*)\\]", "\\1", labs_x)))
        # re-bin the test ages with the training cutpoints; test ages outside
        # the training range become NA and are dropped by na.omit() below
        testing <- dt.triceps[-x$in_id,]
        age_new <-cut(testing$age,c(labs_x_breaks[1,1],
                                    labs_x_breaks[,2]))
        pred <- predict(mod,newdata=list(age=age_new))
        truth <- testing$triceps
        data_eval <- na.omit(data.frame(truth,pred))
        rmse <- mlr3measures::rmse(truth = data_eval$truth,
                                   response = data_eval$pred)
        mae <- mlr3measures::mae(truth = data_eval$truth,
                                 response = data_eval$pred)
        metric <- c(rmse,mae)
        names(metric) <- c("rmse","mae")
        return(metric)
      }
    )
  metric_tangga
  # compute the average over the 10 folds
  mean_metric_tangga <- colMeans(metric_tangga)
  mean_metric_tangga
  }
)

best_tangga <- cbind(breaks=breaks,best_tangga)
# display the results for all breaks
best_tangga
##   breaks     rmse      mae
## 1      3 3.835357 2.618775
## 2      4 3.882932 2.651911
## 3      5 3.917840 2.724368
## 4      6 3.836068 2.622939
## 5      7 3.789715 2.555062
## 6      8 3.812789 2.555563
## 7      9 3.781720 2.518706
## 8     10 3.795877 2.529479
# best according to rmse
best_tangga %>% slice_min(rmse)
##   breaks    rmse      mae
## 1      9 3.78172 2.518706
# best according to mae
best_tangga %>% slice_min(mae)
##   breaks    rmse      mae
## 1      9 3.78172 2.518706

Based on the results above, the best step-function model is the one with breaks=9, since it yields the smallest RMSE and MAE among the candidate break counts.

Model Comparison

RMSE and MAE Values

nilai_metric <- rbind(mean_metric_linear,
                      mean_metric_poly2,
                      mean_metric_poly3,
                      best_tangga %>% select(-1) %>% slice_min(mae))

Model <- c("Linear","Poly2","Poly3","Step (breaks=9)")
data.frame(Model,nilai_metric)
##             Model     rmse      mae
## 1          Linear 3.973249 2.833886
## 2           Poly2 3.977777 2.851787
## 3           Poly3 3.845976 2.632125
## 4 Step (breaks=9) 3.781720 2.518706

Based on the results above, the step-function regression model (breaks=9) yields the smallest RMSE and MAE, so under cross-validation it is the best of these models.

PRACTICUM 09 MATERIAL

Library

#install.packages("MultiKink")
library(tidyverse)
library(ggplot2)
library(dplyr)
library(purrr)
library(rsample)
library(splines)

SIMULATED DATA

set.seed(123)
data.x <- rnorm(1000,1,1)
err <- rnorm(1000,0,5)
y <- 5+2*data.x+3*data.x^2+err
plot(data.x,y,xlim=c(-2,5), ylim=c(-10,70), pch=16, col="orange")
abline(v=1, col="red", lty=2)

Piecewise Cubic Polynomial

dt.all <- cbind(y,data.x)
## one knot, at x = 1
dt1 <- dt.all[data.x <=1,]
dim(dt1)
## [1] 495   2
dt2 <- dt.all[data.x >1,]
dim(dt2)
## [1] 505   2

Based on the results above, dt1 (data.x <= 1) contains 495 observations and dt2 (data.x > 1) contains 505 observations.

plot(data.x,y,xlim=c(-2,5), ylim=c(-10,70), pch=16, col="orange")

cub.mod1 <- lm(dt1[,1]~dt1[,2]+I(dt1[,2]^2)+I(dt1[,2]^3))
ix <- sort(dt1[,2], index.return=T)$ix
lines(dt1[ix,2],cub.mod1$fitted.values[ix],col="blue", lwd=2)

cub.mod2 <- lm(dt2[,1]~dt2[,2]+I(dt2[,2]^2)+I(dt2[,2]^3))
ix <- sort(dt2[,2], index.return=T)$ix
lines(dt2[ix,2],cub.mod2$fitted.values[ix],col="blue", lwd=2)

Based on the plot above, there is still a cut-off (discontinuity) between the two blue curves at the knot, so a truncated power basis can be used to join them.
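
For reference (a standard identity, not shown in the original), a cubic spline with a single knot $\xi$ can be written in the truncated power basis as

$$f(x) = \beta_0 + \beta_1 x + \beta_2 x^2 + \beta_3 x^3 + \beta_4 (x-\xi)_+^3, \qquad (u)_+ = \max(u, 0),$$

which is continuous with continuous first and second derivatives at $\xi$. The variable hx constructed below is exactly the $(x-1)_+^3$ term.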

Truncated Power Basis

# using the truncated power basis
plot(data.x,y,xlim=c( -2,5), ylim=c( -10,70), pch=16,col="orange")
abline(v=1,col="red", lty=2)

hx <- ifelse( data.x>1,(data.x-1)^3,0)

cubspline.mod <- lm( y~data.x+I(data.x^2)+I(data.x^3)+hx)
ix <- sort( data.x,index.return=T)$ix
lines( data.x[ix], cubspline.mod$fitted.values[ix],col="green", lwd=2)

Cubic Spline with Two Knots

plot( data.x,y,xlim=c(-2,5), ylim=c( -10,70), pch=16,col="orange")
abline(v=0,col="red", lty=2)
abline(v=2,col="red", lty=2)

hx1 <- ifelse( data.x>0,(data.x-0)^3,0)
hx2 <- ifelse( data.x>2,(data.x-2)^3,0)

cubspline.mod2 <- lm( y~data.x+I(data.x^2)+I(data.x^3)+hx1+hx2)
ix <- sort( data.x,index.return=T)$ix
lines( data.x[ix],cubspline.mod2$fitted.values[ix],col="green", lwd=2)

Comparison of 1 Knot and 2 Knots with 5-fold CV

## 1 knot (knot at x = 1)
data.all <- cbind( y,data.x,hx)
set.seed(456)
ind <- sample(1:5,length( data.x),replace=T)
res <- c()
for( i in 1:5){
  dt.train <- data.all[ ind!=i,]
  x.test <- data.all[ ind==i, -1]
  y.test <- data.all[ ind==i,1]
  mod1 <- lm( dt.train[,1]~dt.train[,2]+I( dt.train[,2]^2)+I( dt.train[,2]^3)+dt.train[,3])
  # build the test design matrix (intercept, x, x^2, x^3, h(x)) and
  # apply the estimated coefficients to get test-set predictions
  x.test.olah <- cbind(1,x.test[,1], x.test[,1]^2,x.test[,1]^3,x.test[,2])
  beta <- coefficients(mod1)
  prediksi <- x.test.olah%*%beta
  res <- c( res,mean(( y.test-prediksi)^2)) # res holds the test MSE per fold
}
res
## [1] 22.08767 21.39598 29.47677 29.68683 25.18070
## 2 knots (at x = 0 and x = 2)
data.all2 <- cbind(y,data.x,hx1,hx2)
set.seed(456)
ind2 <- sample(1:5,length( data.x),replace=T)
res2 <- c()
for( i in 1:5){
  dt.train2 <- data.all2[ind2!=i,]
  x.test2 <- data.all2[ind2==i,-1]
  y.test2 <- data.all2[ind2==i,1]
  mod2 <- lm(dt.train2[,1]~dt.train2[,2]+I(dt.train2[,2]^2)+I(dt.train2[,2]^3)+dt.train2[,3]+dt.train2[,4])
  x.test.olah2 <- cbind(1,x.test2[,1],x.test2[,1]^2,x.test2[,1]^3,x.test2[,2],x.test2[,3])
  beta2 <- coefficients(mod2)
  prediksi2 <- x.test.olah2%*%beta2
  res2 <- c(res2,mean((y.test2-prediksi2)^2))
}
res2
## [1] 22.32898 21.33868 29.40459 29.79913 25.22047

Comparison of the Mean CV MSE for 1 Knot and 2 Knots

PerbandinganRES <-matrix(c(mean(res),mean(res2)),nrow=1,ncol=2)
colnames(PerbandinganRES)<- c("1 Knot","2 Knots")
row.names(PerbandinganRES) <- c("Mean CV MSE")
knitr::kable(PerbandinganRES, align = "c")
              1 Knot   2 Knots
Mean CV MSE  25.56559  25.61837

Based on the mean cross-validated MSE, the 1-knot model is slightly better, since its mean CV MSE is smaller than that of the 2-knot model.

Smoothing Spline

ss1 <- smooth.spline(data.x,y,all.knots = T)
plot(data.x,y,xlim=c(-2,5), ylim=c(-10,70), pch=16,col="orange")
lines(ss1,col="purple", lwd=2)
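
As background (standard theory, not in the original output), smooth.spline() estimates the function f that minimizes the penalized residual sum of squares

$$\sum_{i=1}^{n} \big(y_i - f(x_i)\big)^2 + \lambda \int f''(t)^2\,dt,$$

where the smoothing parameter $\lambda$ trades goodness of fit against roughness of the curve.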

DATA TRICEPS

The data used for this illustration come from an anthropometric study of 892 females under 50 years of age in three Gambian villages in West Africa. The data consist of 3 columns, namely age, lntriceps, and triceps. Each column is described below:

  1. age : age of the respondent

  2. lntriceps : natural logarithm of the triceps skinfold thickness

  3. triceps : triceps skinfold thickness

The triceps skinfold is needed to calculate the upper-arm muscle circumference. Its thickness provides information about body fat reserves, while the calculated muscle mass provides information about protein reserves.

Data

data("triceps", package="MultiKink")
dt.triceps <- data.frame("age"=triceps$age,
                         "lntriceps"=triceps$lntriceps,
                         "triceps"=triceps$triceps)
head(dt.triceps)
##     age lntriceps triceps
## 1 12.05  1.223776     3.4
## 2  9.91  1.386294     4.0
## 3 10.04  1.435084     4.2
## 4 11.49  1.435084     4.2
## 5 10.12  1.481605     4.4
## 6 11.87  1.481605     4.4

Data Exploration

Scatterplot

ggplot(dt.triceps,aes(x=age, y=triceps)) +
                 geom_point(alpha=0.55, color="black") + 
                 theme_bw() 

Based on the pattern visible in the scatterplot, we will try to find a model that represents the relationship well.

Spline Regression

# determine the number of basis functions and knots
dim(bs(dt.triceps$age, knots = c(10, 20,40)))
## [1] 892   6
  # or, equivalently
dim(bs(dt.triceps$age, df=6))
## [1] 892   6
# knots chosen automatically (at quantiles of age)
attr(bs(dt.triceps$age, df=6),"knots")
##     25%     50%     75% 
##  5.5475 12.2100 24.7275
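
A note on the equivalence above (a standard property of bs(), stated here for clarity): for a cubic B-spline basis without an intercept column, df equals the number of interior knots plus the degree, so 3 knots give df = 6, and conversely df = 6 makes R place 3 knots at the quartiles of age. A quick check (sketch):

# df = length(knots) + degree (3) for bs() without an intercept column
length(attr(bs(dt.triceps$age, df=6), "knots")) + 3
## [1] 6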

B-spline

Using Manually Specified Knots

knot_value_manual_3 = c(10, 20,40)
mod_spline_1 = lm(triceps ~ bs(age, knots =knot_value_manual_3 ),data=dt.triceps)
summary(mod_spline_1)
## 
## Call:
## lm(formula = triceps ~ bs(age, knots = knot_value_manual_3), 
##     data = dt.triceps)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -11.385  -1.682  -0.393   1.165  22.855 
## 
## Coefficients:
##                                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                            8.22533    0.61873  13.294  < 2e-16 ***
## bs(age, knots = knot_value_manual_3)1 -0.06551    1.14972  -0.057    0.955    
## bs(age, knots = knot_value_manual_3)2 -4.30051    0.72301  -5.948 3.90e-09 ***
## bs(age, knots = knot_value_manual_3)3  7.80435    1.17793   6.625 6.00e-11 ***
## bs(age, knots = knot_value_manual_3)4  6.14890    1.27439   4.825 1.65e-06 ***
## bs(age, knots = knot_value_manual_3)5  5.56640    1.42225   3.914 9.78e-05 ***
## bs(age, knots = knot_value_manual_3)6  7.90178    1.54514   5.114 3.87e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.76 on 885 degrees of freedom
## Multiple R-squared:  0.4195, Adjusted R-squared:  0.4155 
## F-statistic: 106.6 on 6 and 885 DF,  p-value: < 2.2e-16

Using Knots Determined Automatically

knot_value_pc_df_6 = attr(bs(dt.triceps$age, df=6),"knots")
mod_spline_1 = lm(triceps ~ bs(age, knots =knot_value_pc_df_6 ),data=dt.triceps)
summary(mod_spline_1)
## 
## Call:
## lm(formula = triceps ~ bs(age, knots = knot_value_pc_df_6), data = dt.triceps)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -11.0056  -1.7556  -0.2944   1.2008  23.0695 
## 
## Coefficients:
##                                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                            6.5925     0.8770   7.517 1.37e-13 ***
## bs(age, knots = knot_value_pc_df_6)1   3.7961     1.5015   2.528  0.01164 *  
## bs(age, knots = knot_value_pc_df_6)2  -2.0749     0.8884  -2.335  0.01974 *  
## bs(age, knots = knot_value_pc_df_6)3   1.5139     1.1645   1.300  0.19391    
## bs(age, knots = knot_value_pc_df_6)4  11.6394     1.3144   8.855  < 2e-16 ***
## bs(age, knots = knot_value_pc_df_6)5   5.9680     1.5602   3.825  0.00014 ***
## bs(age, knots = knot_value_pc_df_6)6   8.9127     1.4053   6.342 3.60e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.757 on 885 degrees of freedom
## Multiple R-squared:  0.4206, Adjusted R-squared:  0.4167 
## F-statistic: 107.1 on 6 and 885 DF,  p-value: < 2.2e-16
data_hasil <- cbind("y_aktual"=dt.triceps$triceps,
                    "y_pred"=mod_spline_1$fitted.values,
                    "residual"=resid(mod_spline_1))
head(data_hasil)
##   y_aktual   y_pred  residual
## 1      3.4 7.333196 -3.933196
## 2      4.0 6.380136 -2.380136
## 3      4.2 6.426062 -2.226062
## 4      4.2 7.052817 -2.852817
## 5      4.4 6.455335 -2.055335
## 6      4.4 7.241677 -2.841677
ggplot(dt.triceps,aes(x=age, y=triceps)) +
                 geom_point(alpha=0.55, color="black") +
  stat_smooth(method = "lm", 
               formula = y~bs(x, knots = knot_value_pc_df_6), 
               lty = 1,se = F)

Natural Spline

knot_value_manual_3 = c(10, 20,40)
mod_spline3ns = lm(triceps ~ ns(age, knots = knot_value_manual_3),data=dt.triceps)
summary(mod_spline3ns)
## 
## Call:
## lm(formula = triceps ~ ns(age, knots = knot_value_manual_3), 
##     data = dt.triceps)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -10.7220  -1.7640  -0.3985   1.1908  22.9684 
## 
## Coefficients:
##                                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                             8.8748     0.3742  23.714  < 2e-16 ***
## ns(age, knots = knot_value_manual_3)1   7.0119     0.6728  10.422  < 2e-16 ***
## ns(age, knots = knot_value_manual_3)2   6.0762     0.8625   7.045 3.72e-12 ***
## ns(age, knots = knot_value_manual_3)3   2.0780     1.0632   1.954    0.051 .  
## ns(age, knots = knot_value_manual_3)4   8.8616     0.9930   8.924  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.762 on 887 degrees of freedom
## Multiple R-squared:  0.4176, Adjusted R-squared:  0.415 
## F-statistic:   159 on 4 and 887 DF,  p-value: < 2.2e-16
ggplot(dt.triceps,aes(x=age, y=triceps)) +
                 geom_point(alpha=0.55, color="black")+
      stat_smooth(method = "lm", 
                 formula = y~ns(x, knots = knot_value_manual_3), 
                 lty = 1,se=F) # ns: natural spline, smoothed to be linear at the left and right ends
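
A natural cubic spline constrains the fit to be linear beyond the boundary knots, which removes two parameters at each end; with the same 3 interior knots it therefore uses 4 basis functions where bs() used 6, matching the 4 slope coefficients in the summary above. A quick check (sketch):

# ns() with K interior knots yields K + 1 basis columns; bs() yields K + 3
ncol(ns(dt.triceps$age, knots = knot_value_manual_3))
## [1] 4
ncol(bs(dt.triceps$age, knots = knot_value_manual_3))
## [1] 6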

Smoothing Spline

The smooth.spline() function can be used to fit a smoothing spline in R.

model_sms <- with(data = triceps,smooth.spline(age,triceps))
model_sms 
## Call:
## smooth.spline(x = age, y = triceps)
## 
## Smoothing Parameter  spar= 0.5162704  lambda= 1.232259e-06 (13 iterations)
## Equivalent Degrees of Freedom (Df): 50.00639
## Penalized Criterion (RSS): 8591.581
## GCV: 13.77835

The smooth.spline function automatically selects the best smoothing parameter lambda using Cross-Validation (CV) or Generalized Cross-Validation (GCV). The main difference between the two is the goodness-of-fit measure used: CV uses the ordinary (leave-one-out) MSE, while GCV uses a weighted MSE. The number of folds in this CV/GCV equals the number of observations, i.e., leave-one-out.
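
For reference (the standard definition, not shown in the original), writing $S_\lambda$ for the smoother matrix, the GCV criterion is

$$\mathrm{GCV}(\lambda) = \frac{1}{n}\sum_{i=1}^{n}\left(\frac{y_i - \hat{f}_\lambda(x_i)}{1 - \operatorname{tr}(S_\lambda)/n}\right)^2,$$

where $\operatorname{tr}(S_\lambda)$ is the equivalent degrees of freedom reported by smooth.spline().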

pred_data <- broom::augment(model_sms)

ggplot(pred_data,aes(x=x,y=y))+
  geom_point(alpha=0.55, color="black")+
  geom_line(aes(y=.fitted),col="blue",
            lty=1)+
  xlab("age")+
  ylab("triceps")+
  theme_bw()

In the figure above, the blue line shows the smoothed fit across the different parts of the data.

The effect of the lambda parameter on the smoothness of the curve can be seen in the following illustration:

model_sms_lambda <- data.frame(lambda=seq(0,5,by=0.5)) %>% 
  group_by(lambda) %>% 
  do(broom::augment(with(data = triceps,smooth.spline(age,triceps,lambda = .$lambda))))

p <- ggplot(model_sms_lambda,
       aes(x=x,y=y))+
  geom_line(aes(y=.fitted),
            col="blue",
            lty=1
            )+
  facet_wrap(~lambda)
p

As lambda gets smaller, the penalty vanishes and the model follows the data pattern more and more closely (a more complex, wigglier fit). As lambda gets larger, the curve is smoothed more heavily and the fit tends toward a straight line.

If we set df=7, the resulting smooth.spline curve represents the overall pattern of the data with a much smoother fit:

model_sms <- with(data = triceps,smooth.spline(age,triceps,df=7))
model_sms 
## Call:
## smooth.spline(x = age, y = triceps, df = 7)
## 
## Smoothing Parameter  spar= 1.049874  lambda= 0.008827582 (11 iterations)
## Equivalent Degrees of Freedom (Df): 7.000747
## Penalized Criterion (RSS): 10112.16
## GCV: 14.20355

Based on the results above, the GCV values differ: 13.77835 for the automatically tuned model versus 14.20355 with df=7. By this criterion the earlier smoothing spline is the better model, since its GCV is smaller.

pred_data <- broom::augment(model_sms)

ggplot(pred_data,aes(x=x,y=y))+
  geom_point(alpha=0.55, color="black")+
  geom_line(aes(y=.fitted),col="blue",
            lty=1)+
  xlab("age")+
  ylab("triceps")+
  theme_bw()

ADDITIONAL MATERIAL

LOESS

Still using the triceps data from the previous illustrations, this time we try a local regression approach with the loess() function.
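
As background (standard loess mechanics, not stated in the original): at each target point $x_0$, loess fits a weighted polynomial (degree 2 by default) to the span fraction of observations nearest to $x_0$, using tricube weights

$$W(u) = \left(1 - |u|^3\right)^3 \ \text{for } |u| < 1, \qquad W(u) = 0 \ \text{otherwise},$$

where $u$ is the distance from $x_0$ scaled by the radius of the neighborhood.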

model_loess <- loess(triceps~age,
                     data = triceps)
summary(model_loess)
## Call:
## loess(formula = triceps ~ age, data = triceps)
## 
## Number of Observations: 892 
## Equivalent Number of Parameters: 4.6 
## Residual Standard Error: 3.777 
## Trace of smoother matrix: 5.01  (exact)
## 
## Control settings:
##   span     :  0.75 
##   degree   :  2 
##   family   :  gaussian
##   surface  :  interpolate      cell = 0.2
##   normalize:  TRUE
##  parametric:  FALSE
## drop.square:  FALSE

The effect of the span parameter on the smoothness of the curve can be seen in the following illustration:

model_loess_span <- data.frame(span=seq(0.1,5,by=0.5)) %>% 
  group_by(span) %>% 
  do(broom::augment(loess(triceps~age,
                     data = triceps,span=.$span)))

p2 <- ggplot(model_loess_span,
       aes(x=age,y=triceps))+
  geom_line(aes(y=.fitted),
            col="blue",
            lty=1
            )+
  facet_wrap(~span)
p2

This approach can also be carried out with the stat_smooth() function from the ggplot2 package.

library(ggplot2)
ggplot(triceps, aes(age,triceps)) +
  geom_point(alpha=0.5,color="black") +
  stat_smooth(method='loess',
               formula=y~x,
              span = 0.75,
              col="blue",
              lty=1,
              se=F)

Tuning the span can be done using cross-validation:

set.seed(123)
cross_val <- vfold_cv(triceps,v=10,strata = "triceps")

span <- seq(0.1,1,length.out=50)

best_loess <- map_dfr(span, function(i){
metric_loess <- map_dfr(cross_val$splits,
    function(x){
mod <- loess(triceps ~ age,span = i,
         data=triceps[x$in_id,])
pred <- predict(mod,
               newdata=triceps[-x$in_id,])
truth <- triceps[-x$in_id,]$triceps

data_eval <- na.omit(data.frame(pred=pred,
                                truth=truth))


rmse <- mlr3measures::rmse(truth = data_eval$truth,
                           response = data_eval$pred
                           )
mae <- mlr3measures::mae(truth = data_eval$truth,
                           response = data_eval$pred
                           )
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)

head(metric_loess,20)

# compute the average over the 10 folds
mean_metric_loess <- colMeans(metric_loess)

mean_metric_loess
}
)

best_loess <- cbind(span=span,best_loess)
# display the results for all span values
best_loess
##         span     rmse      mae
## 1  0.1000000 3.773822 2.495752
## 2  0.1183673 3.771342 2.493506
## 3  0.1367347 3.766450 2.487557
## 4  0.1551020 3.757385 2.480571
## 5  0.1734694 3.749539 2.473494
## 6  0.1918367 3.746288 2.470915
## 7  0.2102041 3.742741 2.467990
## 8  0.2285714 3.740397 2.465388
## 9  0.2469388 3.737724 2.463311
## 10 0.2653061 3.737105 2.462252
## 11 0.2836735 3.735645 2.460514
## 12 0.3020408 3.734522 2.459225
## 13 0.3204082 3.733298 2.457694
## 14 0.3387755 3.732538 2.456280
## 15 0.3571429 3.732082 2.455452
## 16 0.3755102 3.731261 2.454642
## 17 0.3938776 3.730882 2.454293
## 18 0.4122449 3.730585 2.454291
## 19 0.4306122 3.730351 2.454869
## 20 0.4489796 3.730717 2.456222
## 21 0.4673469 3.730831 2.456768
## 22 0.4857143 3.731387 2.458307
## 23 0.5040816 3.732091 2.460189
## 24 0.5224490 3.732392 2.461811
## 25 0.5408163 3.733609 2.464543
## 26 0.5591837 3.734393 2.467180
## 27 0.5775510 3.735554 2.470517
## 28 0.5959184 3.736679 2.473012
## 29 0.6142857 3.738227 2.476519
## 30 0.6326531 3.741401 2.476887
## 31 0.6510204 3.742660 2.479842
## 32 0.6693878 3.744336 2.483581
## 33 0.6877551 3.746651 2.488792
## 34 0.7061224 3.748708 2.492180
## 35 0.7244898 3.750734 2.495218
## 36 0.7428571 3.752815 2.498359
## 37 0.7612245 3.753998 2.504175
## 38 0.7795918 3.755981 2.511421
## 39 0.7979592 3.756794 2.514375
## 40 0.8163265 3.759456 2.522994
## 41 0.8346939 3.767892 2.543250
## 42 0.8530612 3.777168 2.557863
## 43 0.8714286 3.782407 2.566972
## 44 0.8897959 3.788428 2.578042
## 45 0.9081633 3.797002 2.595698
## 46 0.9265306 3.803259 2.609852
## 47 0.9448980 3.809478 2.623386
## 48 0.9632653 3.824192 2.655691
## 49 0.9816327 3.835596 2.679155
## 50 1.0000000 3.860514 2.718911
# best according to rmse
best_loess %>% slice_min(rmse)
##        span     rmse      mae
## 1 0.4306122 3.730351 2.454869
# best according to mae
best_loess %>% slice_min(mae)
##        span     rmse      mae
## 1 0.4122449 3.730585 2.454291
library(ggplot2)
ggplot(triceps, aes(age,triceps)) +
  geom_point(alpha=0.5,color="black") +
  stat_smooth(method='loess',
              formula=y~x,
              span = 0.4306122,
              col="blue",
              lty=1,
              se=F)

library(ggplot2)
ggplot(triceps, aes(age,triceps)) +
  geom_point(alpha=0.5,color="black") +
  stat_smooth(method='loess',
              formula=y~x,
              span = 0.41,
              col="blue",
              lty=1,
              se=F)

EXERCISE

DATA AUTO

The Auto data come from the ISLR package, which contains the datasets used in the book "An Introduction to Statistical Learning with Applications in R". The subset of Auto used in this practicum consists of 392 observations on 3 variables:

  1. mpg : miles per gallon

  2. horsepower : engine horsepower

  3. origin : origin of the car (1 = American, 2 = European, 3 = Japanese)

Data

library(ISLR)
## Warning: package 'ISLR' was built under R version 4.1.3
AutoData = Auto %>% select(mpg, horsepower, origin)
tibble(AutoData)
## # A tibble: 392 x 3
##      mpg horsepower origin
##    <dbl>      <dbl>  <dbl>
##  1    18        130      1
##  2    15        165      1
##  3    18        150      1
##  4    16        150      1
##  5    17        140      1
##  6    15        198      1
##  7    14        220      1
##  8    14        215      1
##  9    14        225      1
## 10    15        190      1
## # ... with 382 more rows

Data Exploration

Correlation

cor(AutoData)
##                   mpg horsepower     origin
## mpg         1.0000000 -0.7784268  0.5652088
## horsepower -0.7784268  1.0000000 -0.4551715
## origin      0.5652088 -0.4551715  1.0000000

Correlation Plot

chart.Correlation(AutoData)

Based on the correlation plot above, the correlation between the response mpg and the predictor horsepower is negative, at -0.78. The correlation between mpg and origin is positive, at 0.57, while the correlation between horsepower and origin is negative, at -0.46.

Linear Regression

# linear regression
mod.lin.auto <- lm(mpg~horsepower, data=AutoData)
summary(mod.lin.auto)
## 
## Call:
## lm(formula = mpg ~ horsepower, data = AutoData)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -13.5710  -3.2592  -0.3435   2.7630  16.9240 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 39.935861   0.717499   55.66   <2e-16 ***
## horsepower  -0.157845   0.006446  -24.49   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.906 on 390 degrees of freedom
## Multiple R-squared:  0.6059, Adjusted R-squared:  0.6049 
## F-statistic: 599.7 on 1 and 390 DF,  p-value: < 2.2e-16
# Plot
plot(AutoData$horsepower,AutoData$mpg, main = "MPG vs Horsepower",ylab="MPG",xlab="Horsepower")
lines(AutoData$horsepower,mod.lin.auto$fitted.values,col="blue")

Polynomial Regression

# polynomial regression
mod.poly.auto <- lm(AutoData$mpg~AutoData$horsepower+I(AutoData$horsepower^2))
summary(mod.poly.auto)
## 
## Call:
## lm(formula = AutoData$mpg ~ AutoData$horsepower + I(AutoData$horsepower^2))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -14.7135  -2.5943  -0.0859   2.2868  15.8961 
## 
## Coefficients:
##                            Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              56.9000997  1.8004268   31.60   <2e-16 ***
## AutoData$horsepower      -0.4661896  0.0311246  -14.98   <2e-16 ***
## I(AutoData$horsepower^2)  0.0012305  0.0001221   10.08   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.374 on 389 degrees of freedom
## Multiple R-squared:  0.6876, Adjusted R-squared:  0.686 
## F-statistic:   428 on 2 and 389 DF,  p-value: < 2.2e-16
# Sorting
ix<-sort(AutoData$horsepower,index.return=T)$ix

# Plot
plot(AutoData$horsepower,AutoData$mpg, main = "MPG vs Horsepower",ylab="MPG",xlab="Horsepower")
lines(AutoData$horsepower[ix],mod.poly.auto$fitted.values[ix],col="blue")
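
In equation form, the fitted quadratic from the summary above is

$$\widehat{\mathrm{mpg}} = 56.90 - 0.4662\,\mathrm{horsepower} + 0.00123\,\mathrm{horsepower}^2.$$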

Step Function Regression

# Formatting
range(AutoData$horsepower)
## [1]  46 230
c1 <- as.factor(ifelse(AutoData$horsepower<=100,1,0))
c2 <- as.factor(ifelse(AutoData$horsepower<=200,1,0))
c3 <- as.factor(ifelse(AutoData$horsepower>200,1,0))
# note: c1 and c2 overlap (horsepower <= 100 implies horsepower <= 200),
# and c2 + c3 always equals one, so one coefficient is dropped as NA
# in the summary below
head(data.frame(AutoData$mpg,c1,c2,c3))
##   AutoData.mpg c1 c2 c3
## 1           18  0  1  0
## 2           15  0  1  0
## 3           18  0  1  0
## 4           16  0  1  0
## 5           17  0  1  0
## 6           15  0  1  0
# step function regression
mod.step.auto <- lm(mpg~c1+c2+c3, data=AutoData)
summary(mod.step.auto)
## 
## Call:
## lm(formula = mpg ~ c1 + c2 + c3, data = AutoData)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -12.5917  -3.7167  -0.5917   3.4083  19.0083 
## 
## Coefficients: (1 not defined because of singularities)
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  12.9000     1.8132   7.114 5.44e-12 ***
## c11          10.5589     0.6089  17.342  < 2e-16 ***
## c21           4.1329     1.8769   2.202   0.0283 *  
## c31               NA         NA      NA       NA    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.734 on 389 degrees of freedom
## Multiple R-squared:  0.4631, Adjusted R-squared:  0.4603 
## F-statistic: 167.7 on 2 and 389 DF,  p-value: < 2.2e-16
# Plot
plot(AutoData$horsepower,AutoData$mpg, main = "MPG vs Horsepower",ylab="MPG",xlab="Horsepower")
lines(AutoData$horsepower[ix],mod.step.auto$fitted.values[ix],col="blue")
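
As an aside (a hypothetical alternative, not part of the original practicum): the same cutpoints can be encoded as a single factor with disjoint bins, mirroring the earlier triceps approach. This avoids the overlap between c1 and c2, leaves no coefficient inestimable, and yields the same fitted step function:

# Sketch (assuming the same cutpoints, 100 and 200): disjoint bins via one factor
hp_bins <- cut(AutoData$horsepower, breaks = c(-Inf, 100, 200, Inf))
mod.step.auto2 <- lm(mpg ~ hp_bins, data = AutoData)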

Model Comparison

MSE Values

# MSE function
MSE = function(pred,actual){
  mean((pred-actual)^2)
}

# comparison of in-sample MSE across the three models
MSEAuto <- rbind(MSE(predict(mod.lin.auto),AutoData$mpg),
              MSE(predict(mod.poly.auto),AutoData$mpg),
              MSE(predict(mod.step.auto),AutoData$mpg))
ModelAuto <- c("Linear Regression","Polynomial Regression","Step Function Regression")
data.frame("Model"=ModelAuto,"MSE"=MSEAuto)
##                      Model      MSE
## 1        Linear Regression 23.94366
## 2    Polynomial Regression 18.98477
## 3 Step Function Regression 32.62641

AIC Values

AICAuto <- data.frame("Model"=c("Linear","Polynomial","Step Function"),
                  "AIC"=c(AIC(mod.lin.auto),AIC(mod.poly.auto),AIC(mod.step.auto)))
AICAuto
##           Model      AIC
## 1        Linear 2363.324
## 2    Polynomial 2274.354
## 3 Step Function 2486.616

Based on the results above, the polynomial regression model has the smallest AIC and MSE. Therefore, by both criteria, it is the best of the three models.

Cross Validation Evaluation

Linear Regression

set.seed(123)
cross_val <- vfold_cv(AutoData,v=5,strata = "mpg")
metric_linear <- map_dfr(cross_val$splits,
    function(x){
    mod <- lm(mpg ~ horsepower,data=AutoData[x$in_id,])
    pred <- predict(mod,newdata=AutoData[-x$in_id,])
    truth <- AutoData[-x$in_id,]$mpg
    rmse <- mlr3measures::rmse(truth = truth,response = pred)
    mae <- mlr3measures::mae(truth = truth,response = pred)
    metric <- c(rmse,mae)
    names(metric) <- c("rmse","mae")
    return(metric)
  }
)
metric_linear
## # A tibble: 5 x 2
##    rmse   mae
##   <dbl> <dbl>
## 1  4.45  3.49
## 2  5.42  4.22
## 3  4.80  3.64
## 4  4.71  3.89
## 5  5.11  3.94
# compute the average over the 5 folds
mean_metric_linear <- colMeans(metric_linear)
mean_metric_linear
##     rmse      mae 
## 4.898592 3.835773

Polynomial Regression, Degree 2 (order=2)

set.seed(123)
cross_val <- vfold_cv(AutoData,v=5,strata = "mpg")
metric_poly2 <- map_dfr(cross_val$splits,
function(x){
  mod <- lm(mpg ~ poly(horsepower,2,raw =   T),data=AutoData[x$in_id,])
  pred <- predict(mod,newdata=AutoData[-x$in_id,])
  truth <- AutoData[-x$in_id,]$mpg
  rmse <- mlr3measures::rmse(truth = truth,response = pred)
  mae <- mlr3measures::mae(truth = truth,response = pred)
  metric <- c(rmse,mae)
  names(metric) <- c("rmse","mae")
  return(metric)
  }
)
metric_poly2
## # A tibble: 5 x 2
##    rmse   mae
##   <dbl> <dbl>
## 1  3.80  2.78
## 2  4.78  3.65
## 3  4.19  3.08
## 4  4.01  3.13
## 5  5.07  3.78
# compute the average over the 5 folds
mean_metric_poly2 <- colMeans(metric_poly2)
mean_metric_poly2
##     rmse      mae 
## 4.370631 3.284748

Polynomial Regression, Degree 3 (order=3)

set.seed(123)
cross_val <- vfold_cv(AutoData,v=5,strata = "mpg")

metric_poly3 <- map_dfr(cross_val$splits,
function(x){
  mod <- lm(mpg ~ poly(horsepower,3,raw = T),data=AutoData[x$in_id,])
  pred <- predict(mod,newdata=AutoData[-x$in_id,])
  truth <- AutoData[-x$in_id,]$mpg
  rmse <- mlr3measures::rmse(truth = truth,response = pred)
  mae <- mlr3measures::mae(truth = truth,response = pred)
  metric <- c(rmse,mae)
  names(metric) <- c("rmse","mae")
  return(metric)
  }
)
metric_poly3
## # A tibble: 5 x 2
##    rmse   mae
##   <dbl> <dbl>
## 1  3.79  2.76
## 2  4.78  3.64
## 3  4.20  3.09
## 4  4.00  3.12
## 5  5.09  3.79
# compute the average over the 5 folds
mean_metric_poly3 <- colMeans(metric_poly3)
mean_metric_poly3
##     rmse      mae 
## 4.372623 3.280150

Step Function Regression

set.seed(123)
cross_val <- vfold_cv(AutoData,v=5,strata = "mpg")
breaks <- 3:10
best_tangga <- map_dfr(breaks, function(i){
    metric_tangga <- map_dfr(cross_val$splits,
    function(x){
        training <- AutoData[x$in_id,]
        training$horsepower <- cut(training$horsepower,i)
        mod <- lm(mpg ~ horsepower,data=training)
        labs_x <- levels(mod$model[,2])
        labs_x_breaks <- cbind(lower = as.numeric( sub("\\((.+),.*", "\\1", labs_x) ),
                  upper = as.numeric( sub("[^,]*,([^]]*)\\]", "\\1", labs_x) ))
        testing <- AutoData[-x$in_id,]
        horsepower_new <- cut(testing$horsepower,c(labs_x_breaks[1,1],labs_x_breaks[,2]))
        pred <- predict(mod,newdata=list(horsepower=horsepower_new))
        truth <- testing$mpg
        data_eval <- na.omit(data.frame(truth,pred))
        rmse <- mlr3measures::rmse(truth = data_eval$truth,response = data_eval$pred)
        mae <- mlr3measures::mae(truth = data_eval$truth,response = data_eval$pred)
        metric <- c(rmse,mae)
        names(metric) <- c("rmse","mae")
        return(metric)
      }
    )
  metric_tangga
  # compute the average over the 5 folds
  mean_metric_tangga <- colMeans(metric_tangga)
  mean_metric_tangga
  }
)

best_tangga <- cbind(breaks=breaks,best_tangga)
# display the results for all breaks
best_tangga
##   breaks     rmse      mae
## 1      3 5.788482 4.522081
## 2      4 5.026946 3.778564
## 3      5 4.737723 3.588164
## 4      6 4.728788 3.573371
## 5      7 4.590146 3.411292
## 6      8 4.492694 3.419407
## 7      9 4.582439 3.494722
## 8     10 4.609092 3.476580

Model Comparison

RMSE and MAE Values

nilai_metricAuto <- rbind(mean_metric_linear,
                      mean_metric_poly2,
                      mean_metric_poly3,
                      best_tangga %>% select(-1) %>% slice_min(mae))

nama_modelAuto <- c("Linear","Polynomial (order=2)","Polynomial (order=3)","Step Function (breaks=7)")
data.frame("Model"=nama_modelAuto,nilai_metricAuto)
##                      Model     rmse      mae
## 1                   Linear 4.898592 3.835773
## 2     Polynomial (order=2) 4.370631 3.284748
## 3     Polynomial (order=3) 4.372623 3.280150
## 4 Step Function (breaks=7) 4.590146 3.411292

Based on RMSE, the order-2 polynomial regression model yields the smallest RMSE of all the models, while based on MAE the order-3 polynomial yields the smallest MAE. The differences between the two polynomial models are very small, so either is a reasonable choice.