Load Libraries

library(ElemStatLearn) # provides the prostate dataset
library(corrplot)
library(ISLR)
library(knitr)
library(splines)
library(rsample)
library(cowplot)
library(caret) # cross-validation
library(gridExtra) # combining graphs
library(ggplot2)
library(dplyr)
library(purrr)
library(DT)
library(kableExtra)
library(gam) # generalized additive models
library(randomForest)

Load Data

data <- prostate
head(data)
##       lcavol  lweight age      lbph svi       lcp gleason pgg45       lpsa
## 1 -0.5798185 2.769459  50 -1.386294   0 -1.386294       6     0 -0.4307829
## 2 -0.9942523 3.319626  58 -1.386294   0 -1.386294       6     0 -0.1625189
## 3 -0.5108256 2.691243  74 -1.386294   0 -1.386294       7    20 -0.1625189
## 4 -1.2039728 3.282789  58 -1.386294   0 -1.386294       6     0 -0.1625189
## 5  0.7514161 3.432373  62 -1.386294   0 -1.386294       6     0  0.3715636
## 6 -1.0498221 3.228826  50 -1.386294   0 -1.386294       6     0  0.7654678
##   train
## 1  TRUE
## 2  TRUE
## 3  TRUE
## 4  TRUE
## 5  TRUE
## 6  TRUE

Display the first five rows of the data

head(data,5)
##       lcavol  lweight age      lbph svi       lcp gleason pgg45       lpsa
## 1 -0.5798185 2.769459  50 -1.386294   0 -1.386294       6     0 -0.4307829
## 2 -0.9942523 3.319626  58 -1.386294   0 -1.386294       6     0 -0.1625189
## 3 -0.5108256 2.691243  74 -1.386294   0 -1.386294       7    20 -0.1625189
## 4 -1.2039728 3.282789  58 -1.386294   0 -1.386294       6     0 -0.1625189
## 5  0.7514161 3.432373  62 -1.386294   0 -1.386294       6     0  0.3715636
##   train
## 1  TRUE
## 2  TRUE
## 3  TRUE
## 4  TRUE
## 5  TRUE
str(data)
## 'data.frame':    97 obs. of  10 variables:
##  $ lcavol : num  -0.58 -0.994 -0.511 -1.204 0.751 ...
##  $ lweight: num  2.77 3.32 2.69 3.28 3.43 ...
##  $ age    : int  50 58 74 58 62 50 64 58 47 63 ...
##  $ lbph   : num  -1.39 -1.39 -1.39 -1.39 -1.39 ...
##  $ svi    : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ lcp    : num  -1.39 -1.39 -1.39 -1.39 -1.39 ...
##  $ gleason: int  6 6 7 6 6 6 6 6 6 6 ...
##  $ pgg45  : int  0 0 20 0 0 0 0 0 0 0 ...
##  $ lpsa   : num  -0.431 -0.163 -0.163 -0.163 0.372 ...
##  $ train  : logi  TRUE TRUE TRUE TRUE TRUE TRUE ...
print(summary(data))
##      lcavol           lweight           age             lbph        
##  Min.   :-1.3471   Min.   :2.375   Min.   :41.00   Min.   :-1.3863  
##  1st Qu.: 0.5128   1st Qu.:3.376   1st Qu.:60.00   1st Qu.:-1.3863  
##  Median : 1.4469   Median :3.623   Median :65.00   Median : 0.3001  
##  Mean   : 1.3500   Mean   :3.629   Mean   :63.87   Mean   : 0.1004  
##  3rd Qu.: 2.1270   3rd Qu.:3.876   3rd Qu.:68.00   3rd Qu.: 1.5581  
##  Max.   : 3.8210   Max.   :4.780   Max.   :79.00   Max.   : 2.3263  
##       svi              lcp             gleason          pgg45       
##  Min.   :0.0000   Min.   :-1.3863   Min.   :6.000   Min.   :  0.00  
##  1st Qu.:0.0000   1st Qu.:-1.3863   1st Qu.:6.000   1st Qu.:  0.00  
##  Median :0.0000   Median :-0.7985   Median :7.000   Median : 15.00  
##  Mean   :0.2165   Mean   :-0.1794   Mean   :6.753   Mean   : 24.38  
##  3rd Qu.:0.0000   3rd Qu.: 1.1787   3rd Qu.:7.000   3rd Qu.: 40.00  
##  Max.   :1.0000   Max.   : 2.9042   Max.   :9.000   Max.   :100.00  
##       lpsa           train        
##  Min.   :-0.4308   Mode :logical  
##  1st Qu.: 1.7317   FALSE:30       
##  Median : 2.5915   TRUE :67       
##  Mean   : 2.4784                  
##  3rd Qu.: 3.0564                  
##  Max.   : 5.5829

Check whether there are any missing values

sum(is.na(data))
## [1] 0
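
As an extra check (not in the original output), per-column NA counts confirm that no individual variable has missing entries:

colSums(is.na(data)) # per-variable NA counts; all zero here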

Correlation plot of the predictor variables

prostate.corr <- cor(data)
corrplot.mixed(prostate.corr)

cor(data)
##              lcavol      lweight       age         lbph         svi
## lcavol   1.00000000  0.280521386 0.2249999  0.027349703  0.53884500
## lweight  0.28052139  1.000000000 0.3479691  0.442264395  0.15538491
## age      0.22499988  0.347969112 1.0000000  0.350185896  0.11765804
## lbph     0.02734970  0.442264395 0.3501859  1.000000000 -0.08584324
## svi      0.53884500  0.155384906 0.1176580 -0.085843238  1.00000000
## lcp      0.67531048  0.164537146 0.1276678 -0.006999431  0.67311118
## gleason  0.43241706  0.056882099 0.2688916  0.077820447  0.32041222
## pgg45    0.43365225  0.107353790 0.2761124  0.078460018  0.45764762
## lpsa     0.73446033  0.433319385 0.1695928  0.179809404  0.56621822
## train   -0.04654347 -0.009940651 0.1776155 -0.029939957  0.02679950
##                  lcp     gleason      pgg45        lpsa        train
## lcavol   0.675310484  0.43241706 0.43365225  0.73446033 -0.046543468
## lweight  0.164537146  0.05688210 0.10735379  0.43331939 -0.009940651
## age      0.127667752  0.26889160 0.27611245  0.16959284  0.177615517
## lbph    -0.006999431  0.07782045 0.07846002  0.17980940 -0.029939957
## svi      0.673111185  0.32041222 0.45764762  0.56621822  0.026799505
## lcp      1.000000000  0.51483006 0.63152825  0.54881317 -0.037427296
## gleason  0.514830063  1.00000000 0.75190451  0.36898681 -0.044171456
## pgg45    0.631528246  0.75190451 1.00000000  0.42231586  0.100516371
## lpsa     0.548813175  0.36898681 0.42231586  1.00000000 -0.033889743
## train   -0.037427296 -0.04417146 0.10051637 -0.03388974  1.000000000
plot(data$age, data$lpsa,pch=19, col="#0000ff",
     xlab="age", ylab="lpsa")

Cross-validation to select the optimal polynomial regression model of lpsa against age.

set.seed(123)

# A single CV routine parameterized by polynomial degree replaces the four
# near-identical cv.poly1..cv.poly4 functions; behavior is unchanged.
cv.poly <- function(dataset, degree){
  set.seed(123)
  cross_val <- vfold_cv(dataset, v = 5, strata = "lpsa")
  metric_poly <- map_dfr(cross_val$splits,
    function(x){
      mod <- lm(lpsa ~ poly(age, degree, raw = TRUE),
                data = dataset[x$in_id, ])
      pred <- predict(mod, newdata = dataset[-x$in_id, ])
      truth <- dataset[-x$in_id, ]$lpsa
      rmse <- mlr3measures::rmse(truth = truth, response = pred)
      mae <- mlr3measures::mae(truth = truth, response = pred)
      c(rmse = rmse, mae = mae)
    }
  )
  colMeans(metric_poly) # average RMSE and MAE across the 5 folds
}
cvpoly <- rbind(cv.poly(data, 1), cv.poly(data, 2),
                cv.poly(data, 3), cv.poly(data, 4))
cvpoly
##          rmse       mae
## [1,] 1.165938 0.9076820
## [2,] 1.163891 0.9034202
## [3,] 1.184469 0.9099577
## [4,] 1.233204 0.9416675
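
Degree 2 attains the lowest cross-validated RMSE and MAE. A quick programmatic check (a small addition, using the cvpoly matrix built above):

which.min(cvpoly[, "rmse"]) # index of the best degree by CV RMSE (2)
which.min(cvpoly[, "mae"])  # index of the best degree by CV MAE (2)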
# One plotting helper for all four degrees
poly_plot <- function(degree){
  ggplot(data, aes(x = age, y = lpsa)) +
    geom_point(alpha = 0.55, color = "black") +
    stat_smooth(method = "lm",
                formula = y ~ poly(x, degree, raw = TRUE),
                lty = 1, col = "blue", se = FALSE) +
    theme_bw()
}
poly1 <- poly_plot(1)
poly2 <- poly_plot(2)
poly3 <- poly_plot(3)
poly4 <- poly_plot(4)

plot_grid(poly1, poly2, poly3, poly4, labels=c("RegLin","poly2", "poly3", "poly4"), label_size = 6)

Natural Spline

Determining the number of basis functions to use

set.seed(123)
cross.val <- vfold_cv(data,v=5,strata = "lpsa")
df <- 1:12
best.spline3 <- map_dfr(df, function(i){
  metric.spline3 <- map_dfr(cross.val$splits,
    function(x){
      mod <- lm(lpsa ~ ns(age, df = i), data = data[x$in_id, ])
      pred <- predict(mod, newdata = data[-x$in_id, ])
      truth <- data[-x$in_id, ]$lpsa
      rmse <- mlr3measures::rmse(truth = truth, response = pred)
      mae <- mlr3measures::mae(truth = truth, response = pred)
      c(rmse = rmse, mae = mae)
    }
  )
  colMeans(metric.spline3) # average across the 5 folds
})

best.spline3 <- cbind(df=df,best.spline3)
basis <- data.frame(rbind(best.spline3 %>% slice_min(rmse), # by RMSE
                 best.spline3 %>% slice_min(mae))) # by MAE
basis
##   df     rmse       mae
## 1  3 1.164951 0.9015556
## 2  3 1.164951 0.9015556

Both criteria select 3 basis functions (df = 3). Next we determine the knot locations implied by this 3-df basis.

df_3 <- attr(ns(data$age, df=3),"knots")
df_3
## 33.33333% 66.66667% 
##        62        68
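
The quantile labels above show how ns() places its knots: with df = 3 the two interior knots sit at the 33.3% and 66.7% quantiles of the predictor. A quick sanity check (not in the original):

quantile(data$age, probs = c(1/3, 2/3)) # should reproduce the knots at 62 and 68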

Cross Validation

set.seed(123)

cross.val <- vfold_cv(data,v=5,strata = "lpsa")

metric.spline3.ns2 <- map_dfr(cross.val$splits,
  function(x){
    mod <- lm(lpsa ~ ns(age, knots = c(62, 68)),
              data = data[x$in_id, ])
    pred <- predict(mod, newdata = data[-x$in_id, ])
    truth <- data[-x$in_id, ]$lpsa
    rmse <- mlr3measures::rmse(truth = truth, response = pred)
    mae <- mlr3measures::mae(truth = truth, response = pred)
    c(rmse = rmse, mae = mae)
  }
)
mean.metric.spline3.ns2 <- colMeans(metric.spline3.ns2) # average across the 5 folds

nama.model.ncs <- "2 - NCS Knots 62, 68 (df=3)"

model.ncs<-data.frame(Model = nama.model.ncs, mean.metric.spline3.ns2)

model.ncs
##                            Model mean.metric.spline3.ns2
## rmse 2 - NCS Knots 62, 68 (df=3)               1.1654699
## mae  2 - NCS Knots 62, 68 (df=3)               0.9027996
nc2 <- ggplot(data, aes(x = age, y = lpsa)) + 
  geom_point(alpha = 0.55, color="black") + 
  stat_smooth(method = "lm", formula = y~ns(x, knots = c(62, 68)), col = "blue", se = F) +
  theme_bw()
plot_grid(nc2, labels = c("nc2"), label_size = 10)


Generalized Additive Model

library(gam)
library(tidyverse)
library(mlr3verse)
library(mlr3measures)
library(ggpubr)
library(splines)
# keep numeric columns only (drops the logical train flag), then remove the
# discrete predictors svi and gleason
data <- data %>% 
  select_if(is.numeric) %>% 
  select(-svi, -gleason) 
head(data)
##       lcavol  lweight age      lbph       lcp pgg45       lpsa
## 1 -0.5798185 2.769459  50 -1.386294 -1.386294     0 -0.4307829
## 2 -0.9942523 3.319626  58 -1.386294 -1.386294     0 -0.1625189
## 3 -0.5108256 2.691243  74 -1.386294 -1.386294    20 -0.1625189
## 4 -1.2039728 3.282789  58 -1.386294 -1.386294     0 -0.1625189
## 5  0.7514161 3.432373  62 -1.386294 -1.386294     0  0.3715636
## 6 -1.0498221 3.228826  50 -1.386294 -1.386294     0  0.7654678
formula_spline1 <- str_c("s(",
            names(data %>% select(-lpsa)),
            ",df=",6,")",
            collapse = "+")
formula_spline1 <- as.formula(str_c("lpsa~",formula_spline1))
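# At this point formula_spline1 should expand to one 6-df smoothing-spline
# term per predictor, i.e.:
# lpsa ~ s(lcavol,df=6) + s(lweight,df=6) + s(age,df=6) +
#        s(lbph,df=6) + s(lcp,df=6) + s(pgg45,df=6)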
mod_spline1 <- gam(formula_spline1,data=data,family = "gaussian")
summary(mod_spline1)
## 
## Call: gam(formula = formula_spline1, family = "gaussian", data = data)
## Deviance Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.20659 -0.27978 -0.02599  0.30743  1.21821 
## 
## (Dispersion Parameter for gaussian family taken to be 0.4454)
## 
##     Null Deviance: 127.9177 on 96 degrees of freedom
## Residual Deviance: 26.7268 on 60.0006 degrees of freedom
## AIC: 226.2356 
## 
## Number of Local Scoring Iterations: NA 
## 
## Anova for Parametric Effects
##                        Df Sum Sq Mean Sq  F value    Pr(>F)    
## s(lcavol, df = 6)   1.000 73.956  73.956 166.0283 < 2.2e-16 ***
## s(lweight, df = 6)  1.000  9.572   9.572  21.4890 1.969e-05 ***
## s(age, df = 6)      1.000  1.151   1.151   2.5830   0.11326    
## s(lbph, df = 6)     1.000  1.332   1.332   2.9895   0.08895 .  
## s(lcp, df = 6)      1.000  0.530   0.530   1.1905   0.27959    
## s(pgg45, df = 6)    1.000  2.076   2.076   4.6606   0.03487 *  
## Residuals          60.001 26.727   0.445                       
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Anova for Nonparametric Effects
##                    Npar Df Npar F   Pr(F)  
## (Intercept)                                
## s(lcavol, df = 6)        5 1.2698 0.28879  
## s(lweight, df = 6)       5 1.8641 0.11413  
## s(age, df = 6)           5 1.1354 0.35168  
## s(lbph, df = 6)          5 1.2182 0.31172  
## s(lcp, df = 6)           5 2.0645 0.08241 .
## s(pgg45, df = 6)         5 1.9241 0.10357  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
formula_loess <- str_c("lo(",
            names(data %>% select(-lpsa)),
            ")",
            collapse = "+")
formula_loess <- as.formula(str_c("lpsa~",formula_loess))
mod_loess <- gam(formula_loess,data=data,family = "gaussian")
summary(mod_loess)
## 
## Call: gam(formula = formula_loess, family = "gaussian", data = data)
## Deviance Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.21683 -0.31661 -0.05531  0.33127  1.58643 
## 
## (Dispersion Parameter for gaussian family taken to be 0.4708)
## 
##     Null Deviance: 127.9177 on 96 degrees of freedom
## Residual Deviance: 33.7462 on 71.6827 degrees of freedom
## AIC: 225.4919 
## 
## Number of Local Scoring Iterations: NA 
## 
## Anova for Parametric Effects
##                 Df Sum Sq Mean Sq  F value    Pr(>F)    
## lo(lcavol)   1.000 70.139  70.139 148.9879 < 2.2e-16 ***
## lo(lweight)  1.000  8.634   8.634  18.3390 5.645e-05 ***
## lo(age)      1.000  0.967   0.967   2.0541   0.15614    
## lo(lbph)     1.000  1.058   1.058   2.2468   0.13828    
## lo(lcp)      1.000  0.294   0.294   0.6254   0.43166    
## lo(pgg45)    1.000  2.230   2.230   4.7374   0.03281 *  
## Residuals   71.683 33.746   0.471                       
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Anova for Nonparametric Effects
##             Npar Df  Npar F   Pr(F)  
## (Intercept)                          
## lo(lcavol)      3.0 1.50321 0.22110  
## lo(lweight)     3.5 1.78736 0.14804  
## lo(age)         2.8 1.02817 0.38235  
## lo(lbph)        3.6 0.99659 0.40935  
## lo(lcp)         2.8 2.49655 0.07014 .
## lo(pgg45)       2.5 1.91416 0.14385  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
mod_linear <- glm(lpsa~.,data=data,family = "gaussian")
summary(mod_linear)
## 
## Call:
## glm(formula = lpsa ~ ., family = "gaussian", data = data)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -1.45705  -0.39398  -0.02834   0.42085   1.79805  
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.356988   0.914975   0.390  0.69734    
## lcavol       0.604602   0.089272   6.773 1.26e-09 ***
## lweight      0.668865   0.207327   3.226  0.00175 ** 
## age         -0.019940   0.011508  -1.733  0.08657 .  
## lbph         0.069019   0.059651   1.157  0.25032    
## lcp          0.012591   0.085185   0.148  0.88283    
## pgg45        0.006018   0.003542   1.699  0.09277 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 0.5326767)
## 
##     Null deviance: 127.918  on 96  degrees of freedom
## Residual deviance:  47.941  on 90  degrees of freedom
## AIC: 222.91
## 
## Number of Fisher Scoring iterations: 2
task_prostate <- TaskRegr$new(id = "age",backend = data,target = "lpsa")
rcv = rsmp("cv",folds=10)
set.seed(123)
index_rcv <- rcv$instantiate(task_prostate)$instance
head(index_rcv)
##    row_id fold
## 1:      1    1
## 2:      3    1
## 3:     27    1
## 4:     47    1
## 5:     49    1
## 6:     51    1
gam_all <- function(train, test, mape = FALSE, response){
  # GAM with smoothing splines
  formula_spline1 <- str_c("s(",
              names(data %>% select(-lpsa)),
              ",df=", 6, ")",
              collapse = "+")
  formula_spline1 <- as.formula(str_c("lpsa~", formula_spline1))
  mod_spline1 <- gam(formula_spline1, data = train, family = "gaussian")
  pred_spline1 <- predict(mod_spline1, newdata = test)
  rmse_spline1 <- mlr3measures::rmse(response = pred_spline1,
                                     truth = test %>% pull(response))
  mape_spline1 <- mlr3measures::mape(response = pred_spline1,
                                     truth = test %>% pull(response))

  # GAM with LOESS terms
  formula_loess <- str_c("lo(",
              names(data %>% select(-lpsa)),
              ")",
              collapse = "+")
  formula_loess <- as.formula(str_c("lpsa~", formula_loess))
  mod_loess <- gam(formula_loess, data = train, family = "gaussian")
  pred_loess <- predict(mod_loess, newdata = test)
  rmse_loess <- mlr3measures::rmse(response = pred_loess,
                                   truth = test %>% pull(response))
  mape_loess <- mlr3measures::mape(response = pred_loess,
                                   truth = test %>% pull(response))

  # Linear regression (fit on the training fold, like the other two models;
  # the original fit this on the full data, which leaks the test fold)
  mod_linear <- glm(lpsa ~ ., data = train, family = "gaussian")
  pred_linear <- predict(mod_linear, newdata = test)
  rmse_linear <- mlr3measures::rmse(response = pred_linear,
                                    truth = test %>% pull(response))
  mape_linear <- mlr3measures::mape(response = pred_linear,
                                    truth = test %>% pull(response))

  mape_all <- c(mape_linear, mape_loess, mape_spline1)
  names(mape_all) <- c("Linear", "LOESS", "Spline")
  rmse_all <- c(rmse_linear, rmse_loess, rmse_spline1)
  names(rmse_all) <- c("Linear", "LOESS", "Spline")
  if(mape){
    return(mape_all)
  } else {
    return(rmse_all)
  }
}
score_all <- map_df(1:10, function(i){
  index_train <- index_rcv %>% 
    filter(fold != i) %>%
    pull(row_id)
  index_test <- index_rcv %>% 
    filter(fold == i) %>%
    pull(row_id)
  gam_all(response = "lpsa",
          train = data[index_train, ],
          test = data[index_test, ],
          mape = TRUE)
})
## (Warnings repeated across folds, abridged: "lo.wam convergence not obtained
## in 30 iterations", plus gam.lo notes that several lo() terms were evaluated
## outside the range seen in the training fold: "extrapolation not allowed
## with blending".)
colMeans(score_all,na.rm = TRUE)
##    Linear     LOESS    Spline 
## 0.4235063 0.5201586 0.5389620
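
For a fold-by-fold view of the same comparison, the per-fold MAPEs collected in score_all can be summarized with a boxplot (a small sketch, not part of the original output):

boxplot(as.data.frame(score_all), ylab = "MAPE",
        main = "10-fold CV MAPE by model") # one box per model across the 10 folds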

Modeling the Bone Mineral Density data with a Regression Tree, Bagging, Random Forest, and Gradient Boosting

library(rsample)     # data splitting 
library(dplyr)       # data wrangling
library(rpart)       # performing regression trees
library(rpart.plot)  # plotting regression trees
library(ipred)       # bagging
library(caret)       # bagging
Bone_raw <- read.csv("bone.csv", stringsAsFactors = FALSE)
summary(Bone_raw)
##      idnum          ethnic               age            sex           
##  Min.   :  1.0   Length:1003        Min.   : 8.80   Length:1003       
##  1st Qu.: 84.5   Class :character   1st Qu.:12.80   Class :character  
##  Median :181.0   Mode  :character   Median :15.70   Mode  :character  
##  Mean   :189.9                      Mean   :16.32                     
##  3rd Qu.:287.5                      3rd Qu.:19.45                     
##  Max.   :429.0                      Max.   :26.20                     
##      spnbmd      
##  Min.   :0.5360  
##  1st Qu.:0.7995  
##  Median :0.9650  
##  Mean   :0.9476  
##  3rd Qu.:1.0705  
##  Max.   :1.4430
str(Bone_raw)
## 'data.frame':    1003 obs. of  5 variables:
##  $ idnum : int  1 1 1 1 2 2 2 2 3 3 ...
##  $ ethnic: chr  "White" "White" "White" "White" ...
##  $ age   : num  11.2 12.2 13.2 14.3 12.7 13.8 14.8 15.8 10.9 11.9 ...
##  $ sex   : chr  "mal" "mal" "mal" "mal" ...
##  $ spnbmd: num  0.719 0.732 0.776 0.781 0.62 0.627 0.759 0.79 0.641 0.622 ...
head(Bone_raw)
##   idnum ethnic  age sex spnbmd
## 1     1  White 11.2 mal  0.719
## 2     1  White 12.2 mal  0.732
## 3     1  White 13.2 mal  0.776
## 4     1  White 14.3 mal  0.781
## 5     2  White 12.7 mal  0.620
## 6     2  White 13.8 mal  0.627
Bone <- subset(Bone_raw,select=-c(idnum))
head(Bone)
##   ethnic  age sex spnbmd
## 1  White 11.2 mal  0.719
## 2  White 12.2 mal  0.732
## 3  White 13.2 mal  0.776
## 4  White 14.3 mal  0.781
## 5  White 12.7 mal  0.620
## 6  White 13.8 mal  0.627
summary(Bone)
##     ethnic               age            sex                spnbmd      
##  Length:1003        Min.   : 8.80   Length:1003        Min.   :0.5360  
##  Class :character   1st Qu.:12.80   Class :character   1st Qu.:0.7995  
##  Mode  :character   Median :15.70   Mode  :character   Median :0.9650  
##                     Mean   :16.32                      Mean   :0.9476  
##                     3rd Qu.:19.45                      3rd Qu.:1.0705  
##                     Max.   :26.20                      Max.   :1.4430
str(Bone)
## 'data.frame':    1003 obs. of  4 variables:
##  $ ethnic: chr  "White" "White" "White" "White" ...
##  $ age   : num  11.2 12.2 13.2 14.3 12.7 13.8 14.8 15.8 10.9 11.9 ...
##  $ sex   : chr  "mal" "mal" "mal" "mal" ...
##  $ spnbmd: num  0.719 0.732 0.776 0.781 0.62 0.627 0.759 0.79 0.641 0.622 ...
# Split data
set.seed(123)
bone_split <- createDataPartition(Bone$spnbmd, p = .7, list = FALSE)
bone_train <- Bone[bone_split, ]
bone_test  <- Bone[-bone_split, ]

Regression Tree

m1 <- rpart(
  formula = spnbmd ~ ., 
  data    = bone_train, 
  method  = "anova" 
  )
rpart.plot(m1) 

plotcp(m1)

m1$cptable
##           CP nsplit rel error    xerror       xstd
## 1 0.43542953      0 1.0000000 1.0005783 0.04483880
## 2 0.03755374      1 0.5645705 0.6096417 0.03699330
## 3 0.03317541      2 0.5270167 0.5618151 0.03299311
## 4 0.02730405      3 0.4938413 0.5319004 0.03017752
## 5 0.02078144      4 0.4665373 0.5246121 0.02972519
## 6 0.01912976      5 0.4457558 0.5126399 0.02884200
## 7 0.01000000      6 0.4266261 0.4843469 0.02743718
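
The standard pruning rule keeps the cp value whose cross-validated error (xerror) is smallest; reading it off the cptable programmatically (a small sketch using the table above):

best_cp <- m1$cptable[which.min(m1$cptable[, "xerror"]), "CP"]
best_cp # 0.01 for the table shown above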
hyper_grid <- expand.grid(
  minsplit = seq(1, 20, 1),  
  maxdepth = seq(1, 15, 1) 
)

models <- vector(mode = "list", nrow(hyper_grid))

for (i in 1:nrow(hyper_grid)){
  minsplit <- hyper_grid$minsplit[i]
  maxdepth <- hyper_grid$maxdepth[i]
  models[[i]] <- rpart(
    formula = spnbmd ~ .,
    data    = bone_train,
    method  = "anova",
    control = list(minsplit = minsplit, maxdepth = maxdepth)
    )
}
get_cp <- function(x) {
  x$cptable[which.min(x$cptable[, "xerror"]), "CP"]
}

get_min_error <- function(x) {
  x$cptable[which.min(x$cptable[, "xerror"]), "xerror"]
}

hyper_grid %>%
  mutate(
    cp    = purrr::map_dbl(models, get_cp),
    error = purrr::map_dbl(models, get_min_error)
    ) %>%
  arrange(error) %>%
  top_n(-5, wt = error)
##   minsplit maxdepth   cp     error
## 1        6        9 0.01 0.4492643
## 2        5        3 0.01 0.4512283
## 3       19       12 0.01 0.4544440
## 4        8       14 0.01 0.4547218
## 5        4        9 0.01 0.4552722
optimal_tree <- rpart(
    formula = spnbmd ~ .,
    data    = bone_train,
    method  = "anova",
    control = list(minsplit = 15, maxdepth = 10, cp = 0.01)
    )

pred <- predict(optimal_tree, newdata = bone_test)
RMSE(pred = pred, obs = bone_test$spnbmd)
## [1] 0.132335
MAE(pred = pred, obs = bone_test$spnbmd)
## [1] 0.1018731

Bagging

set.seed(123)
bagged_m1 <- bagging(
  formula = spnbmd ~ .,
  data    = bone_train,
  coob    = TRUE
)

bagged_m1
## 
## Bagging regression trees with 25 bootstrap replications 
## 
## Call: bagging.data.frame(formula = spnbmd ~ ., data = bone_train, coob = TRUE)
## 
## Out-of-bag estimate of root mean squared error:  0.1234
ntree <- 10:50

rmse <- vector(mode = "numeric", length = length(ntree))

for (i in seq_along(ntree)) {
  set.seed(123)
  model <- bagging(
  formula = spnbmd ~ .,
  data    = bone_train,
  coob    = TRUE,
  nbagg   = ntree[i]
)
  rmse[i] <- model$err
}

plot(ntree, rmse, type = 'l', lwd = 2)
abline(v = 25, col = "red", lty = "dashed")
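
The nbagg value with the lowest out-of-bag RMSE in this search can be read off directly (using the ntree and rmse vectors from the loop above):

ntree[which.min(rmse)] # number of bootstrap replications with the smallest OOB RMSE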

# 10-fold cross-validation
ctrl <- trainControl(method = "cv",  number = 10) 

# CV bagged model
bagged_cv <- train(
  spnbmd ~ .,
  data = bone_train,
  method = "treebag",
  trControl = ctrl,
  importance = TRUE
  )

# assess results
bagged_cv
## Bagged CART 
## 
## 703 samples
##   3 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 633, 632, 632, 634, 634, 631, ... 
## Resampling results:
## 
##   RMSE       Rsquared   MAE      
##   0.1226359  0.5733349  0.0970862
plot(varImp(bagged_cv), top = 2) # plot the two most important predictors

pred <- predict(bagged_cv, bone_test)
RMSE(pred, bone_test$spnbmd)
## [1] 0.1235616
MAE(pred, bone_test$spnbmd)
## [1] 0.09727683

Random Forest

library(randomForest)
mod_forest<-train(spnbmd~., data=bone_train, method="rf", prox=TRUE)
mod_forest
## Random Forest 
## 
## 703 samples
##   3 predictor
## 
## No pre-processing
## Resampling: Bootstrapped (25 reps) 
## Summary of sample sizes: 703, 703, 703, 703, 703, 703, ... 
## Resampling results across tuning parameters:
## 
##   mtry  RMSE       Rsquared   MAE       
##   2     0.1242096  0.5722359  0.09798122
##   3     0.1258679  0.5492926  0.09867422
##   5     0.1387192  0.4803627  0.10906588
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was mtry = 2.
pred <- predict(mod_forest, bone_test)
RMSE(pred, bone_test$spnbmd)
## [1] 0.1209239
MAE(pred, bone_test$spnbmd)
## [1] 0.09754176

Gradient Boosting

library(mlbench)
fitControl <- trainControl(method = "repeatedcv", number = 10, repeats = 10) # 10-fold CV repeated 10 times
set.seed(1234)
gbmFit1 <- train(spnbmd~., data=bone_train, method="gbm", trControl = fitControl, verbose=F)
gbmFit1$results[which.max(gbmFit1$results[, 4]), ] # best result
##   shrinkage interaction.depth n.minobsinnode n.trees     RMSE Rsquared
## 3       0.1                 1             10     150 0.122378 0.575094
##         MAE     RMSESD RsquaredSD      MAESD
## 3 0.0967561 0.01048381 0.06016369 0.00869704
summary(gbmFit1)

##                           var    rel.inf
## age                       age 82.5082846
## sexmal                 sexmal 10.5073778
## ethnicBlack       ethnicBlack  5.4069137
## ethnicHispanic ethnicHispanic  0.8476221
## ethnicWhite       ethnicWhite  0.7298017
pred <- predict(gbmFit1, bone_test)
RMSE(pred, bone_test$spnbmd)
## [1] 0.1201753
MAE(pred, bone_test$spnbmd)
## [1] 0.09563929
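
Collecting the test-set metrics reported above gives a compact comparison of the four tree-based models (values copied from the outputs shown earlier):

comparison <- data.frame(
  Model = c("Regression Tree", "Bagging", "Random Forest", "Gradient Boosting"),
  RMSE  = c(0.1323, 0.1236, 0.1209, 0.1202),
  MAE   = c(0.1019, 0.0973, 0.0975, 0.0956)
)
comparison # gradient boosting gives the lowest test RMSE and MAE here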