Initial Libraries
library(ElemStatLearn) # prostate data (package archived on CRAN; install from the archive if needed)
library(corrplot)
## corrplot 0.92 loaded
library(ISLR)
library(knitr)
library(splines)
library(rsample)
library(cowplot)
library(caret) # cross-validation
## Loading required package: ggplot2
## Loading required package: lattice
library(gridExtra) # combining graphs
library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:gridExtra':
##
## combine
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(purrr)
##
## Attaching package: 'purrr'
## The following object is masked from 'package:caret':
##
## lift
library(DT)
library(kableExtra)
##
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
##
## group_rows
library(gam) # generalized additive models
## Loading required package: foreach
##
## Attaching package: 'foreach'
## The following objects are masked from 'package:purrr':
##
## accumulate, when
## Loaded gam 1.22-1
library(randomForest)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:gridExtra':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
Load Data
data <- prostate
head(data)
## lcavol lweight age lbph svi lcp gleason pgg45 lpsa
## 1 -0.5798185 2.769459 50 -1.386294 0 -1.386294 6 0 -0.4307829
## 2 -0.9942523 3.319626 58 -1.386294 0 -1.386294 6 0 -0.1625189
## 3 -0.5108256 2.691243 74 -1.386294 0 -1.386294 7 20 -0.1625189
## 4 -1.2039728 3.282789 58 -1.386294 0 -1.386294 6 0 -0.1625189
## 5 0.7514161 3.432373 62 -1.386294 0 -1.386294 6 0 0.3715636
## 6 -1.0498221 3.228826 50 -1.386294 0 -1.386294 6 0 0.7654678
## train
## 1 TRUE
## 2 TRUE
## 3 TRUE
## 4 TRUE
## 5 TRUE
## 6 TRUE
Display the first five rows
head(data,5)
## lcavol lweight age lbph svi lcp gleason pgg45 lpsa
## 1 -0.5798185 2.769459 50 -1.386294 0 -1.386294 6 0 -0.4307829
## 2 -0.9942523 3.319626 58 -1.386294 0 -1.386294 6 0 -0.1625189
## 3 -0.5108256 2.691243 74 -1.386294 0 -1.386294 7 20 -0.1625189
## 4 -1.2039728 3.282789 58 -1.386294 0 -1.386294 6 0 -0.1625189
## 5 0.7514161 3.432373 62 -1.386294 0 -1.386294 6 0 0.3715636
## train
## 1 TRUE
## 2 TRUE
## 3 TRUE
## 4 TRUE
## 5 TRUE
str(data)
## 'data.frame': 97 obs. of 10 variables:
## $ lcavol : num -0.58 -0.994 -0.511 -1.204 0.751 ...
## $ lweight: num 2.77 3.32 2.69 3.28 3.43 ...
## $ age : int 50 58 74 58 62 50 64 58 47 63 ...
## $ lbph : num -1.39 -1.39 -1.39 -1.39 -1.39 ...
## $ svi : int 0 0 0 0 0 0 0 0 0 0 ...
## $ lcp : num -1.39 -1.39 -1.39 -1.39 -1.39 ...
## $ gleason: int 6 6 7 6 6 6 6 6 6 6 ...
## $ pgg45 : int 0 0 20 0 0 0 0 0 0 0 ...
## $ lpsa : num -0.431 -0.163 -0.163 -0.163 0.372 ...
## $ train : logi TRUE TRUE TRUE TRUE TRUE TRUE ...
print(summary(data))
## lcavol lweight age lbph
## Min. :-1.3471 Min. :2.375 Min. :41.00 Min. :-1.3863
## 1st Qu.: 0.5128 1st Qu.:3.376 1st Qu.:60.00 1st Qu.:-1.3863
## Median : 1.4469 Median :3.623 Median :65.00 Median : 0.3001
## Mean : 1.3500 Mean :3.629 Mean :63.87 Mean : 0.1004
## 3rd Qu.: 2.1270 3rd Qu.:3.876 3rd Qu.:68.00 3rd Qu.: 1.5581
## Max. : 3.8210 Max. :4.780 Max. :79.00 Max. : 2.3263
## svi lcp gleason pgg45
## Min. :0.0000 Min. :-1.3863 Min. :6.000 Min. : 0.00
## 1st Qu.:0.0000 1st Qu.:-1.3863 1st Qu.:6.000 1st Qu.: 0.00
## Median :0.0000 Median :-0.7985 Median :7.000 Median : 15.00
## Mean :0.2165 Mean :-0.1794 Mean :6.753 Mean : 24.38
## 3rd Qu.:0.0000 3rd Qu.: 1.1787 3rd Qu.:7.000 3rd Qu.: 40.00
## Max. :1.0000 Max. : 2.9042 Max. :9.000 Max. :100.00
## lpsa train
## Min. :-0.4308 Mode :logical
## 1st Qu.: 1.7317 FALSE:30
## Median : 2.5915 TRUE :67
## Mean : 2.4784
## 3rd Qu.: 3.0564
## Max. : 5.5829
Check for missing values
sum(is.na(data))
## [1] 0
Correlation plot of the predictor variables
prostate.corr <- cor(data)
corrplot.mixed(prostate.corr)
cor(data)
## lcavol lweight age lbph svi
## lcavol 1.00000000 0.280521386 0.2249999 0.027349703 0.53884500
## lweight 0.28052139 1.000000000 0.3479691 0.442264395 0.15538491
## age 0.22499988 0.347969112 1.0000000 0.350185896 0.11765804
## lbph 0.02734970 0.442264395 0.3501859 1.000000000 -0.08584324
## svi 0.53884500 0.155384906 0.1176580 -0.085843238 1.00000000
## lcp 0.67531048 0.164537146 0.1276678 -0.006999431 0.67311118
## gleason 0.43241706 0.056882099 0.2688916 0.077820447 0.32041222
## pgg45 0.43365225 0.107353790 0.2761124 0.078460018 0.45764762
## lpsa 0.73446033 0.433319385 0.1695928 0.179809404 0.56621822
## train -0.04654347 -0.009940651 0.1776155 -0.029939957 0.02679950
## lcp gleason pgg45 lpsa train
## lcavol 0.675310484 0.43241706 0.43365225 0.73446033 -0.046543468
## lweight 0.164537146 0.05688210 0.10735379 0.43331939 -0.009940651
## age 0.127667752 0.26889160 0.27611245 0.16959284 0.177615517
## lbph -0.006999431 0.07782045 0.07846002 0.17980940 -0.029939957
## svi 0.673111185 0.32041222 0.45764762 0.56621822 0.026799505
## lcp 1.000000000 0.51483006 0.63152825 0.54881317 -0.037427296
## gleason 0.514830063 1.00000000 0.75190451 0.36898681 -0.044171456
## pgg45 0.631528246 0.75190451 1.00000000 0.42231586 0.100516371
## lpsa 0.548813175 0.36898681 0.42231586 1.00000000 -0.033889743
## train -0.037427296 -0.04417146 0.10051637 -0.03388974 1.000000000
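To read the matrix at a glance, the predictors can be ranked by their correlation with lpsa (a quick sketch using the prostate.corr matrix above):
sort(prostate.corr[, "lpsa"], decreasing = TRUE)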
plot(data$age, data$lpsa,pch=19, col="#0000ff",
xlab="age", ylab="lpsa")
Cross-validation to select the optimal polynomial regression model of lpsa on age.
set.seed(123)
cv.poly <- function(dataset, degree){
  set.seed(123)
  cross_val <- vfold_cv(dataset, v = 5, strata = "lpsa")
  metric_poly <- map_dfr(cross_val$splits,
                         function(x){
                           mod <- lm(lpsa ~ poly(age, degree, raw = TRUE),
                                     data = dataset[x$in_id, ])
                           pred <- predict(mod, newdata = dataset[-x$in_id, ])
                           truth <- dataset[-x$in_id, ]$lpsa
                           rmse <- mlr3measures::rmse(truth = truth, response = pred)
                           mae <- mlr3measures::mae(truth = truth, response = pred)
                           metric <- c(rmse, mae)
                           names(metric) <- c("rmse", "mae")
                           return(metric)
                         }
  )
  colMeans(metric_poly) # average rmse and mae over the 5 folds
}
cvpoly <- rbind(cv.poly(data, 1), cv.poly(data, 2), cv.poly(data, 3), cv.poly(data, 4))
cvpoly
## rmse mae
## [1,] 1.165938 0.9076820
## [2,] 1.163891 0.9034202
## [3,] 1.184469 0.9099577
## [4,] 1.233204 0.9416675
plot_poly <- function(degree){
  ggplot(data, aes(x = age, y = lpsa)) +
    geom_point(alpha = 0.55, color = "black") +
    stat_smooth(method = "lm",
                formula = y ~ poly(x, degree, raw = TRUE),
                lty = 1, col = "blue", se = FALSE) +
    theme_bw()
}
poly1 <- plot_poly(1)
poly2 <- plot_poly(2)
poly3 <- plot_poly(3)
poly4 <- plot_poly(4)
plot_grid(poly1, poly2, poly3, poly4, labels=c("RegLin","poly2", "poly3", "poly4"), label_size = 6)
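As a cross-check, the winning degree can be pulled from the CV table and refit on the full data; a minimal sketch assuming the cvpoly matrix above:
best_degree <- which.min(cvpoly[, "rmse"]) # degree 2 by CV RMSE
final_poly <- lm(lpsa ~ poly(age, best_degree, raw = TRUE), data = data)
coef(final_poly)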
Natural Spline
Determine the number of basis functions to use
set.seed(123)
cross.val <- vfold_cv(data,v=5,strata = "lpsa")
df <- 1:12
best.spline3 <- map_dfr(df, function(i){
metric.spline3 <- map_dfr(cross.val$splits,
function(x){
mod <- lm(lpsa ~ ns(age,df=i),data=data[x$in_id,])
pred <- predict(mod,newdata=data[-x$in_id,])
truth <- data[-x$in_id,]$lpsa
rmse <- mlr3measures::rmse(truth = truth,response = pred)
mae <- mlr3measures::mae(truth = truth,response = pred)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
colMeans(metric.spline3) # average over the 5 folds
}
)
best.spline3 <- cbind(df=df,best.spline3)
basis <- data.frame(rbind(best.spline3 %>% slice_min(rmse), # by rmse
                          best.spline3 %>% slice_min(mae))) # by mae
basis
## df rmse mae
## 1 3 1.164951 0.9015556
## 2 3 1.164951 0.9015556
Three basis functions are selected (df = 3). Next, determine the knot locations implied by these three basis functions:
df_3 <- attr(ns(data$age, df=3),"knots")
df_3
## 33.33333% 66.66667%
## 62 68
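As a sanity check, ns() with df = 3 and ns() with explicit knots at 62 and 68 should span the same basis, since the boundary knots default to the range of age in both calls:
all.equal(ns(data$age, df = 3),
          ns(data$age, knots = c(62, 68)),
          check.attributes = FALSE)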
Cross Validation
set.seed(123)
cross.val <- vfold_cv(data,v=5,strata = "lpsa")
metric.spline3.ns2 <- map_dfr(cross.val$splits,
function(x){
mod <- lm(lpsa ~ ns(age, knots = c(62, 68)),
data=data[x$in_id,])
pred <- predict(mod,
newdata=data[-x$in_id,])
truth <- data[-x$in_id,]$lpsa
rmse <- mlr3measures::rmse(truth = truth,
response = pred
)
mae <- mlr3measures::mae(truth = truth,
response = pred
)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
mean.metric.spline3.ns2 <- colMeans(metric.spline3.ns2) # average over the 5 folds
nama.model.ncs <- "2 - NCS Knots 62, 68 (df=3)"
model.ncs<-data.frame(Model = nama.model.ncs, mean.metric.spline3.ns2)
model.ncs
## Model mean.metric.spline3.ns2
## rmse 2 - NCS Knots 62, 68 (df=3) 1.1654699
## mae 2 - NCS Knots 62, 68 (df=3) 0.9027996
nc2 <- ggplot(data, aes(x = age, y = lpsa)) +
geom_point(alpha = 0.55, color="black") +
  stat_smooth(method = "lm", formula = y~ns(x, knots = c(62, 68)), col = "blue", se = FALSE) +
theme_bw()
plot_grid(nc2, labels = c("nc2"), label_size = 10)
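Placing the natural-spline CV metrics next to the best polynomial makes the comparison explicit; a short sketch assuming cvpoly and mean.metric.spline3.ns2 from above:
rbind(poly2 = cvpoly[2, ],
      ncs_df3 = mean.metric.spline3.ns2)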
Generalized Additive Model
library(gam)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ tibble 3.1.8 ✔ stringr 1.5.0
## ✔ tidyr 1.3.0 ✔ forcats 1.0.0
## ✔ readr 2.1.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ foreach::accumulate() masks purrr::accumulate()
## ✖ randomForest::combine() masks dplyr::combine(), gridExtra::combine()
## ✖ dplyr::filter() masks stats::filter()
## ✖ kableExtra::group_rows() masks dplyr::group_rows()
## ✖ dplyr::lag() masks stats::lag()
## ✖ purrr::lift() masks caret::lift()
## ✖ randomForest::margin() masks ggplot2::margin()
## ✖ foreach::when() masks purrr::when()
library(mlr3verse)
## Loading required package: mlr3
library(mlr3measures)
## In order to avoid name clashes, do not attach 'mlr3measures'. Instead, only load the namespace with `requireNamespace("mlrmeasures")` and access the measures directly via `::`, e.g. `mlr3measures::auc()`.
##
## Attaching package: 'mlr3measures'
##
## The following object is masked from 'package:mlr3verse':
##
## tnr
##
## The following objects are masked from 'package:caret':
##
## precision, recall, sensitivity, specificity
library(ggpubr)
##
## Attaching package: 'ggpubr'
##
## The following object is masked from 'package:cowplot':
##
## get_legend
library(splines)
data <- data %>%
  select_if(is.numeric) %>% # keep numeric columns only (drops the logical train flag)
  select(-svi, -gleason)    # drop the discrete predictors svi and gleason
head(data)
## lcavol lweight age lbph lcp pgg45 lpsa
## 1 -0.5798185 2.769459 50 -1.386294 -1.386294 0 -0.4307829
## 2 -0.9942523 3.319626 58 -1.386294 -1.386294 0 -0.1625189
## 3 -0.5108256 2.691243 74 -1.386294 -1.386294 20 -0.1625189
## 4 -1.2039728 3.282789 58 -1.386294 -1.386294 0 -0.1625189
## 5 0.7514161 3.432373 62 -1.386294 -1.386294 0 0.3715636
## 6 -1.0498221 3.228826 50 -1.386294 -1.386294 0 0.7654678
formula_spline1 <- str_c("s(",
names(data %>% select(-lpsa)),
",df=",6,")",
collapse = "+")
formula_spline1 <- as.formula(str_c("lpsa~",formula_spline1))
mod_spline1 <- gam(formula_spline1,data=data,family = "gaussian")
summary(mod_spline1)
##
## Call: gam(formula = formula_spline1, family = "gaussian", data = data)
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.20659 -0.27978 -0.02599 0.30743 1.21821
##
## (Dispersion Parameter for gaussian family taken to be 0.4454)
##
## Null Deviance: 127.9177 on 96 degrees of freedom
## Residual Deviance: 26.7268 on 60.0006 degrees of freedom
## AIC: 226.2356
##
## Number of Local Scoring Iterations: NA
##
## Anova for Parametric Effects
## Df Sum Sq Mean Sq F value Pr(>F)
## s(lcavol, df = 6) 1.000 73.956 73.956 166.0283 < 2.2e-16 ***
## s(lweight, df = 6) 1.000 9.572 9.572 21.4890 1.969e-05 ***
## s(age, df = 6) 1.000 1.151 1.151 2.5830 0.11326
## s(lbph, df = 6) 1.000 1.332 1.332 2.9895 0.08895 .
## s(lcp, df = 6) 1.000 0.530 0.530 1.1905 0.27959
## s(pgg45, df = 6) 1.000 2.076 2.076 4.6606 0.03487 *
## Residuals 60.001 26.727 0.445
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Anova for Nonparametric Effects
## Npar Df Npar F Pr(F)
## (Intercept)
## s(lcavol, df = 6) 5 1.2698 0.28879
## s(lweight, df = 6) 5 1.8641 0.11413
## s(age, df = 6) 5 1.1354 0.35168
## s(lbph, df = 6) 5 1.2182 0.31172
## s(lcp, df = 6) 5 2.0645 0.08241 .
## s(pgg45, df = 6) 5 1.9241 0.10357
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
formula_loess <- str_c("lo(",
names(data %>% select(-lpsa)),
")",
collapse = "+")
formula_loess <- as.formula(str_c("lpsa~",formula_loess))
mod_loess <- gam(formula_loess,data=data,family = "gaussian")
summary(mod_loess)
##
## Call: gam(formula = formula_loess, family = "gaussian", data = data)
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.21683 -0.31661 -0.05531 0.33127 1.58643
##
## (Dispersion Parameter for gaussian family taken to be 0.4708)
##
## Null Deviance: 127.9177 on 96 degrees of freedom
## Residual Deviance: 33.7462 on 71.6827 degrees of freedom
## AIC: 225.4919
##
## Number of Local Scoring Iterations: NA
##
## Anova for Parametric Effects
## Df Sum Sq Mean Sq F value Pr(>F)
## lo(lcavol) 1.000 70.139 70.139 148.9879 < 2.2e-16 ***
## lo(lweight) 1.000 8.634 8.634 18.3390 5.645e-05 ***
## lo(age) 1.000 0.967 0.967 2.0541 0.15614
## lo(lbph) 1.000 1.058 1.058 2.2468 0.13828
## lo(lcp) 1.000 0.294 0.294 0.6254 0.43166
## lo(pgg45) 1.000 2.230 2.230 4.7374 0.03281 *
## Residuals 71.683 33.746 0.471
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Anova for Nonparametric Effects
## Npar Df Npar F Pr(F)
## (Intercept)
## lo(lcavol) 3.0 1.50321 0.22110
## lo(lweight) 3.5 1.78736 0.14804
## lo(age) 2.8 1.02817 0.38235
## lo(lbph) 3.6 0.99659 0.40935
## lo(lcp) 2.8 2.49655 0.07014 .
## lo(pgg45) 2.5 1.91416 0.14385
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
mod_linear <- glm(lpsa~.,data=data,family = "gaussian")
summary(mod_linear)
##
## Call:
## glm(formula = lpsa ~ ., family = "gaussian", data = data)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.45705 -0.39398 -0.02834 0.42085 1.79805
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.356988 0.914975 0.390 0.69734
## lcavol 0.604602 0.089272 6.773 1.26e-09 ***
## lweight 0.668865 0.207327 3.226 0.00175 **
## age -0.019940 0.011508 -1.733 0.08657 .
## lbph 0.069019 0.059651 1.157 0.25032
## lcp 0.012591 0.085185 0.148 0.88283
## pgg45 0.006018 0.003542 1.699 0.09277 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 0.5326767)
##
## Null deviance: 127.918 on 96 degrees of freedom
## Residual deviance: 47.941 on 90 degrees of freedom
## AIC: 222.91
##
## Number of Fisher Scoring iterations: 2
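Each summary above reports an AIC; collecting them in one vector makes the in-sample comparison easier (a sketch assuming the aic component stored on each gam() and glm() fit):
c(spline = mod_spline1$aic, loess = mod_loess$aic, linear = mod_linear$aic)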
task_prostate <- TaskRegr$new(id = "age",backend = data,target = "lpsa")
rcv = rsmp("cv",folds=10)
set.seed(123)
index_rcv <- rcv$instantiate(task_prostate)$instance
head(index_rcv)
## row_id fold
## 1: 1 1
## 2: 3 1
## 3: 27 1
## 4: 47 1
## 5: 49 1
## 6: 51 1
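A quick check that the instantiated resampling splits the 97 rows into ten roughly equal folds:
table(index_rcv$fold)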
gam_all <- function(train, test, mape = FALSE, response){
  # GAM with smoothing splines
  formula_spline1 <- str_c("s(",
                           names(train %>% select(-lpsa)),
                           ", df = ", 6, ")",
                           collapse = "+")
  formula_spline1 <- as.formula(str_c("lpsa ~ ", formula_spline1))
  mod_spline1 <- gam(formula_spline1, data = train, family = "gaussian")
  pred_spline1 <- predict(mod_spline1, newdata = test)
  rmse_spline1 <- mlr3measures::rmse(response = pred_spline1,
                                     truth = test %>% pull(response))
  mape_spline1 <- mlr3measures::mape(response = pred_spline1,
                                     truth = test %>% pull(response))
  # GAM with LOESS smoothers
  formula_loess <- str_c("lo(",
                         names(train %>% select(-lpsa)),
                         ")",
                         collapse = "+")
  formula_loess <- as.formula(str_c("lpsa ~ ", formula_loess))
  mod_loess <- gam(formula_loess, data = train, family = "gaussian")
  pred_loess <- predict(mod_loess, newdata = test)
  rmse_loess <- mlr3measures::rmse(response = pred_loess,
                                   truth = test %>% pull(response))
  mape_loess <- mlr3measures::mape(response = pred_loess,
                                   truth = test %>% pull(response))
  # Linear regression (fit on the training fold, not the full data)
  mod_linear <- glm(lpsa ~ ., data = train, family = "gaussian")
  pred_linear <- predict(mod_linear, newdata = test)
  rmse_linear <- mlr3measures::rmse(response = pred_linear,
                                    truth = test %>% pull(response))
  mape_linear <- mlr3measures::mape(response = pred_linear,
                                    truth = test %>% pull(response))
  mape_all <- c(mape_linear, mape_loess, mape_spline1)
  names(mape_all) <- c("Linear", "LOESS", "Spline")
  rmse_all <- c(rmse_linear, rmse_loess, rmse_spline1)
  names(rmse_all) <- c("Linear", "LOESS", "Spline")
  if(mape){
    return(mape_all)
  }else{
    return(rmse_all)
  }
}
score_all <- map_df(1:10,function(i){
index_train <- index_rcv %>%
filter(fold!=i) %>%
pull(row_id)
index_test <- index_rcv %>%
filter(fold==i) %>%
pull(row_id)
score <- gam_all(response="lpsa",train = data[index_train,],
test = data[index_test,],
mape = TRUE
)
return(score)
})
## Warning in lo.wam(x, z, wz, fit$smooth, which, fit$smooth.frame, bf.maxit, :
## lo.wam convergence not obtained in 30 iterations
## Warning in gam.lo(data[["lo(age)"]], z, w, span = 0.5, degree = 1, ncols = 1, :
## eval 79
## Warning in gam.lo(data[["lo(age)"]], z, w, span = 0.5, degree = 1, ncols = 1, :
## upperlimit 78.185
## Warning in gam.lo(data[["lo(age)"]], z, w, span = 0.5, degree = 1, ncols = 1, :
## extrapolation not allowed with blending
## Warning in gam.lo(data[["lo(lcp)"]], z, w, span = 0.5, degree = 1, ncols = 1, :
## eval 2.6568
## Warning in gam.lo(data[["lo(lcp)"]], z, w, span = 0.5, degree = 1, ncols = 1, :
## upperlimit 2.6038
## Warning in gam.lo(data[["lo(lcp)"]], z, w, span = 0.5, degree = 1, ncols = 1, :
## extrapolation not allowed with blending
## Warning in gam.lo(data[["lo(lcp)"]], z, w, span = 0.5, degree = 1, ncols = 1, :
## eval 2.9042
## Warning in gam.lo(data[["lo(lcp)"]], z, w, span = 0.5, degree = 1, ncols = 1, :
## upperlimit 2.6038
## Warning in gam.lo(data[["lo(lcp)"]], z, w, span = 0.5, degree = 1, ncols = 1, :
## extrapolation not allowed with blending
## Warning in gam.lo(data[["lo(pgg45)"]], z, w, span = 0.5, degree = 1, ncols = 1,
## : eval 100
## Warning in gam.lo(data[["lo(pgg45)"]], z, w, span = 0.5, degree = 1, ncols = 1,
## : upperlimit 95.475
## Warning in gam.lo(data[["lo(pgg45)"]], z, w, span = 0.5, degree = 1, ncols = 1,
## : extrapolation not allowed with blending
## Warning in gam.lo(data[["lo(lcavol)"]], z, w, span = 0.5, degree = 1, ncols =
## 1, : eval 3.821
## Warning in gam.lo(data[["lo(lcavol)"]], z, w, span = 0.5, degree = 1, ncols =
## 1, : upperlimit 3.4961
## Warning in gam.lo(data[["lo(lcavol)"]], z, w, span = 0.5, degree = 1, ncols =
## 1, : extrapolation not allowed with blending
## Warning in gam.lo(data[["lo(lweight)"]], z, w, span = 0.5, degree = 1, ncols =
## 1, : eval 2.3749
## Warning in gam.lo(data[["lo(lweight)"]], z, w, span = 0.5, degree = 1, ncols =
## 1, : lowerlimit 2.6808
## Warning in gam.lo(data[["lo(lweight)"]], z, w, span = 0.5, degree = 1, ncols =
## 1, : extrapolation not allowed with blending
## Warning in gam.lo(data[["lo(age)"]], z, w, span = 0.5, degree = 1, ncols = 1, :
## eval 41
## Warning in gam.lo(data[["lo(age)"]], z, w, span = 0.5, degree = 1, ncols = 1, :
## lowerlimit 42.82
## Warning in gam.lo(data[["lo(age)"]], z, w, span = 0.5, degree = 1, ncols = 1, :
## extrapolation not allowed with blending
## Warning in gam.lo(data[["lo(lcavol)"]], z, w, span = 0.5, degree = 1, ncols =
## 1, : eval -1.3471
## Warning in gam.lo(data[["lo(lcavol)"]], z, w, span = 0.5, degree = 1, ncols =
## 1, : lowerlimit -1.2291
## Warning in gam.lo(data[["lo(lcavol)"]], z, w, span = 0.5, degree = 1, ncols =
## 1, : extrapolation not allowed with blending
## Warning in gam.lo(data[["lo(lweight)"]], z, w, span = 0.5, degree = 1, ncols =
## 1, : eval 4.7804
## Warning in gam.lo(data[["lo(lweight)"]], z, w, span = 0.5, degree = 1, ncols =
## 1, : upperlimit 4.7298
## Warning in gam.lo(data[["lo(lweight)"]], z, w, span = 0.5, degree = 1, ncols =
## 1, : extrapolation not allowed with blending
colMeans(score_all,na.rm = TRUE)
## Linear LOESS Spline
## 0.4235063 0.5201586 0.5389620
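Since gam_all() returns RMSE when mape = FALSE, the same loop gives the cross-validated RMSE for the three models:
rmse_all <- map_df(1:10, function(i){
  index_train <- index_rcv %>% filter(fold != i) %>% pull(row_id)
  index_test <- index_rcv %>% filter(fold == i) %>% pull(row_id)
  gam_all(response = "lpsa", train = data[index_train, ],
          test = data[index_test, ], mape = FALSE)
})
colMeans(rmse_all, na.rm = TRUE)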
Modeling the Bone Mineral Density data with Regression Trees plus Bagging, Random Forest, and Gradient Boosting
library(rsample) # data splitting
library(dplyr) # data wrangling
library(rpart) # performing regression trees
library(rpart.plot) # plotting regression trees
library(ipred) # bagging
library(caret) # bagging
Bone_raw <- read.csv("bone.csv", stringsAsFactors = FALSE)
summary(Bone_raw)
## idnum ethnic age sex
## Min. : 1.0 Length:1003 Min. : 8.80 Length:1003
## 1st Qu.: 84.5 Class :character 1st Qu.:12.80 Class :character
## Median :181.0 Mode :character Median :15.70 Mode :character
## Mean :189.9 Mean :16.32
## 3rd Qu.:287.5 3rd Qu.:19.45
## Max. :429.0 Max. :26.20
## spnbmd
## Min. :0.5360
## 1st Qu.:0.7995
## Median :0.9650
## Mean :0.9476
## 3rd Qu.:1.0705
## Max. :1.4430
str(Bone_raw)
## 'data.frame': 1003 obs. of 5 variables:
## $ idnum : int 1 1 1 1 2 2 2 2 3 3 ...
## $ ethnic: chr "White" "White" "White" "White" ...
## $ age : num 11.2 12.2 13.2 14.3 12.7 13.8 14.8 15.8 10.9 11.9 ...
## $ sex : chr "mal" "mal" "mal" "mal" ...
## $ spnbmd: num 0.719 0.732 0.776 0.781 0.62 0.627 0.759 0.79 0.641 0.622 ...
head(Bone_raw)
## idnum ethnic age sex spnbmd
## 1 1 White 11.2 mal 0.719
## 2 1 White 12.2 mal 0.732
## 3 1 White 13.2 mal 0.776
## 4 1 White 14.3 mal 0.781
## 5 2 White 12.7 mal 0.620
## 6 2 White 13.8 mal 0.627
Bone <- subset(Bone_raw,select=-c(idnum))
head(Bone)
## ethnic age sex spnbmd
## 1 White 11.2 mal 0.719
## 2 White 12.2 mal 0.732
## 3 White 13.2 mal 0.776
## 4 White 14.3 mal 0.781
## 5 White 12.7 mal 0.620
## 6 White 13.8 mal 0.627
summary(Bone)
## ethnic age sex spnbmd
## Length:1003 Min. : 8.80 Length:1003 Min. :0.5360
## Class :character 1st Qu.:12.80 Class :character 1st Qu.:0.7995
## Mode :character Median :15.70 Mode :character Median :0.9650
## Mean :16.32 Mean :0.9476
## 3rd Qu.:19.45 3rd Qu.:1.0705
## Max. :26.20 Max. :1.4430
str(Bone)
## 'data.frame': 1003 obs. of 4 variables:
## $ ethnic: chr "White" "White" "White" "White" ...
## $ age : num 11.2 12.2 13.2 14.3 12.7 13.8 14.8 15.8 10.9 11.9 ...
## $ sex : chr "mal" "mal" "mal" "mal" ...
## $ spnbmd: num 0.719 0.732 0.776 0.781 0.62 0.627 0.759 0.79 0.641 0.622 ...
# Split data
set.seed(123)
bone_split <- createDataPartition(Bone$spnbmd, p = .7, list = FALSE)
bone_train <- Bone[bone_split, ]
bone_test <- Bone[-bone_split, ]
Regression Tree
m1 <- rpart(
formula = spnbmd ~ .,
data = bone_train,
method = "anova"
)
rpart.plot(m1)
plotcp(m1)
m1$cptable
## CP nsplit rel error xerror xstd
## 1 0.43542953 0 1.0000000 1.0005783 0.04483880
## 2 0.03755374 1 0.5645705 0.6096417 0.03699330
## 3 0.03317541 2 0.5270167 0.5618151 0.03299311
## 4 0.02730405 3 0.4938413 0.5319004 0.03017752
## 5 0.02078144 4 0.4665373 0.5246121 0.02972519
## 6 0.01912976 5 0.4457558 0.5126399 0.02884200
## 7 0.01000000 6 0.4266261 0.4843469 0.02743718
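For reference, the tree can also be pruned at the cp value with the lowest cross-validated error instead of reading it off the plot; a minimal sketch using m1 from above:
best_cp <- m1$cptable[which.min(m1$cptable[, "xerror"]), "CP"]
m1_pruned <- prune(m1, cp = best_cp)
rpart.plot(m1_pruned)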
hyper_grid <- expand.grid(
minsplit = seq(1, 20, 1),
maxdepth = seq(1, 15, 1)
)
models <- vector(mode = "list", nrow(hyper_grid))
for (i in 1:nrow(hyper_grid)){
minsplit <- hyper_grid$minsplit[i]
maxdepth <- hyper_grid$maxdepth[i]
models[[i]] <- rpart(
formula = spnbmd ~ .,
data = bone_train,
method = "anova",
control = list(minsplit = minsplit, maxdepth = maxdepth)
)
}
get_cp <- function(x) {
  best <- which.min(x$cptable[, "xerror"])
  x$cptable[best, "CP"]
}
get_min_error <- function(x) {
  best <- which.min(x$cptable[, "xerror"])
  x$cptable[best, "xerror"]
}
hyper_grid %>%
mutate(
cp = purrr::map_dbl(models, get_cp),
error = purrr::map_dbl(models, get_min_error)
) %>%
arrange(error) %>%
top_n(-5, wt = error)
## minsplit maxdepth cp error
## 1 6 9 0.01 0.4492643
## 2 5 3 0.01 0.4512283
## 3 19 12 0.01 0.4544440
## 4 8 14 0.01 0.4547218
## 5 4 9 0.01 0.4552722
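Rather than transcribing the winning row by hand, the best combination can be extracted programmatically; a sketch reusing models, hyper_grid, and the two helper functions above:
grid_results <- hyper_grid %>%
  mutate(cp = purrr::map_dbl(models, get_cp),
         error = purrr::map_dbl(models, get_min_error))
grid_results %>% slice_min(error, n = 1)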
optimal_tree <- rpart(
  formula = spnbmd ~ .,
  data = bone_train,
  method = "anova",
  # cp = 0.01 as found by the grid search; minsplit and maxdepth set to moderate values
  control = list(minsplit = 15, maxdepth = 10, cp = 0.01)
)
pred <- predict(optimal_tree, newdata = bone_test)
RMSE(pred = pred, obs = bone_test$spnbmd)
## [1] 0.132335
MAE(pred = pred, obs = bone_test$spnbmd)
## [1] 0.1018731
Bagging
set.seed(123)
bagged_m1 <- bagging(
formula = spnbmd ~ .,
data = bone_train,
coob = TRUE
)
bagged_m1
##
## Bagging regression trees with 25 bootstrap replications
##
## Call: bagging.data.frame(formula = spnbmd ~ ., data = bone_train, coob = TRUE)
##
## Out-of-bag estimate of root mean squared error: 0.1234
ntree <- 10:50
rmse <- vector(mode = "numeric", length = length(ntree))
for (i in seq_along(ntree)) {
set.seed(123)
model <- bagging(
formula = spnbmd ~ .,
data = bone_train,
coob = TRUE,
nbagg = ntree[i]
)
rmse[i] <- model$err
}
plot(ntree, rmse, type = 'l', lwd = 2)
abline(v = 25, col = "red", lty = "dashed")
# Cross validation 10 K-fold
ctrl <- trainControl(method = "cv", number = 10)
# CV bagged model
bagged_cv <- train(
spnbmd ~ .,
data = bone_train,
method = "treebag",
trControl = ctrl,
importance = TRUE
)
# assess results
bagged_cv
## Bagged CART
##
## 703 samples
## 3 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 633, 632, 632, 634, 634, 631, ...
## Resampling results:
##
## RMSE Rsquared MAE
## 0.1226359 0.5733349 0.0970862
plot(varImp(bagged_cv), 2)
pred <- predict(bagged_cv, bone_test)
RMSE(pred, bone_test$spnbmd)
## [1] 0.1235616
MAE(pred, bone_test$spnbmd)
## [1] 0.09727683
Random Forest
library(randomForest)
mod_forest<-train(spnbmd~., data=bone_train, method="rf", prox=TRUE)
mod_forest
## Random Forest
##
## 703 samples
## 3 predictor
##
## No pre-processing
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 703, 703, 703, 703, 703, 703, ...
## Resampling results across tuning parameters:
##
## mtry RMSE Rsquared MAE
## 2 0.1242096 0.5722359 0.09798122
## 3 0.1258679 0.5492926 0.09867422
## 5 0.1387192 0.4803627 0.10906588
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was mtry = 2.
pred <- predict(mod_forest, bone_test)
RMSE(pred, bone_test$spnbmd)
## [1] 0.1209239
MAE(pred, bone_test$spnbmd)
## [1] 0.09754176
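caret's default search only tried mtry values of 2, 3, and 5; to control the grid explicitly, a minimal sketch (the grid values here are illustrative):
rf_grid <- expand.grid(mtry = 1:5)
set.seed(123)
mod_forest_tuned <- train(spnbmd ~ ., data = bone_train, method = "rf",
                          trControl = trainControl(method = "cv", number = 10),
                          tuneGrid = rf_grid)
mod_forest_tuned$bestTune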
Gradient Boosting
library(mlbench)
fitControl <- trainControl(method="repeatedcv", number=10, repeats=10) # 10-fold CV, repeated 10 times
set.seed(1234)
gbmFit1 <- train(spnbmd~., data=bone_train, method="gbm", trControl = fitControl, verbose=FALSE)
gbmFit1$results[which.min(gbmFit1$results$RMSE),] # best tuning combination (lowest CV RMSE)
## shrinkage interaction.depth n.minobsinnode n.trees RMSE Rsquared
## 3 0.1 1 10 150 0.122378 0.575094
## MAE RMSESD RsquaredSD MAESD
## 3 0.0967561 0.01048381 0.06016369 0.00869704
summary(gbmFit1)
## var rel.inf
## age age 82.5082846
## sexmal sexmal 10.5073778
## ethnicBlack ethnicBlack 5.4069137
## ethnicHispanic ethnicHispanic 0.8476221
## ethnicWhite ethnicWhite 0.7298017
pred <- predict(gbmFit1, bone_test)
RMSE(pred, bone_test$spnbmd)
## [1] 0.1201753
MAE(pred, bone_test$spnbmd)
## [1] 0.09563929
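To close, the four fitted models can be scored on the test set in one pass; a short sketch reusing optimal_tree, bagged_cv, mod_forest, and gbmFit1:
fits <- list(tree = optimal_tree, bagging = bagged_cv,
             rf = mod_forest, boosting = gbmFit1)
sapply(fits, function(m){
  p <- predict(m, newdata = bone_test)
  c(RMSE = RMSE(p, bone_test$spnbmd), MAE = MAE(p, bone_test$spnbmd))
})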