library(rpart)
RP <- rpart(VulpesVulpes ~ 1+ bio3 + bio7 + bio11 + bio12,
data = mammals_data,
control = rpart.control(minsplit = 20, xval = 10),
method = "class")
plot(RP, uniform = F, margin = 0.1, branch = 0.5, compress = T)
text(RP, cex = 0.8)
vul_prob <- mammals_data %>% mutate(prob = predict(RP, type = "prob") [,2]) %>% select(X_WGS84, Y_WGS84, VulpesVulpes, prob)
ggplot(vul_prob, aes(x = X_WGS84, y = Y_WGS84)) + geom_tile(aes(fill = VulpesVulpes)) + ggtitle('Original data')
ggplot(vul_prob, aes(x = X_WGS84, y = Y_WGS84)) + geom_tile(aes(fill = prob)) + ggtitle('Recursive partitioning')
Recursive Partitioning にアンサンブルモデリングの手法を加えたものを紹介する
アンサブルモデリング 分類や回帰の手法に元のデータセットに対するさま再サンプリングなどがくみあわされたもの
バギング 無作為に復元抽出したデータに対してモデルを適用し、その結果をまとめたものを予測とする手法
ブースティング 構築済みのモデルの残差に対して繰り返し新しいモデルを当てはめていく手法
RPの問題:
↓
set.seed(555)
rp_pantheraonca <- rpart(PantheraOnca ~ bio3 + bio7 + bio11 + bio12,
data = mammals_data,
control = rpart.control(xval = 10), method = "class")
plot(rp_pantheraonca, uniform = F, margin = 0.1, branch = 0.5, compress = T)
text(rp_pantheraonca, cex = 0.8)
trees <- vector(mode = "list", length = 50)
n <- nrow(mammals_data)
## n面のサイコロをn回振った
boot <- rmultinom(length(trees), n, rep(1,n)/n)
full_tree <- rpart(PantheraOnca ~ bio3 + bio7 + bio11 + bio12,
data = mammals_data,
control = rpart.control(xval = 0), method = "class")
plot(full_tree, uniform = F, margin = 0.1, branch = 0.5, compress = T)
text(full_tree, cex = 0.8)
for(i in 1:length(trees)) {
trees[[i]] <- update(full_tree, weights = boot[,i])
}
## 分岐ごとにどの変数が選ばれているのか
table(sapply(trees, function(x) as.character(x$frame$var[1]))) # 1
##
## bio3
## 50
table(sapply(trees, function(x) as.character(x$frame$var[3]))) # 2
##
## <leaf> bio12 bio7
## 4 1 45
table(sapply(trees, function(x) as.character(x$frame$var[5]))) # 3
##
## <leaf> bio11 bio12 bio3 bio7
## 4 14 29 2 1
table(sapply(trees, function(x) as.character(x$frame$var[10]))) # 4
##
## <leaf> bio11 bio12 bio3 bio7
## 27 5 2 6 10
pred <- trees %>%
map_dfc(~predict(.x, newdata = select(mammals_data, bio3, bio7, bio11, bio12),
type = "prob")[,2]) %>% as.matrix
## New names:
## * NA -> ...1
## * NA -> ...2
## * NA -> ...3
## * NA -> ...4
## * NA -> ...5
## * ...
pred_avg <- rowMeans(pred)
require(pROC, quietly = T)
roc_avg <- roc(mammals_data$PantheraOnca,
pred_avg,
percent = T)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
auc_avg <- auc(roc_avg) %>% as.numeric
auc_avg
## [1] 95.75894
roc_rp <- roc(mammals_data$PantheraOnca,
predict(rp_pantheraonca, type = "prob")[,2],
percent = T)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
auc_rp <- auc(roc_rp) %>% as.numeric
auc_rp
## [1] 86.7767
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
rf <- randomForest(x = select(mammals_data, bio3, bio7, bio11, bio12),
y = as.factor(mammals_data$VulpesVulpes),
ntree = 1000,
importance = TRUE)
importance(rf)
## 0 1 MeanDecreaseAccuracy MeanDecreaseGini
## bio3 92.06599 79.24433 141.60752 1794.3400
## bio7 37.68358 419.66958 162.34969 951.8239
## bio11 52.82318 56.47513 85.72112 1055.6656
## bio12 114.52820 88.20139 152.05215 467.5669
rf_pred <- mammals_data %>% bind_cols(pred = predict(rf, type = "prob")[,2] )
ggplot(rf_pred, aes(x = X_WGS84, y = Y_WGS84)) + geom_tile(aes(fill = VulpesVulpes)) + ggtitle('Original data')
ggplot(rf_pred, aes(x = X_WGS84, y = Y_WGS84)) + geom_tile(aes(fill = pred)) + ggtitle('RF')
rp <- biomod2::response.plot2(models = c('rf'),
Data = mammals_data[,c("bio3", "bio7", "bio11", "bio12")] %>% as.matrix,
show.variables = c("bio3", "bio7", "bio11", "bio12"),
fixed.var.metric = 'mean', plot = FALSE, use.formal.names = TRUE)
## Warning in cbind(bio3 = pts.tmp, ref_table[, -which(colnames(ref_table) == :
## number of rows of result is not a multiple of vector length (arg 1)
## Warning in cbind(bio7 = pts.tmp, ref_table[, -which(colnames(ref_table) == :
## number of rows of result is not a multiple of vector length (arg 1)
## Warning in cbind(bio11 = pts.tmp, ref_table[, -which(colnames(ref_table) == :
## number of rows of result is not a multiple of vector length (arg 1)
## Warning in cbind(bio12 = pts.tmp, ref_table[, -which(colnames(ref_table) == :
## number of rows of result is not a multiple of vector length (arg 1)
gg.rp <- ggplot(rp, aes(x = expl.val, y = pred.val, lty = pred.name)) +
geom_line() + ylab("prob of occ") + xlab("") +
rp.gg.theme +
facet_grid(~ expl.name, scales = 'free_x')
print(gg.rp)
小さな木を、非復元でサンプリングしたデータに次々適用していく方法。
BRTのパラメーター
木の数(n.trees)
学習率:ひとつひとつの木の重要度
交互作用深度:ひとつひとつの木の複雑さ(interaction.depth)
基本的には、木の数は多く、学習率は低く
交互作用深度が大きいときは、木の数を少なく、学習率を高くする
交互作用深度は2〜10がよさそう
gdmパッケージによるBRTの実行
library(gbm)
gbm_mod <- gbm(VulpesVulpes ~ bio3 + bio7 + bio11 + bio12,
data = mammals_data,
distribution = "bernoulli",
n.trees = 10000,
interaction.depth = 3,
shrinkage = 0.01,
bag.fraction = 0.5,
cv.folds = 10)
## saveRDS(gbm_mod, "gbm_mod.rds")
## gbm_mod <- readRDS("./gbm_mod.rds")