The ranger package (Wright & Ziegler, 2017) provides functions for fitting random forests. The caret package (Kuhn, 2018) provides an easy-to-use interface for grid searches and cross-validation, which allows us to find optimal hyperparameters for the random forest model. The lattice package (Sarkar, 2008) contains the dotplot() function used to plot variable importance. The party package (Hothorn et al., 2019) provides the ctree() function for building conditional inference trees, which we use to visualize the effects of the most important predictors identified by the random forest analyses.
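If any of these packages are missing, they can first be installed from CRAN (a one-time step; dplyr is included here because the custom fit function below calls dplyr::is.tbl()):
# Install any missing packages from CRAN (one-time step)
pkgs = c("ranger", "caret", "lattice", "party", "dplyr")
install.packages(setdiff(pkgs, rownames(installed.packages())))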
library(ranger)
library(caret)
library(lattice)
library(party)
# Load the individual characteristics
indiv_char <- read.table("indiv_char.txt", sep = " ", dec = ".", header = TRUE)
# Load the difference waves for the N400 and the P600 effects
P600 <- read.table("P600.txt", sep = "\t", dec = ".", header = TRUE)
N400 <- read.table("N400.txt", sep = "\t", dec = ".", header = TRUE)
# Merge those files
P600_rf <- merge(P600,indiv_char, by = c("Subj_NUM"))
N400_rf <- merge(N400,indiv_char, by = c("Subj_NUM"))
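As a quick sanity check (optional), we can confirm that the merge kept the expected number of rows and added the individual-difference columns:
# Optional sanity check: dimensions and column names after merging
dim(N400_rf)
names(N400_rf)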
We set a seed to ensure that the results are replicable.
set.seed(19)
The custom tuning method below was developed by Tomaschek, Hendrix & Baayen (2018); it extends caret's built-in method for ranger() so that num.trees can be tuned alongside mtry.
# Get the existing method for the ranger() function
ranger_type = getModelInfo(model = "ranger")
ranger_type
# Change the parameters that may be tuned to include num.trees
ranger_type$ranger$parameters = data.frame("parameter" = c("mtry", "num.trees"),
                                           "class" = c("numeric", "numeric"),
                                           "label" = c("Selected Predictors",
                                                       "Number of Trees"))
# Edit the model fit function to include a num.trees argument in the call to ranger()
ranger_type$ranger$fit = function(x, y, wts, param, lev, last, classProbs, ...) {
  # Coerce tibbles and other non-data-frame inputs to a plain data frame
  if ((!is.data.frame(x)) || dplyr::is.tbl(x))
    x <- as.data.frame(x)
  x$.outcome <- y
  if (!is.null(wts)) {
    # Weighted fit, passing the tuned mtry and num.trees to ranger()
    out <- ranger::ranger(dependent.variable.name = ".outcome",
                          data = x, mtry = param$mtry, num.trees = param$num.trees,
                          probability = classProbs, case.weights = wts, ...)
  }
  else {
    # Unweighted fit
    out <- ranger::ranger(dependent.variable.name = ".outcome",
                          data = x, mtry = param$mtry, num.trees = param$num.trees,
                          write.forest = TRUE, probability = classProbs, ...)
  }
  # Keep the outcome around for the resampling iterations (dropped in the final fit)
  if (!last)
    out$y <- y
  out
}
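To verify that the modified method was registered correctly, the parameter table can be printed; it should now list both mtry and num.trees:
# Optional check: the tunable parameters should now include num.trees
ranger_type$ranger$parameters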
# Define grid for the number of trees and mtry
num_trees = round(exp(seq(1, 10, by = 0.5)))
num_trees
## [1] 3 4 7 12 20 33 55 90 148 245 403
## [12] 665 1097 1808 2981 4915 8103 13360 22026
m_try = seq.int(from = 1, to = 9, by = 1)
m_try
## [1] 1 2 3 4 5 6 7 8 9
tuneGrid = expand.grid(mtry = m_try, num.trees = num_trees)
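The grid crosses each of the 9 mtry values with each of the 19 num.trees values, yielding 171 candidate settings, which can be verified directly:
# The grid crosses 9 mtry values with 19 num.trees values
nrow(tuneGrid)  # 171 combinations
head(tuneGrid)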
# Define the cross-validation parameters
control = trainControl(method = "cv", number = 10)
# Run the grid search using the train() function of the caret package
N400_train1 = train(N400_SYN_diff ~ perf_SYN + Cloz + Lex + Expos_years + AoA +
                      AoE + Expos_percent + Digit + Group,
                    data = N400_rf, method = ranger_type$ranger,
                    trControl = control, tuneGrid = tuneGrid, verbose = TRUE)
# Determine the best tune within the coarse search
N400_train1$bestTune
## mtry num.trees
## 120 7 33
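Optionally, caret's plot method for train objects visualizes the cross-validated RMSE across the whole coarse grid, which helps judge where performance levels off before choosing the finer grid:
# Optional: visualize cross-validated RMSE across the coarse grid
plot(N400_train1)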
# Define number of trees for a finer search
num_trees = c(55,60,65,70,75,80,85,90)
num_trees
## [1] 55 60 65 70 75 80 85 90
# Build the finer grid with mtry fixed at 9
tuneGrid = expand.grid(mtry = 9, num.trees = num_trees)
# Train the model for the finer search
N400_train2 = train(N400_SYN_diff ~ perf_SYN + Cloz + Lex + Expos_years + AoA +
                      AoE + Expos_percent + Digit + Group,
                    data = N400_rf, method = ranger_type$ranger,
                    trControl = control, tuneGrid = tuneGrid, verbose = TRUE)
# Get the results
N400_train2$results
## mtry num.trees RMSE Rsquared MAE RMSESD RsquaredSD
## 1 9 55 0.6413150 0.7470572 0.4857619 0.1233127 0.1141406
## 2 9 60 0.6355050 0.7534881 0.4878034 0.1299704 0.1147481
## 3 9 65 0.6367395 0.7541397 0.4820074 0.1351460 0.1070299
## 4 9 70 0.6277046 0.7572004 0.4735017 0.1242126 0.1101118
## 5 9 75 0.6169342 0.7658238 0.4737354 0.1258288 0.1181183
## 6 9 80 0.6330687 0.7546658 0.4832025 0.1325586 0.1132532
## 7 9 85 0.6231476 0.7624165 0.4735788 0.1019422 0.1106340
## 8 9 90 0.6406396 0.7486418 0.4907704 0.1251805 0.1156777
## MAESD
## 1 0.07321463
## 2 0.08384818
## 3 0.08125665
## 4 0.08011085
## 5 0.08018684
## 6 0.08432501
## 7 0.07354542
## 8 0.07718312
# Get the optimal parameters
N400_train2$bestTune
## mtry num.trees
## 5 9 75
# Fit the final random forest with the optimal hyperparameters from the grid search
N400_SYN_rf = ranger(N400_SYN_diff ~ perf_SYN + Cloz + Lex + Expos_years + AoA +
                       AoE + Expos_percent + Digit + Group,
                     data = N400_rf, num.trees = 75, mtry = 9,
                     importance = "permutation")
# Display the results
N400_SYN_rf
## Ranger result
##
## Call:
## ranger(N400_SYN_diff ~ perf_SYN + Cloz + Lex + Expos_years + AoA + AoE + Expos_percent + Digit + Group, data = N400_rf, num.trees = 75, mtry = 9, importance = "permutation")
##
## Type: Regression
## Number of trees: 75
## Sample size: 228
## Number of independent variables: 9
## Mtry: 9
## Target node size: 5
## Variable importance mode: permutation
## Splitrule: variance
## OOB prediction error (MSE): 0.3447479
## R squared (OOB): 0.7801556
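As a consistency check, the OOB R squared reported above can be recomputed from the out-of-bag predictions stored in the ranger object; the value should closely match (minor differences can arise from the variance denominator convention):
# Consistency check: recompute the OOB R squared from the OOB predictions
obs = N400_rf$N400_SYN_diff
oob = N400_SYN_rf$predictions
1 - sum((obs - oob)^2) / sum((obs - mean(obs))^2)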
varimps = round(ranger::importance(N400_SYN_rf),3)
# Show variable importance
rev(sort(varimps))
## perf_SYN Cloz Expos_percent Lex Expos_years
## 1.235 0.934 0.854 0.766 0.734
## AoE Digit AoA Group
## 0.344 0.288 0.057 0.009
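If a formal test of these permutation importances is desired, ranger provides importance_pvalues(); the call below is a sketch using the Altmann permutation approach, where num.permutations = 100 is only illustrative (larger values give more stable p-values):
# Sketch: p-values for the permutation importances (Altmann method);
# num.permutations = 100 is illustrative only
ranger::importance_pvalues(N400_SYN_rf, method = "altmann",
                           num.permutations = 100,
                           formula = N400_SYN_diff ~ perf_SYN + Cloz + Lex +
                             Expos_years + AoA + AoE + Expos_percent + Digit + Group,
                           data = N400_rf)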
# Define plot function
plotCoefficientsRF.fnc = function(importance, color = "#CD5555", title = "") {
  # Order importance values from smallest to largest
  importance = importance[order(importance)]
  # Extract predictor names
  predictors = names(importance)
  # Define colors for non-zero and zero importance values
  colors = c(color, paste0(color, "66"))
  # Draw the dot plot (the title argument is accepted but not used here)
  dotplot(importance, labels = predictors,
          xlab = "variable importance", ylab = "predictors",
          col = colors[as.numeric(importance == 0) + 1], pch = 19,
          scales = list(x = list(at = seq(-1, 2, by = 0.1),
                                 labels = round(seq(-1, 2, by = 0.1), 1))),
          panel = function(...) {
            panel.abline(v = 0, lty = "dotted", col = "black")
            panel.dotplot(...)
          },
          par.settings = list(fontsize = list(text = 12, points = 10)))
}
# Set plotting parameters
par(mfrow = c(1, 1))
# Plot the variable importance values
plotCoefficientsRF.fnc(importance = varimps, color = "#CD5555")
# Refit the forest with only the most important variables; the OOB R squared changes minimally
N400_SYN_small = ranger(N400_SYN_diff ~ perf_SYN + Cloz + Lex + Expos_years +
                          Expos_percent + AoE + Digit + AoA,
                        data = N400_rf, num.trees = 75, mtry = 8,
                        importance = "permutation")
N400_SYN_small
## Ranger result
##
## Call:
## ranger(N400_SYN_diff ~ perf_SYN + Cloz + Lex + Expos_years + Expos_percent + AoE + Digit + AoA, data = N400_rf, num.trees = 75, mtry = 8, importance = "permutation")
##
## Type: Regression
## Number of trees: 75
## Sample size: 228
## Number of independent variables: 8
## Mtry: 8
## Target node size: 5
## Variable importance mode: permutation
## Splitrule: variance
## OOB prediction error (MSE): 0.3280273
## R squared (OOB): 0.7908182
# Conditional inference tree with those predictors, using ctree()
SYN_N400.ctree = party::ctree(N400_SYN_diff ~ perf_SYN + Cloz + Lex + Expos_years +
                                AoA + AoE + Expos_percent + Digit,
                              data = N400_rf)
plot(SYN_N400.ctree, type = "simple")
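To see how the observations are distributed across the terminal nodes of the tree, party's where() function returns the terminal node assignments (an optional inspection step):
# Optional: number of observations in each terminal node
table(party::where(SYN_N400.ctree))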
# Define grid for the number of trees and mtry
num_trees = round(exp(seq(1, 10, by = 0.5)))
num_trees
## [1] 3 4 7 12 20 33 55 90 148 245 403
## [12] 665 1097 1808 2981 4915 8103 13360 22026
m_try = seq.int(from = 1, to = 9, by = 1)
m_try
## [1] 1 2 3 4 5 6 7 8 9
tuneGrid = expand.grid(mtry = m_try, num.trees = num_trees)
# Define the cross-validation parameters
control = trainControl(method = "cv", number = 10)
# Run the grid search using the train() function of the caret package
P600_train1 = train(P600_SYN_diff ~ perf_SYN + Cloz + Lex + Expos_years + AoA +
                      AoE + Expos_percent + Digit + Group,
                    data = P600_rf, method = ranger_type$ranger,
                    trControl = control, tuneGrid = tuneGrid, verbose = TRUE)
# Determine the best tune within the coarse search
P600_train1$bestTune
## mtry num.trees
## 80 5 12
# Define number of trees for a finer search
num_trees = c(55,60,65,70,75,80,85,90)
num_trees
## [1] 55 60 65 70 75 80 85 90
# Build the finer grid with mtry fixed at 9
tuneGrid = expand.grid(mtry = 9, num.trees = num_trees)
# Train the model for the finer search
P600_train2 = train(P600_SYN_diff ~ perf_SYN + Cloz + Lex + Expos_years + AoA +
                      AoE + Expos_percent + Digit + Group,
                    data = P600_rf, method = ranger_type$ranger,
                    trControl = control, tuneGrid = tuneGrid, verbose = TRUE)
# Get the results
P600_train2$results
## mtry num.trees RMSE Rsquared MAE RMSESD RsquaredSD
## 1 9 55 0.8702288 0.8638882 0.6761300 0.1300984 0.04975262
## 2 9 60 0.8738700 0.8655279 0.6834018 0.1694993 0.04616770
## 3 9 65 0.8324870 0.8797935 0.6423902 0.1415153 0.02757596
## 4 9 70 0.8358277 0.8752699 0.6509530 0.1264561 0.04104217
## 5 9 75 0.8446587 0.8718606 0.6622036 0.1364608 0.04286072
## 6 9 80 0.8541157 0.8726248 0.6684847 0.1484835 0.03729327
## 7 9 85 0.8561210 0.8715816 0.6610140 0.1386923 0.03797779
## 8 9 90 0.8477884 0.8715780 0.6575440 0.1217036 0.03999921
## MAESD
## 1 0.10454606
## 2 0.14152867
## 3 0.11841205
## 4 0.10383647
## 5 0.11360874
## 6 0.12086884
## 7 0.10409330
## 8 0.09533142
# Get the optimal parameters
P600_train2$bestTune
## mtry num.trees
## 3 9 65
# Fit the final random forest for the P600 effect
P600_SYN_rf = ranger(P600_SYN_diff ~ perf_SYN + Cloz + Lex + Expos_years + AoA +
                       AoE + Expos_percent + Digit + Group,
                     data = P600_rf, num.trees = 85, mtry = 9,
                     importance = "permutation")
# Display the results
P600_SYN_rf
## Ranger result
##
## Call:
## ranger(P600_SYN_diff ~ perf_SYN + Cloz + Lex + Expos_years + AoA + AoE + Expos_percent + Digit + Group, data = P600_rf, num.trees = 85, mtry = 9, importance = "permutation")
##
## Type: Regression
## Number of trees: 85
## Sample size: 228
## Number of independent variables: 9
## Mtry: 9
## Target node size: 5
## Variable importance mode: permutation
## Splitrule: variance
## OOB prediction error (MSE): 0.5555332
## R squared (OOB): 0.898504
varimps = round(ranger::importance(P600_SYN_rf),3)
# Show variable importance
rev(sort(varimps))
## Expos_percent Lex perf_SYN Cloz Expos_years
## 15.473 5.560 1.846 1.302 0.987
## AoE Digit AoA Group
## 0.752 0.585 0.218 0.095
# Redefine the plot function with a wider x-axis range for the larger P600 importances
plotCoefficientsRF.fnc = function(importance, color = "#CD5555", title = "") {
  # Order importance values from smallest to largest
  importance = importance[order(importance)]
  # Extract predictor names
  predictors = names(importance)
  # Define colors for non-zero and zero importance values
  colors = c(color, paste0(color, "66"))
  # Draw the dot plot (the title argument is accepted but not used here)
  dotplot(importance, labels = predictors,
          xlab = "variable importance", ylab = "predictors",
          col = colors[as.numeric(importance == 0) + 1], pch = 19,
          scales = list(x = list(at = seq(-1, 20, by = 1),
                                 labels = round(seq(-1, 20, by = 1), 1))),
          panel = function(...) {
            panel.abline(v = 0, lty = "dotted", col = "black")
            panel.dotplot(...)
          },
          par.settings = list(fontsize = list(text = 12, points = 10)))
}
# Set plotting parameters
par(mfrow = c(1, 1))
# Plot the variable importance values
plotCoefficientsRF.fnc(importance = varimps, color = "#CD5555")
# Refit the forest with only the most important variables; the OOB R squared changes minimally
P600_SYN_small = ranger(P600_SYN_diff ~ perf_SYN + Cloz + Lex + Expos_years +
                          Expos_percent,
                        data = P600_rf, num.trees = 85, mtry = 5,
                        importance = "permutation")
P600_SYN_small
## Ranger result
##
## Call:
## ranger(P600_SYN_diff ~ perf_SYN + Cloz + Lex + Expos_years + Expos_percent, data = P600_rf, num.trees = 85, mtry = 5, importance = "permutation")
##
## Type: Regression
## Number of trees: 85
## Sample size: 228
## Number of independent variables: 5
## Mtry: 5
## Target node size: 5
## Variable importance mode: permutation
## Splitrule: variance
## OOB prediction error (MSE): 0.6073759
## R squared (OOB): 0.8890324
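Since ranger stores the OOB R squared in the fitted object, the full and reduced forests for both components can be summarized side by side; a small sketch:
# Compare the OOB R squared of the full and reduced forests
data.frame(model = c("N400 full", "N400 reduced", "P600 full", "P600 reduced"),
           r.squared = c(N400_SYN_rf$r.squared, N400_SYN_small$r.squared,
                         P600_SYN_rf$r.squared, P600_SYN_small$r.squared))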
# Conditional inference tree with those predictors, using ctree()
SYN_P600.ctree = party::ctree(P600_SYN_diff ~ perf_SYN + Cloz + Lex + Expos_years +
                                Expos_percent,
                              data = P600_rf)
plot(SYN_P600.ctree, type = "simple")
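The in-sample fit of this tree can also be compared against the observed difference waves; a brief sketch (in-sample, so the estimate is optimistic):
# In-sample R squared of the conditional inference tree (optimistic estimate)
ctree_pred = predict(SYN_P600.ctree)
cor(ctree_pred, P600_rf$P600_SYN_diff)^2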