Section 3. Growing random forests for the Syntactic minus Correct condition

3.1. Preliminaries

Packages

The ranger package (Wright & Ziegler, 2017) provides functions for fitting random forests. The caret package (Kuhn, 2018) provides an easy-to-use interface for grid searches and cross-validation, which we use to find optimal hyperparameters for the random forest models. The lattice package (Sarkar, 2008) contains the dotplot() function, used here to plot variable importance. The party package (Hothorn et al., 2019) provides the ctree() function for building conditional inference trees that visualize the effects of the most important predictors identified by the random forest analyses.

library(ranger)
library(caret)
library(lattice)
library(party)

Load datasets

# Load the individual characteristics
indiv_char <- read.table("indiv_char.txt", sep = " ", dec = ".", header = TRUE)
# Load the difference waves for the N400 and the P600 effects
P600 <- read.table("P600.txt", sep = "\t", dec = ".", header = TRUE)
N400 <- read.table("N400.txt", sep = "\t", dec = ".", header = TRUE)
# Merge those files
P600_rf <- merge(P600, indiv_char, by = "Subj_NUM")
N400_rf <- merge(N400, indiv_char, by = "Subj_NUM")
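
As a quick sanity check, we can verify that no observations were lost in the merge; Subj_NUM values without a match in indiv_char would silently drop rows (a minimal sketch):

# Each merged data frame should have one row per EEG observation
stopifnot(nrow(N400_rf) == nrow(N400),
          nrow(P600_rf) == nrow(P600))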

Set seed

We set a seed to ensure that the results are reproducible.

set.seed(19)
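
Note that when its seed argument is left at the default, ranger() derives its internal random seed from R's RNG, so the set.seed() call above covers the model fits below. ranger() also exposes seed and num.threads arguments directly; fixing both in the call is an additional option for exact replication across machines (a sketch, not used in the analyses below):

# Optional: make a forest fully deterministic regardless of threading
# (illustrative; the analyses below rely on set.seed() alone)
# rf = ranger(formula, data, seed = 19, num.threads = 1)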

Modify caret's ranger method so that num.trees can be tuned alongside mtry

This modification was developed by Tomaschek, Hendrix & Baayen (2018).

# Get the existing method for the ranger() function
ranger_type = getModelInfo(model = "ranger")
ranger_type
# Change the parameters that may be tuned to include num.trees
ranger_type$ranger$parameters = data.frame("parameter" = c("mtry", "num.trees"),
                                           "class" = c("numeric", "numeric"),
                                           "label" = c("Selected Predictors", 
                                                       "Number of Trees"))

# Edit the model fit function to include a num.trees argument in the call to the ranger() function
ranger_type$ranger$fit = function (x, y, wts, param, lev, last, classProbs, 
                                   ...) {
  if ((!is.data.frame(x)) || dplyr::is.tbl(x)) 
    x <- as.data.frame(x)
  x$.outcome <- y
  if (!is.null(wts)) {
    out <- ranger::ranger(dependent.variable.name = ".outcome", 
                          data = x, mtry = param$mtry, num.trees = param$num.trees,
                          probability = classProbs, case.weights = wts, ...)
  }
  else {
    out <- ranger::ranger(dependent.variable.name = ".outcome", 
                          data = x, mtry = param$mtry, num.trees = param$num.trees, 
                          write.forest = TRUE, probability = classProbs, ...)
  }
  if (!last) 
    out$y <- y
  out
}
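
Before applying the modified method to our data, we can confirm that it accepts num.trees as a tuning parameter. A minimal check on a built-in dataset (mtcars; purely illustrative and not part of the analyses):

# Fit a tiny grid on mtcars to verify that num.trees is tunable
sanity_check = train(mpg ~ ., data = mtcars,
                     method = ranger_type$ranger,
                     trControl = trainControl(method = "cv", number = 3),
                     tuneGrid = expand.grid(mtry = 2, num.trees = c(50, 100)))
sanity_check$bestTune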

3.2. The N400 effect (at C3-Cz-C4)

Determine optimal hyperparameters

Use the train() function to find optimal mtry and num.trees values

# Define grid for the number of trees and mtry
num_trees = round(exp(seq(1, 10, by = 0.5)))
num_trees
##  [1]     3     4     7    12    20    33    55    90   148   245   403
## [12]   665  1097  1808  2981  4915  8103 13360 22026
m_try = seq.int(from = 1, to = 9, by = 1)
m_try
## [1] 1 2 3 4 5 6 7 8 9
tuneGrid = expand.grid(mtry = m_try, num.trees = num_trees)
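
The coarse grid is log-spaced in num.trees, so it covers very small and very large forests at similar relative resolution. Crossed with the nine mtry values, it yields 171 candidate settings, each of which will be refit ten times under the cross-validation scheme defined next:

# 9 mtry values x 19 num.trees values
nrow(tuneGrid)
## [1] 171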
# Define the cross-validation parameters
control = trainControl(method = "cv", number = 10)
# Run the grid search using the train() function of the caret package
N400_train1 = train(N400_SYN_diff ~ perf_SYN + Cloz + Lex + Expos_years + AoA +
                      AoE + Expos_percent + Digit + Group,
                    data = N400_rf, method = ranger_type$ranger,
                    trControl = control, tuneGrid = tuneGrid, verbose = TRUE)
# Determine the best tune within the coarse search
N400_train1$bestTune
##     mtry num.trees
## 120    7        33
# Define number of trees for a finer search
num_trees = c(55,60,65,70,75,80,85,90)
num_trees
## [1] 55 60 65 70 75 80 85 90
# Build the finer grid, fixing mtry at 9
tuneGrid = expand.grid(mtry = 9, num.trees = num_trees)
# Train the model for the finer search
N400_train2 = train(N400_SYN_diff ~ perf_SYN + Cloz + Lex + Expos_years + AoA +
                      AoE + Expos_percent + Digit + Group,
                    data = N400_rf, method = ranger_type$ranger,
                    trControl = control, tuneGrid = tuneGrid, verbose = TRUE)
# Get the results
N400_train2$results
##   mtry num.trees      RMSE  Rsquared       MAE    RMSESD RsquaredSD
## 1    9        55 0.6413150 0.7470572 0.4857619 0.1233127  0.1141406
## 2    9        60 0.6355050 0.7534881 0.4878034 0.1299704  0.1147481
## 3    9        65 0.6367395 0.7541397 0.4820074 0.1351460  0.1070299
## 4    9        70 0.6277046 0.7572004 0.4735017 0.1242126  0.1101118
## 5    9        75 0.6169342 0.7658238 0.4737354 0.1258288  0.1181183
## 6    9        80 0.6330687 0.7546658 0.4832025 0.1325586  0.1132532
## 7    9        85 0.6231476 0.7624165 0.4735788 0.1019422  0.1106340
## 8    9        90 0.6406396 0.7486418 0.4907704 0.1251805  0.1156777
##        MAESD
## 1 0.07321463
## 2 0.08384818
## 3 0.08125665
## 4 0.08011085
## 5 0.08018684
## 6 0.08432501
## 7 0.07354542
## 8 0.07718312
# Get the optimal parameters
N400_train2$bestTune
##   mtry num.trees
## 5    9        75

Random forest model

Fit the final model with the ranger() function, using the optimal hyperparameters.

N400_SYN_rf = ranger(N400_SYN_diff ~ perf_SYN + Cloz + Lex + Expos_years + AoA +
                       AoE + Expos_percent + Digit + Group,
                     data = N400_rf, num.trees = 75, mtry = 9,
                     importance = "permutation")
# Display the results
N400_SYN_rf
## Ranger result
## 
## Call:
##  ranger(N400_SYN_diff ~ perf_SYN + Cloz + Lex + Expos_years +      AoA + AoE + Expos_percent + Digit + Group, data = N400_rf,      num.trees = 75, mtry = 9, importance = "permutation") 
## 
## Type:                             Regression 
## Number of trees:                  75 
## Sample size:                      228 
## Number of independent variables:  9 
## Mtry:                             9 
## Target node size:                 5 
## Variable importance mode:         permutation 
## Splitrule:                        variance 
## OOB prediction error (MSE):       0.3447479 
## R squared (OOB):                  0.7801556
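
The R squared (OOB) reported by ranger() equals one minus the OOB mean squared error divided by the variance of the response. As a check, it can be recomputed by hand from the fitted object (a sketch; prediction.error holds the OOB MSE):

# Should match the R squared (OOB) value printed above
1 - N400_SYN_rf$prediction.error / var(N400_rf$N400_SYN_diff)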

Plot variable importance

# Extract permutation variable importance from the fitted forest
varimps = round(ranger::importance(N400_SYN_rf), 3)

# Show variable importance
rev(sort(varimps))
##      perf_SYN          Cloz Expos_percent           Lex   Expos_years 
##         1.235         0.934         0.854         0.766         0.734 
##           AoE         Digit           AoA         Group 
##         0.344         0.288         0.057         0.009
# Define plot function
plotCoefficientsRF.fnc = function(importance, color = "#CD5555",title="") {
  
  # Order importance
  importance = importance[order(importance)]
  
  # Define predictor names
  predictors = names(importance)
  
  # Define colors for non-zero and zero importance
  colors = c(color,paste0(color,"66"))
  
  # Plot
  dotplot(importance, labels = predictors,
          xlab = "variable importance", ylab = "predictors",
          col = colors[as.numeric(importance==0)+1], pch = 19,
          scales = list(x = list(at = seq(-1, 2, by = 0.1),
                                 labels = round(seq(-1, 2, by = 0.1),1))),
          panel = function(...) {
            panel.abline(v = 0, lty = "dotted", col = "black")
            panel.dotplot(...)
          },
          par.settings = list(fontsize = list(text = 12, points = 10))
  )
  
}

# Set plotting parameters
par(mfrow = c(1, 1))

# Plot
plotCoefficientsRF.fnc(importance = varimps, color = "#CD5555",
                       title = "N400 variable importance")

One regression tree for visualisation

# Reduced random forest model with the most important variables; the
# R squared (OOB) changes only minimally relative to the full model
N400_SYN_small = ranger(N400_SYN_diff ~ perf_SYN + Cloz + Lex + Expos_years +
                          Expos_percent + AoE + Digit + AoA,
                        data = N400_rf, num.trees = 75, mtry = 8,
                        importance = "permutation")
N400_SYN_small
## Ranger result
## 
## Call:
##  ranger(N400_SYN_diff ~ perf_SYN + Cloz + Lex + Expos_years +      Expos_percent + AoE + Digit + AoA, data = N400_rf, num.trees = 75,      mtry = 8, importance = "permutation") 
## 
## Type:                             Regression 
## Number of trees:                  75 
## Sample size:                      228 
## Number of independent variables:  8 
## Mtry:                             8 
## Target node size:                 5 
## Variable importance mode:         permutation 
## Splitrule:                        variance 
## OOB prediction error (MSE):       0.3280273 
## R squared (OOB):                  0.7908182
# Tree using the ctree() function with those predictors
SYN_N400.ctree = party::ctree(N400_SYN_diff ~ perf_SYN + Cloz + Lex +
                                Expos_years + AoA + AoE + Expos_percent +
                                Digit, data = N400_rf)
plot(SYN_N400.ctree, type = "simple")
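
If the unconstrained tree is too large to read comfortably, party::ctree_control() can cap its depth (a sketch; maxdepth = 3 is an arbitrary illustrative choice):

# Grow a shallower tree for presentation purposes
SYN_N400.ctree_small = party::ctree(N400_SYN_diff ~ perf_SYN + Cloz + Lex +
                                      Expos_years + AoA + AoE + Expos_percent +
                                      Digit, data = N400_rf,
                                    controls = party::ctree_control(maxdepth = 3))
plot(SYN_N400.ctree_small, type = "simple")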

3.3. The P600 effect (at P3-Pz-P4)

Determine optimal hyperparameters

Use the train() function to find optimal mtry and num.trees values

# Define grid for the number of trees and mtry
num_trees = round(exp(seq(1, 10, by = 0.5)))
num_trees
##  [1]     3     4     7    12    20    33    55    90   148   245   403
## [12]   665  1097  1808  2981  4915  8103 13360 22026
m_try = seq.int(from = 1, to = 9, by = 1)
m_try
## [1] 1 2 3 4 5 6 7 8 9
tuneGrid = expand.grid(mtry = m_try, num.trees = num_trees)
# Define the cross-validation parameters
control = trainControl(method = "cv", number = 10)
# Run the grid search using the train() function of the caret package
P600_train1 = train(P600_SYN_diff ~ perf_SYN + Cloz + Lex + Expos_years + AoA +
                      AoE + Expos_percent + Digit + Group,
                    data = P600_rf, method = ranger_type$ranger,
                    trControl = control, tuneGrid = tuneGrid, verbose = TRUE)
# Determine the best tune within the coarse search
P600_train1$bestTune
##    mtry num.trees
## 80    5        12
# Define number of trees for a finer search
num_trees = c(55,60,65,70,75,80,85,90)
num_trees
## [1] 55 60 65 70 75 80 85 90
# Build the finer grid, fixing mtry at 9
tuneGrid = expand.grid(mtry = 9, num.trees = num_trees)
# Train the model for the finer search
P600_train2 = train(P600_SYN_diff ~ perf_SYN + Cloz + Lex + Expos_years + AoA +
                      AoE + Expos_percent + Digit + Group,
                    data = P600_rf, method = ranger_type$ranger,
                    trControl = control, tuneGrid = tuneGrid, verbose = TRUE)
# Get the results
P600_train2$results
##   mtry num.trees      RMSE  Rsquared       MAE    RMSESD RsquaredSD
## 1    9        55 0.8702288 0.8638882 0.6761300 0.1300984 0.04975262
## 2    9        60 0.8738700 0.8655279 0.6834018 0.1694993 0.04616770
## 3    9        65 0.8324870 0.8797935 0.6423902 0.1415153 0.02757596
## 4    9        70 0.8358277 0.8752699 0.6509530 0.1264561 0.04104217
## 5    9        75 0.8446587 0.8718606 0.6622036 0.1364608 0.04286072
## 6    9        80 0.8541157 0.8726248 0.6684847 0.1484835 0.03729327
## 7    9        85 0.8561210 0.8715816 0.6610140 0.1386923 0.03797779
## 8    9        90 0.8477884 0.8715780 0.6575440 0.1217036 0.03999921
##        MAESD
## 1 0.10454606
## 2 0.14152867
## 3 0.11841205
## 4 0.10383647
## 5 0.11360874
## 6 0.12086884
## 7 0.10409330
## 8 0.09533142
# Get the optimal parameters
P600_train2$bestTune
##   mtry num.trees
## 3    9        65

Random forest model

Fit the final model with the ranger() function, using the optimal hyperparameters.

P600_SYN_rf = ranger(P600_SYN_diff ~ perf_SYN + Cloz + Lex + Expos_years + AoA +
                       AoE + Expos_percent + Digit + Group,
                     data = P600_rf, num.trees = 85, mtry = 9,
                     importance = "permutation")
# Display the results
P600_SYN_rf
## Ranger result
## 
## Call:
##  ranger(P600_SYN_diff ~ perf_SYN + Cloz + Lex + Expos_years +      AoA + AoE + Expos_percent + Digit + Group, data = P600_rf,      num.trees = 85, mtry = 9, importance = "permutation") 
## 
## Type:                             Regression 
## Number of trees:                  85 
## Sample size:                      228 
## Number of independent variables:  9 
## Mtry:                             9 
## Target node size:                 5 
## Variable importance mode:         permutation 
## Splitrule:                        variance 
## OOB prediction error (MSE):       0.5555332 
## R squared (OOB):                  0.898504
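
Fitted values can be inspected in two ways: the fitted object stores out-of-bag predictions, and predict() returns in-sample predictions for (new) data (a sketch):

# Out-of-bag predictions stored in the fitted object
head(P600_SYN_rf$predictions)
# In-sample predictions via predict()
head(predict(P600_SYN_rf, data = P600_rf)$predictions)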

Plot variable importance

# Extract permutation variable importance from the fitted forest
varimps = round(ranger::importance(P600_SYN_rf), 3)

# Show variable importance
rev(sort(varimps))
## Expos_percent           Lex      perf_SYN          Cloz   Expos_years 
##        15.473         5.560         1.846         1.302         0.987 
##           AoE         Digit           AoA         Group 
##         0.752         0.585         0.218         0.095
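
Because Expos_percent dominates the P600 importance ranking, it can be informative to express each predictor's importance relative to the strongest one (a sketch; values follow from the output above):

# Importance as a percentage of the top predictor
round(100 * varimps / max(varimps), 1)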
# Redefine the plot function with a wider x-axis range to accommodate
# the larger P600 importance values
plotCoefficientsRF.fnc = function(importance, color = "#CD5555", title = "") {
  
  # Order predictors by importance
  importance = importance[order(importance)]
  
  # Define predictor names
  predictors = names(importance)
  
  # Define colors for non-zero and zero importance
  colors = c(color, paste0(color, "66"))
  
  # Plot, using the title (if any) as the main header
  dotplot(importance, labels = predictors,
          main = if (nzchar(title)) title else NULL,
          xlab = "variable importance", ylab = "predictors",
          col = colors[as.numeric(importance == 0) + 1], pch = 19,
          scales = list(x = list(at = seq(-1, 20, by = 1),
                                 labels = round(seq(-1, 20, by = 1), 1))),
          panel = function(...) {
            panel.abline(v = 0, lty = "dotted", col = "black")
            panel.dotplot(...)
          },
          par.settings = list(fontsize = list(text = 12, points = 10))
  )
  
}

# Set plotting parameters
par(mfrow = c(1, 1))

# Plot
plotCoefficientsRF.fnc(importance = varimps, color = "#CD5555",
                       title = "P600 variable importance")

One regression tree for visualisation

# Reduced random forest model with the five most important variables; the
# R squared (OOB) changes only minimally relative to the full model
P600_SYN_small = ranger(P600_SYN_diff ~ perf_SYN + Cloz + Lex + Expos_years +
                          Expos_percent,
                        data = P600_rf, num.trees = 85, mtry = 5,
                        importance = "permutation")
P600_SYN_small
## Ranger result
## 
## Call:
##  ranger(P600_SYN_diff ~ perf_SYN + Cloz + Lex + Expos_years +      Expos_percent, data = P600_rf, num.trees = 85, mtry = 5,      importance = "permutation") 
## 
## Type:                             Regression 
## Number of trees:                  85 
## Sample size:                      228 
## Number of independent variables:  5 
## Mtry:                             5 
## Target node size:                 5 
## Variable importance mode:         permutation 
## Splitrule:                        variance 
## OOB prediction error (MSE):       0.6073759 
## R squared (OOB):                  0.8890324
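
The r.squared element stored in each fitted object makes the comparison between the full and the reduced model explicit (a sketch):

# OOB R squared of the full versus the reduced model
c(full = P600_SYN_rf$r.squared, reduced = P600_SYN_small$r.squared)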
# Tree using the ctree() function with those predictors
SYN_P600.ctree = party::ctree(P600_SYN_diff ~ perf_SYN + Cloz + Lex +
                                Expos_years + Expos_percent, data = P600_rf)
plot(SYN_P600.ctree, type = "simple")