This notebook collects log data from the second series of SQuAD non-PCE runs and constructs a data frame with hyperparameters and performance results. A simple linear regression model is then used to evaluate how significant each of the hyperparameters is in determining the F1 score.
# Root of the local repository checkout and the folder that holds the
# training run directories.
base_dir <- "C:/Users/mjkam/repos/GITHUB/squadNonPCE/"
workDir <- path(base_dir, "save2/train/")
# Names of all entries found directly under the training folder.
runDirs <- list.files(workDir)
Log directory: C:/Users/mjkam/repos/GITHUB/squadNonPCE/save2/train
Log count: 20
The log directories are named as a concatenation of the following:
For example: GRU210-CET-SAF-04
Parse the file log.txt in each of the run directories.
Extract the arguments from the first 100 lines of each log.
Extract the last occurrence of Dev results.
The “ParseLog” function below reads the top of the log where all arguments and values are listed, and the last row where evaluation metrics are reported. By parsing these lines it extracts a superset of the hyperparameters and metrics used for this analysis.
#' Parse the arguments and the final Dev metrics from one run's log file.
#'
#' Reads `<workDir>/<runDir>/log.txt`, extracts the `Args` block that is
#' listed within the first 100 lines, and the last line containing
#' "Dev NLL:".
#'
#' @param workDir Directory that contains the run directories.
#' @param runDir Name of a single run directory inside `workDir`.
#' @return A named character vector: `run_dir` first, then one element per
#'   argument found in the `Args` block, then one element per metric on the
#'   last Dev line. Metric names and values are NOT trimmed (e.g. the names
#'   are "Dev NLL", " F1", " EM", " AvNA"), because code that consumes this
#'   vector refers to the columns derived from exactly these names.
ParseLog <- function(workDir, runDir) {
  # Split "key:value" strings at the FIRST colon only, so that values which
  # themselves contain a colon are kept whole. Strings without a colon, or
  # with nothing after it, get an NA value.
  split_key_value <- function(x) {
    pos <- as.integer(regexpr(":", x, fixed = TRUE))
    found <- !is.na(pos) & pos > 0L
    key <- x
    value <- rep(NA_character_, length(x))
    key[found] <- substr(x[found], 1L, pos[found] - 1L)
    value[found] <- substr(x[found], pos[found] + 1L, nchar(x[found]))
    value[!is.na(value) & value == ""] <- NA_character_
    list(key = key, value = value)
  }

  log_file <- file.path(workDir, runDir, "log.txt")
  log_lines <- readLines(log_file, warn = FALSE)
  head_lines <- log_lines[seq_len(min(100L, length(log_lines)))]

  # Locate the argument block. Both positions are ABSOLUTE line numbers, so
  # the block is found correctly even when "Args" is not on the first line.
  arg_ix_start <- grep("Args", head_lines, fixed = TRUE)[1]
  if (is.na(arg_ix_start)) {
    stop("No 'Args' line in the first 100 lines of ", log_file, call. = FALSE)
  }
  closing <- grep("}", head_lines, fixed = TRUE)
  closing <- closing[closing > arg_ix_start]
  if (length(closing) == 0L) {
    stop("No closing '}' after the 'Args' line in the first 100 lines of ",
         log_file, call. = FALSE)
  }
  arg_ix_end <- closing[1]
  arg_lines <- log_lines[arg_ix_start + seq_len(arg_ix_end - arg_ix_start - 1L)]

  # Strip quotes, then apply the original clean-up pattern unchanged; on the
  # argument lines it removes blanks and commas.
  arg_lines <- gsub('\"', '', arg_lines)
  arg_lines <- gsub(' |,||^[.*]', '', arg_lines)

  # The gpu_ids value is a list that may be printed over several lines;
  # collapse those lines into a single "gpu_ids:[...]" entry.
  gpu_id_start <- grep("gpu_ids", arg_lines, fixed = TRUE)[1]
  if (!is.na(gpu_id_start)) {
    gpu_id_len <- grep("]", arg_lines[gpu_id_start:length(arg_lines)],
                       fixed = TRUE)[1]
    if (!is.na(gpu_id_len)) {
      gpu_rows <- gpu_id_start + seq_len(gpu_id_len) - 1L
      arg_lines <- c(
        arg_lines[seq_len(gpu_id_start - 1L)],
        paste(arg_lines[gpu_rows], collapse = ""),
        arg_lines[-seq_len(gpu_id_start + gpu_id_len - 1L)]
      )
    }
  }

  # Last reported evaluation line: drop the "[timestamp] " prefix and split
  # the remainder into "name: value" items.
  dev_ix <- grep("Dev NLL:", log_lines, fixed = TRUE)
  if (length(dev_ix) == 0L) {
    warning("No 'Dev NLL:' line found in ", log_file, call. = FALSE)
    metric_items <- character(0)
  } else {
    last_dev <- log_lines[dev_ix[length(dev_ix)]]
    payload <- strsplit(last_dev, "] ", fixed = TRUE)[[1]][2]
    if (is.na(payload)) {
      payload <- last_dev
    }
    metric_items <- strsplit(payload, ",", fixed = TRUE)[[1]]
  }

  args_kv <- split_key_value(arg_lines)
  metrics_kv <- split_key_value(metric_items)

  hparms <- c(runDir, args_kv$value, metrics_kv$value)
  names(hparms) <- c("run_dir", args_kv$key, metrics_kv$key)
  hparms
}
Applying this log extraction function to all run directories we get a list of named character vectors for each log.
# Parse every run directory; the result holds one named character vector
# per entry of runDirs, in the same order.
log <- lapply(runDirs, function(run_dir) ParseLog(workDir, run_dir))
Since there may be missing values or different arguments in each log, we collect the union of all hyperparameters across all logs, fill any hyperparameter that is missing from a log with the placeholder value '0', and convert the list of vectors to a data frame.
We then cast numeric and logical values from character strings to their appropriate type. The resulting data frame “data” contains all arguments passed into the “train.py” script and the last evaluation metrics.
# Union of every field name that appears in any parsed log.
all_names <- unique(unlist(lapply(log, names)))

# Align all runs on the same set of fields: a field that is absent from a
# log is filled with the string '0', and every vector is put in the same
# order so the rows can be stacked into a character matrix.
data <- do.call(rbind, lapply(log, function(x) {
  missing <- setdiff(all_names, names(x))
  x[missing] <- "0"
  x[all_names]
}))

# Columns that hold numbers, and columns that hold true/false flags.
numeric_cols <- c(
  "batch_size", "char_kernel_size", "char_out_channels", "drop_prob",
  "ema_decay", "eval_steps", "hidden_size", "l2_wd", "lr", "max_ans_len",
  "max_checkpoints", "max_grad_norm", "num_epochs", "num_visuals",
  "num_workers", "seed", "Dev_NLL", "Dev_F1", "Dev_EM", "Dev_AvNA",
  "rnn_layers_enc", "rnn_layers_mod", "rnn_layers_mod2"
)
logical_cols <- c(
  "char_embeddings", "maximize_metric", "self_att", "use_squad_v2"
)

# stringsAsFactors = FALSE keeps the columns as character on every R version.
# With factors (the default before R 4.0), as.numeric() would return the
# level codes instead of the parsed values.
# data.frame() also makes the metric names syntactic ("Dev NLL" -> Dev.NLL,
# " F1" -> X.F1, ...), which is what the rename below relies on.
data <- data.frame(data, stringsAsFactors = FALSE) %>%
  rename(
    Dev_NLL = Dev.NLL,
    Dev_F1 = X.F1,
    Dev_EM = X.EM,
    Dev_AvNA = X.AvNA
  )

# Cast the character columns to their proper types. A flag is TRUE when its
# text starts with "t" (case-insensitive); anything else is FALSE.
data[numeric_cols] <- lapply(data[numeric_cols], as.numeric)
data[logical_cols] <- lapply(
  data[logical_cols],
  function(flag) substr(tolower(flag), 1, 1) == "t"
)
Runs that are interrupted and restarted, or for any other reason extended from the last step forward by loading the last known weights, appear as two records in our data frame. We want to remove the rows representing the early legs and report only the last leg of each run.
# Runs whose log recorded a checkpoint to load (load_path is not 'null').
extended_runs <- select(
  filter(data, load_path != 'null'),
  save_dir, load_path, num_epochs, Dev_F1
)
# Reduce each checkpoint path to a directory: cut everything from "/step"
# onwards and prefix the result with "./".
extended_runs <- mutate(
  extended_runs,
  load_path = paste0("./", gsub("/step.*", "", load_path))
)
# Show the directories that were used as load paths.
as.list(extended_runs["load_path"])
$load_path
character(0)
To implement this logic, we take the anti-join between non-empty load-path directories and all run directories. The anti-join drops all directories matching load paths, retaining only directories that have not been used as load paths.
For this set of runs we exclude the RNN Type, the Self-Attention and Character Embedding indicators since they are the same for all runs.
# Drop every run whose save_dir was used as another run's load_path, then
# keep only the columns needed for the regression.
df <- anti_join(data, extended_runs, by = c("save_dir" = "load_path"))
df <- select(
  df,
  save_dir, num_epochs, lr, l2_wd, drop_prob,
  rnn_layers_enc, rnn_layers_mod, rnn_layers_mod2, Dev_F1
)
df
# Full model: first- and second-degree terms for the learning rate,
# log(1 - l2_wd) and the drop probability, plus the encoder and modeling
# RNN layer counts.
fit <- lm(
  Dev_F1 ~
    lr + I(lr^2) +
    I(log(1 - l2_wd)) + I(log(1 - l2_wd)^2) +
    I(drop_prob) + I(drop_prob^2) +
    rnn_layers_enc + rnn_layers_mod,
  data = df
)
# summary(fit)
# Backward elimination starting from the full model
# (test = "F" is left disabled).
step(fit, direction = "backward")
Start: AIC=-1.97
Dev_F1 ~ lr + I(lr^2) + I(log(1 - l2_wd)) + I(log(1 - l2_wd)^2) +
I(drop_prob) + I(drop_prob^2) + rnn_layers_enc + rnn_layers_mod
Df Sum of Sq RSS AIC
- I(log(1 - l2_wd)^2) 1 0.003 7.370 -3.965
- rnn_layers_enc 1 0.530 7.898 -2.583
- I(drop_prob^2) 1 0.545 7.913 -2.544
- I(log(1 - l2_wd)) 1 0.654 8.022 -2.271
<none> 7.368 -1.972
- rnn_layers_mod 1 2.236 9.604 1.328
- I(drop_prob) 1 3.409 10.776 3.633
- I(lr^2) 1 20.342 27.710 22.521
- lr 1 39.228 46.596 32.915
Step: AIC=-3.97
Dev_F1 ~ lr + I(lr^2) + I(log(1 - l2_wd)) + I(drop_prob) + I(drop_prob^2) +
rnn_layers_enc + rnn_layers_mod
Df Sum of Sq RSS AIC
- I(drop_prob^2) 1 0.592 7.963 -4.419
- rnn_layers_enc 1 0.618 7.989 -4.354
<none> 7.370 -3.965
- rnn_layers_mod 1 2.237 9.607 -0.664
- I(drop_prob) 1 3.464 10.834 1.739
- I(lr^2) 1 20.381 27.752 20.551
- lr 1 39.237 46.607 30.920
- I(log(1 - l2_wd)) 1 56.441 63.811 37.204
Step: AIC=-4.42
Dev_F1 ~ lr + I(lr^2) + I(log(1 - l2_wd)) + I(drop_prob) + rnn_layers_enc +
rnn_layers_mod
Df Sum of Sq RSS AIC
<none> 7.963 -4.419
- rnn_layers_mod 1 1.723 9.686 -2.501
- rnn_layers_enc 1 1.881 9.844 -2.178
- I(lr^2) 1 20.185 28.148 18.835
- I(drop_prob) 1 22.543 30.505 20.443
- lr 1 38.894 46.856 29.027
- I(log(1 - l2_wd)) 1 61.739 69.702 36.970
Call:
lm(formula = Dev_F1 ~ lr + I(lr^2) + I(log(1 - l2_wd)) + I(drop_prob) +
rnn_layers_enc + rnn_layers_mod, data = df)
Coefficients:
(Intercept) lr I(lr^2) I(log(1 - l2_wd)) I(drop_prob)
52.301 28.999 -25.752 -9176.239 14.549
rnn_layers_enc rnn_layers_mod
1.127 1.063
The stepwise regression shown above starts with a linear model with all variables and progressively removes the least significant ones. The variables used are:
The next sequence attempts selecting the most significant variables using both backward and forward optimization. This process starts with a minimal model, and at each step the AIC score is calculated for each potential variable addition or existing variable removal from the model. The action chosen is the one that reduces the AIC score the most:
Start: AIC=166.33
Dev_F1 ~ 0
Df Sum of Sq RSS AIC F value Pr(>F)
+ rnn_layers_mod 1 73811 8002 121.83 175.248 4.850e-11 ***
+ rnn_layers_enc 1 72360 9453 125.17 145.442 2.382e-10 ***
+ lr 1 59027 22786 142.76 49.220 1.112e-06 ***
+ I(drop_prob) 1 49002 32811 150.06 28.376 3.851e-05 ***
+ I(lr^2) 1 41158 40655 154.34 19.235 0.0003178 ***
+ I(drop_prob^2) 1 39558 42255 155.12 17.788 0.0004663 ***
+ I(log(1 - l2_wd)) 1 37623 44190 156.01 16.177 0.0007286 ***
+ I(log(1 - l2_wd)^2) 1 28705 53108 159.69 10.270 0.0046654 **
<none> 81813 166.33
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Step: AIC=121.84
Dev_F1 ~ rnn_layers_mod - 1
Df Sum of Sq RSS AIC F value Pr(>F)
+ lr 1 3174 4829 113.73 11.8314 0.002923 **
+ I(lr^2) 1 2096 5906 117.76 6.3895 0.021055 *
+ I(drop_prob) 1 1130 6872 120.79 2.9605 0.102464
+ I(drop_prob^2) 1 969 7033 121.25 2.4795 0.132749
+ I(log(1 - l2_wd)^2) 1 837 7165 121.62 2.1035 0.164163
+ I(log(1 - l2_wd)) 1 774 7229 121.80 1.9262 0.182114
<none> 8002 121.83
+ rnn_layers_enc 1 633 7369 122.19 1.5465 0.229603
- rnn_layers_mod 1 73811 81813 166.33 175.2484 4.85e-11 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Step: AIC=113.73
Dev_F1 ~ rnn_layers_mod + lr - 1
Df Sum of Sq RSS AIC F value Pr(>F)
+ rnn_layers_enc 1 1499.2 3329.3 108.30 7.6553 0.013194 *
+ I(drop_prob) 1 1378.7 3449.8 109.01 6.7940 0.018429 *
+ I(log(1 - l2_wd)) 1 1334.0 3494.6 109.27 6.4895 0.020819 *
+ I(lr^2) 1 1306.4 3522.2 109.42 6.3052 0.022436 *
+ I(log(1 - l2_wd)^2) 1 1145.5 3683.0 110.31 5.2875 0.034424 *
+ I(drop_prob^2) 1 907.6 3920.9 111.57 3.9352 0.063676 .
<none> 4828.6 113.73
- lr 1 3173.8 8002.4 121.83 11.8314 0.002923 **
- rnn_layers_mod 1 17957.3 22785.8 142.76 66.9414 1.777e-07 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Step: AIC=108.3
Dev_F1 ~ rnn_layers_mod + lr + rnn_layers_enc - 1
Df Sum of Sq RSS AIC F value Pr(>F)
+ I(lr^2) 1 1172.9 2156.4 101.61 8.7030 0.0094090 **
+ I(drop_prob) 1 1061.0 2268.4 102.62 7.4836 0.0146608 *
+ I(drop_prob^2) 1 988.6 2340.8 103.25 6.7573 0.0193609 *
+ I(log(1 - l2_wd)) 1 972.6 2356.7 103.39 6.6031 0.0205691 *
+ I(log(1 - l2_wd)^2) 1 914.5 2414.8 103.87 6.0592 0.0255742 *
- rnn_layers_mod 1 112.1 3441.5 106.96 0.5725 0.4596128
<none> 3329.3 108.30
- rnn_layers_enc 1 1499.2 4828.6 113.73 7.6553 0.0131943 *
- lr 1 4039.9 7369.2 122.19 20.6282 0.0002888 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Step: AIC=101.61
Dev_F1 ~ rnn_layers_mod + lr + rnn_layers_enc + I(lr^2) - 1
Df Sum of Sq RSS AIC F value Pr(>F)
+ I(log(1 - l2_wd)) 1 555.96 1600.4 97.646 5.2107 0.0374543 *
+ I(log(1 - l2_wd)^2) 1 533.06 1623.3 97.930 4.9256 0.0423001 *
+ I(drop_prob) 1 502.40 1654.0 98.304 4.5562 0.0497100 *
+ I(drop_prob^2) 1 452.58 1703.8 98.898 3.9844 0.0644075 .
- rnn_layers_mod 1 0.74 2157.1 99.616 0.0055 0.9419486
<none> 2156.4 101.609
- I(lr^2) 1 1172.94 3329.3 108.296 8.7030 0.0094090 **
- rnn_layers_enc 1 1365.80 3522.2 109.422 10.1340 0.0057758 **
- lr 1 2410.68 4567.1 114.618 17.8868 0.0006382 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Step: AIC=97.65
Dev_F1 ~ rnn_layers_mod + lr + rnn_layers_enc + I(lr^2) + I(log(1 -
l2_wd)) - 1
Df Sum of Sq RSS AIC F value Pr(>F)
+ I(drop_prob) 1 299.22 1301.2 95.506 3.2194 0.0943885 .
+ I(drop_prob^2) 1 294.29 1306.1 95.582 3.1544 0.0974544 .
- rnn_layers_mod 1 0.09 1600.5 95.647 0.0009 0.9767836
<none> 1600.4 97.646
+ I(log(1 - l2_wd)^2) 1 7.10 1593.3 99.557 0.0624 0.8063901
- I(log(1 - l2_wd)) 1 555.96 2156.4 101.609 5.2107 0.0374543 *
- I(lr^2) 1 756.30 2356.7 103.386 7.0884 0.0177460 *
- rnn_layers_enc 1 1110.95 2711.4 106.190 10.4124 0.0056451 **
- lr 1 1859.11 3459.5 111.063 17.4245 0.0008142 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Step: AIC=95.51
Dev_F1 ~ rnn_layers_mod + lr + rnn_layers_enc + I(lr^2) + I(log(1 -
l2_wd)) + I(drop_prob) - 1
Df Sum of Sq RSS AIC F value Pr(>F)
- rnn_layers_mod 1 0.25 1301.5 93.510 0.0027 0.959057
<none> 1301.2 95.506
+ I(log(1 - l2_wd)^2) 1 29.20 1272.0 97.052 0.2984 0.594116
+ I(drop_prob^2) 1 1.68 1299.5 97.480 0.0168 0.898717
- I(drop_prob) 1 299.22 1600.4 97.646 3.2194 0.094388 .
- I(log(1 - l2_wd)) 1 352.79 1654.0 98.304 3.7957 0.071718 .
- I(lr^2) 1 461.76 1763.0 99.580 4.9682 0.042715 *
- rnn_layers_enc 1 1010.07 2311.3 104.997 10.8676 0.005298 **
- lr 1 1346.60 2647.8 107.715 14.4883 0.001926 **
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Step: AIC=93.51
Dev_F1 ~ lr + rnn_layers_enc + I(lr^2) + I(log(1 - l2_wd)) +
I(drop_prob) - 1
Df Sum of Sq RSS AIC F value Pr(>F)
<none> 1301.5 93.510
+ I(log(1 - l2_wd)^2) 1 29.4 1272.0 95.052 0.3241 0.5781574
+ I(drop_prob^2) 1 1.1 1300.4 95.493 0.0119 0.9145407
+ rnn_layers_mod 1 0.3 1301.2 95.506 0.0027 0.9590570
- I(drop_prob) 1 299.1 1600.5 95.647 3.4468 0.0831232 .
- I(log(1 - l2_wd)) 1 352.6 1654.1 96.305 4.0640 0.0620818 .
- I(lr^2) 1 483.3 1784.7 97.826 5.5699 0.0322415 *
- lr 1 1504.2 2805.7 106.873 17.3367 0.0008317 ***
- rnn_layers_enc 1 3949.9 5251.4 119.410 45.5250 6.555e-06 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Call:
lm(formula = Dev_F1 ~ lr + rnn_layers_enc + I(lr^2) + I(log(1 -
l2_wd)) + I(drop_prob) - 1, data = df)
Coefficients:
lr rnn_layers_enc I(lr^2) I(log(1 - l2_wd)) I(drop_prob)
132.72 19.79 -111.28 -21034.90 50.09
As shown by the results, the most significant predictors of F1 are a second-degree function of the learning rate, the log-transformed L2 weight decay term log(1 − l2_wd), the drop probability and the number of encoder RNN layers.
# Ten best runs, highest Dev F1 first.
top_n(arrange(df, desc(Dev_F1)), 10, Dev_F1)

# Refit the model chosen by the stepwise search (no intercept) and report
# the coefficient table.
fit <- lm(
  formula = Dev_F1 ~ lr + rnn_layers_enc + I(lr^2) + I(log(1 - l2_wd)) +
    I(drop_prob) - 1,
  data = df
)
summary(fit)
Call:
lm(formula = Dev_F1 ~ lr + rnn_layers_enc + I(lr^2) + I(log(1 -
l2_wd)) + I(drop_prob) - 1, data = df)
Residuals:
Min 1Q Median 3Q Max
-21.374 -3.111 1.488 4.947 16.853
Coefficients:
Estimate Std. Error t value Pr(>|t|)
lr 132.724 31.876 4.164 0.000832 ***
rnn_layers_enc 19.785 2.932 6.747 6.56e-06 ***
I(lr^2) -111.284 47.153 -2.360 0.032241 *
I(log(1 - l2_wd)) -21034.905 10434.252 -2.016 0.062082 .
I(drop_prob) 50.093 26.982 1.857 0.083123 .
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 9.315 on 15 degrees of freedom
Multiple R-squared: 0.9841, Adjusted R-squared: 0.9788
F-statistic: 185.6 on 5 and 15 DF, p-value: 6.192e-13
# Draw the diagnostic plots for the fitted model.
plot(x = fit)