This notebook collects log data from the first series of SQuAD non-PCE runs and constructs a data frame with hyperparameters and performance results. A simple linear regression model is then used to evaluate how significant each of the hyperparameters is in determining the F1 score.
# Location of the local checkout and of the folder holding the training runs.
base_dir <- 'C:/Users/mjkam/repos/GITHUB/squadNonPCE/'
workDir <- path(base_dir, 'save1/train/')
# Names of all entries found under workDir; each one is later handed to
# ParseLog as a run directory.
runDirs <- list.files(workDir)
Log directory: C:/Users/mjkam/repos/GITHUB/squadNonPCE/save1/train
Log count: 60
The log directories are named as a concatenation of the following:
A GRU example: GRU211-CET-SAF-01 and an LSTM one: LSTM112-CEF-SAT-02
Parse the file log.txt in each of the run directories.
Extract the arguments from the first 100 lines of each log.
Extract the last occurrence of Dev results.
The “ParseLog” function below reads the top of the log where all arguments and values are listed, and the last row where evaluation metrics are reported. By parsing these lines it extracts a superset of the hyperparameters and metrics used for this analysis.
# ParseLog: read one run's log.txt and return its hyperparameters and final
# dev metrics as a named character vector.
#
# Arguments:
#   workDir  directory that contains the run directories
#   runDir   name of the run directory (inside workDir) that holds log.txt
#
# Returns a named character vector: 'run_dir' first, then one element per
# "key: value" line of the Args block, then one element per metric found on
# the last "Dev NLL:" line. Metric names are kept exactly as they appear
# between the commas (everything after the first metric keeps its leading
# space); downstream renaming depends on those exact names.
#
# Stops with an error when no Args block is found in the first 100 lines.
# Warns and returns the arguments only when the log has no "Dev NLL:" line.
ParseLog <- function(workDir, runDir) {
  log_file <- file.path(workDir, runDir, 'log.txt')
  d <- readLines(log_file, warn = FALSE)

  # Split "key:value" at the first colon only, so that values which contain
  # colons themselves are not truncated.
  split_pair <- function(x) {
    parts <- strsplit(x, ":", fixed = TRUE)[[1]]
    if (length(parts) > 2L) {
      parts <- c(parts[1], paste(parts[-1], collapse = ":"))
    }
    parts
  }

  # The argument dump is searched for in the first 100 lines only.
  head_n <- min(length(d), 100L)
  head_lines <- d[seq_len(head_n)]
  arg_ix_start <- grep("Args", head_lines, fixed = TRUE)[1]
  if (is.na(arg_ix_start)) {
    stop("No 'Args' line in the first 100 lines of ", log_file, call. = FALSE)
  }
  # grep() on a sub-range returns positions relative to that sub-range, so the
  # first closing brace has to be shifted back to an absolute line number.
  brace_offset <- grep("}", head_lines[arg_ix_start:head_n], fixed = TRUE)[1]
  if (is.na(brace_offset)) {
    stop("Args block is not closed within the first 100 lines of ", log_file,
         call. = FALSE)
  }
  arg_ix_end <- arg_ix_start + brace_offset - 1L
  d1 <- if (arg_ix_end > arg_ix_start + 1L) {
    d[(arg_ix_start + 1L):(arg_ix_end - 1L)]
  } else {
    character(0)
  }

  # Strip double quotes, then blanks and commas (and a leading '.' or '*').
  d1 <- gsub('\"', '', d1)
  d1 <- gsub(' |,||^[.*]', '', d1)

  # The gpu_ids list may span several lines; glue them back into one entry.
  gpu_id_start <- grep("gpu_ids", d1, fixed = TRUE)[1]
  if (is.na(gpu_id_start)) {
    arg_lines <- d1
  } else {
    gpu_id_len <- grep("]", d1[gpu_id_start:length(d1)], fixed = TRUE)[1]
    if (is.na(gpu_id_len)) {
      gpu_id_len <- 1L  # no closing bracket found: treat as a one-line entry
    }
    gpu_id_end <- gpu_id_start + gpu_id_len - 1L
    arg_lines <- c(
      d1[seq_len(gpu_id_start - 1L)],
      paste(d1[gpu_id_start:gpu_id_end], collapse = ""),
      d1[seq_len(length(d1) - gpu_id_end) + gpu_id_end]
    )
  }
  d3 <- lapply(arg_lines, split_pair)

  # Metrics come from the last "Dev NLL:" line: drop everything up to the
  # first "] ", then split the remainder on commas and colons.
  dev_ix <- grep("Dev NLL:", d, fixed = TRUE)
  if (length(dev_ix) == 0L) {
    warning("No 'Dev NLL:' line in ", log_file, "; returning arguments only",
            call. = FALSE)
    d6 <- list()
  } else {
    last_dev_line <- d[dev_ix[length(dev_ix)]]
    metrics_text <- strsplit(last_dev_line, "] ", fixed = TRUE)[[1]][2]
    d6 <- lapply(strsplit(metrics_text, ",", fixed = TRUE)[[1]], split_pair)
  }

  pairs <- c(d3, d6)
  hparms <- c(runDir, vapply(pairs, function(p) p[2], character(1)))
  names(hparms) <- c('run_dir', vapply(pairs, function(p) p[1], character(1)))
  hparms
}
Applying this log extraction function to all run directories we get a list of named character vectors for each log.
# Parse every run directory; the result is a list with one named character
# vector per run.
log <- lapply(runDirs, ParseLog, workDir = workDir)
Since there may be missing values or different arguments in each log, we collect the union of all hyperparameter names across all logs, fill any hyperparameter that is missing from a log with the placeholder value '0', and convert the list of vectors to a data frame.
We then cast numeric and logical values from character strings to their appropriate type. The resulting data frame “data” contains all arguments passed into the “train.py” script and the last evaluation metrics.
# Union of every name that occurs in any parsed log, in order of first
# appearance.
all_names <- unique(unlist(lapply(log, names)))

# Pad each log vector to the full name set (absent entries become '0'),
# put the entries in a common order, and stack the vectors row-wise.
data <- do.call(rbind, lapply(log, function(run) {
  absent <- setdiff(all_names, names(run))
  run[absent] <- '0'
  run[all_names]
}))
# Turn the character matrix into a data frame, give the metric columns
# convenient names, and cast every column used later to its proper type.
#
# stringsAsFactors = FALSE keeps all columns as character on R < 4.0 too; with
# factor columns as.numeric() would return level codes, not the logged values.
# The names Dev.NLL / X.F1 / X.EM / X.AvNA are presumably what data.frame()
# makes of the raw metric names when it forces them to be syntactic.
data <- data.frame(data, stringsAsFactors = FALSE) %>%
  rename(
    Dev_NLL = Dev.NLL,
    Dev_F1 = X.F1,
    Dev_EM = X.EM,
    Dev_AvNA = X.AvNA
  ) %>%
  mutate(
    # Numeric hyperparameters
    batch_size = as.numeric(batch_size),
    char_kernel_size = as.numeric(char_kernel_size),
    char_out_channels = as.numeric(char_out_channels),
    drop_prob = as.numeric(drop_prob),
    ema_decay = as.numeric(ema_decay),
    eval_steps = as.numeric(eval_steps),
    hidden_size = as.numeric(hidden_size),
    l2_wd = as.numeric(l2_wd),
    lr = as.numeric(lr),
    max_ans_len = as.numeric(max_ans_len),
    max_checkpoints = as.numeric(max_checkpoints),
    max_grad_norm = as.numeric(max_grad_norm),
    num_epochs = as.numeric(num_epochs),
    num_visuals = as.numeric(num_visuals),
    num_workers = as.numeric(num_workers),
    seed = as.numeric(seed),
    rnn_layers_enc = as.numeric(rnn_layers_enc),
    rnn_layers_mod = as.numeric(rnn_layers_mod),
    rnn_layers_mod2 = as.numeric(rnn_layers_mod2),
    # Evaluation metrics
    Dev_NLL = as.numeric(Dev_NLL),
    Dev_F1 = as.numeric(Dev_F1),
    Dev_EM = as.numeric(Dev_EM),
    Dev_AvNA = as.numeric(Dev_AvNA),
    # Flags: TRUE when the logged text starts with "t" (case-insensitive)
    char_embeddings = (substr(tolower(char_embeddings), 1, 1) == "t"),
    maximize_metric = (substr(tolower(maximize_metric), 1, 1) == "t"),
    self_att = (substr(tolower(self_att), 1, 1) == "t"),
    use_squad_v2 = (substr(tolower(use_squad_v2), 1, 1) == "t")
  )
Runs that are interrupted and restarted, or for any other reason extended from the last step forward by loading the last known weights, appear as two or more records in our data frame. We want to remove the rows representing the earlier legs and report only the last leg of each run.
# Runs that were started from saved weights: keep only rows with a load path,
# reduce that path to the directory it points into (everything from "/step"
# onwards is cut off and "./" is prepended), and keep the columns of interest.
extended_runs <- data %>%
  filter(load_path != 'null') %>%
  mutate(load_path = paste0("./", gsub("/step.*", "", load_path))) %>%
  select(save_dir, load_path, num_epochs, Dev_F1)

# Show the directories that served as load paths.
as.list(extended_runs["load_path"])
$load_path
character(0)
To implement this logic, we anti-join all run records against the runs that have a non-empty load path, matching each record's save directory to a load-path directory. The anti-join drops every run whose directory was later used as a load path, retaining only the runs that were never continued.
# Drop every run whose save directory was used as another run's load path,
# derive the GRU indicator from rnn_type, and keep only the modelling columns.
df <- data %>%
  anti_join(extended_runs, by = c("save_dir" = "load_path")) %>%
  transmute(
    save_dir, num_epochs, char_embeddings, self_att, lr, l2_wd, drop_prob,
    GRU = (rnn_type == 'GRU'),
    rnn_layers_enc, rnn_layers_mod, rnn_layers_mod2, Dev_F1
  )
df
# Full candidate model for Dev_F1: linear and squared terms for the learning
# rate, log(1 - l2_wd) and the drop probability, plus the architecture
# variables.
fit <- lm(
  formula = Dev_F1 ~ char_embeddings +
    lr + I(lr^2) +
    I(log(1 - l2_wd)) + I(log(1 - l2_wd)^2) +
    I(drop_prob) + I(drop_prob^2) +
    GRU +
    rnn_layers_enc + rnn_layers_mod + I(rnn_layers_mod2 * self_att),
  data = df
)
# summary(fit)

# Backward stepwise selection starting from the full model
# (an F test could be requested with test = "F").
step(fit, direction = "backward")
Start: AIC=126.6
Dev_F1 ~ char_embeddings + lr + I(lr^2) + I(log(1 - l2_wd)) +
I(log(1 - l2_wd)^2) + I(drop_prob) + I(drop_prob^2) + GRU +
rnn_layers_enc + rnn_layers_mod + I(rnn_layers_mod2 * self_att)
Df Sum of Sq RSS AIC
- I(drop_prob) 1 0.885 332.61 124.76
- I(rnn_layers_mod2 * self_att) 1 2.557 334.28 125.06
- GRU 1 4.562 336.29 125.42
- I(log(1 - l2_wd)^2) 1 5.472 337.19 125.58
- I(log(1 - l2_wd)) 1 6.035 337.76 125.68
<none> 331.72 126.60
- rnn_layers_mod 1 13.121 344.84 126.92
- I(lr^2) 1 13.587 345.31 127.01
- lr 1 20.069 351.79 128.12
- I(drop_prob^2) 1 21.950 353.67 128.44
- rnn_layers_enc 1 28.621 360.34 129.56
- char_embeddings 1 33.975 365.70 130.45
Step: AIC=124.76
Dev_F1 ~ char_embeddings + lr + I(lr^2) + I(log(1 - l2_wd)) +
I(log(1 - l2_wd)^2) + I(drop_prob^2) + GRU + rnn_layers_enc +
rnn_layers_mod + I(rnn_layers_mod2 * self_att)
Df Sum of Sq RSS AIC
- I(rnn_layers_mod2 * self_att) 1 2.202 334.81 123.15
- GRU 1 4.741 337.35 123.61
- I(log(1 - l2_wd)^2) 1 5.573 338.18 123.75
- I(log(1 - l2_wd)) 1 5.788 338.40 123.79
<none> 332.61 124.76
- rnn_layers_mod 1 12.708 345.32 125.01
- I(lr^2) 1 15.373 347.98 125.47
- lr 1 21.969 354.58 126.59
- rnn_layers_enc 1 27.954 360.56 127.60
- char_embeddings 1 33.292 365.90 128.48
- I(drop_prob^2) 1 207.559 540.17 151.85
Step: AIC=123.15
Dev_F1 ~ char_embeddings + lr + I(lr^2) + I(log(1 - l2_wd)) +
I(log(1 - l2_wd)^2) + I(drop_prob^2) + GRU + rnn_layers_enc +
rnn_layers_mod
Df Sum of Sq RSS AIC
- GRU 1 5.012 339.82 122.04
- I(log(1 - l2_wd)) 1 5.222 340.03 122.08
- I(log(1 - l2_wd)^2) 1 5.744 340.55 122.17
<none> 334.81 123.15
- rnn_layers_mod 1 12.611 347.42 123.37
- I(lr^2) 1 15.879 350.69 123.93
- lr 1 22.428 357.24 125.04
- rnn_layers_enc 1 26.017 360.83 125.64
- char_embeddings 1 33.027 367.84 126.80
- I(drop_prob^2) 1 205.412 540.22 149.86
Step: AIC=122.04
Dev_F1 ~ char_embeddings + lr + I(lr^2) + I(log(1 - l2_wd)) +
I(log(1 - l2_wd)^2) + I(drop_prob^2) + rnn_layers_enc + rnn_layers_mod
Df Sum of Sq RSS AIC
- I(log(1 - l2_wd)^2) 1 3.984 343.81 120.74
- I(log(1 - l2_wd)) 1 7.247 347.07 121.31
<none> 339.82 122.04
- rnn_layers_mod 1 11.852 351.67 122.10
- I(lr^2) 1 13.834 353.65 122.44
- lr 1 20.116 359.94 123.50
- rnn_layers_enc 1 26.463 366.28 124.54
- char_embeddings 1 37.265 377.09 126.29
- I(drop_prob^2) 1 205.724 545.54 148.45
Step: AIC=120.74
Dev_F1 ~ char_embeddings + lr + I(lr^2) + I(log(1 - l2_wd)) +
I(drop_prob^2) + rnn_layers_enc + rnn_layers_mod
Df Sum of Sq RSS AIC
<none> 343.81 120.74
- rnn_layers_mod 1 13.633 357.44 121.08
- I(lr^2) 1 16.836 360.64 121.61
- lr 1 24.187 367.99 122.82
- rnn_layers_enc 1 27.861 371.67 123.42
- char_embeddings 1 35.084 378.89 124.57
- I(log(1 - l2_wd)) 1 200.171 543.98 146.27
- I(drop_prob^2) 1 271.538 615.34 153.67
Call:
lm(formula = Dev_F1 ~ char_embeddings + lr + I(lr^2) + I(log(1 -
l2_wd)) + I(drop_prob^2) + rnn_layers_enc + rnn_layers_mod,
data = df)
Coefficients:
(Intercept) char_embeddingsTRUE lr I(lr^2)
76.8665 1.6167 -21.4383 10.5691
I(log(1 - l2_wd)) I(drop_prob^2) rnn_layers_enc rnn_layers_mod
2116.9768 -48.3220 -1.4688 -0.9724
The stepwise regression shown above starts with a linear model with all variables and progressively removes the least significant ones. The variables used are:
The next sequence attempts selecting the most significant variables using both backward and forward optimization. This process starts with a minimal model, and at each step the AIC score is calculated for each potential variable addition or existing variable removal from the model. The action chosen is the one that reduces the AIC score the most:
Start: AIC=168.67
Dev_F1 ~ char_embeddings
Df Sum of Sq RSS AIC F value Pr(>F)
+ I(drop_prob^2) 1 310.078 623.28 146.44 28.3570 1.777e-06 ***
+ I(drop_prob) 1 258.438 674.92 151.22 21.8261 1.865e-05 ***
+ I(log(1 - l2_wd)^2) 1 253.357 680.00 151.66 21.2372 2.331e-05 ***
+ I(log(1 - l2_wd)) 1 163.511 769.85 159.11 12.1064 0.0009702 ***
<none> 933.36 168.67
+ rnn_layers_enc 1 24.036 909.33 169.10 1.5067 0.2246950
+ rnn_layers_mod 1 20.997 912.36 169.30 1.3118 0.2568559
+ I(rnn_layers_mod2 * self_att) 1 12.556 920.81 169.85 0.7772 0.3816882
+ lr 1 6.733 926.63 170.23 0.4142 0.5224296
+ GRU 1 4.460 928.90 170.38 0.2737 0.6029124
- char_embeddings 1 61.177 994.54 170.48 3.8016 0.0560435 .
+ I(lr^2) 1 2.928 930.43 170.48 0.1794 0.6735252
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Step: AIC=146.44
Dev_F1 ~ char_embeddings + I(drop_prob^2)
Df Sum of Sq RSS AIC F value Pr(>F)
+ I(log(1 - l2_wd)^2) 1 158.466 464.82 130.84 19.0916 5.448e-05 ***
+ I(log(1 - l2_wd)) 1 134.988 488.30 133.79 15.4811 0.0002324 ***
+ lr 1 39.570 583.71 144.50 3.7962 0.0563869 .
+ I(lr^2) 1 33.015 590.27 145.17 3.1322 0.0822035 .
+ rnn_layers_mod 1 21.763 601.52 146.31 2.0261 0.1601685
<none> 623.28 146.44
+ rnn_layers_enc 1 16.090 607.19 146.87 1.4840 0.2282604
- char_embeddings 1 34.094 657.38 147.63 3.1179 0.0827901 .
+ I(rnn_layers_mod2 * self_att) 1 3.983 619.30 148.06 0.3602 0.5508181
+ I(drop_prob) 1 1.423 621.86 148.30 0.1281 0.7217145
+ GRU 1 0.005 623.28 148.44 0.0004 0.9836896
- I(drop_prob^2) 1 310.078 933.36 168.67 28.3570 1.777e-06 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Step: AIC=130.84
Dev_F1 ~ char_embeddings + I(drop_prob^2) + I(log(1 - l2_wd)^2)
Df Sum of Sq RSS AIC F value Pr(>F)
+ lr 1 61.697 403.12 124.29 8.4177 0.005334 **
+ I(lr^2) 1 53.192 411.63 125.55 7.1074 0.010059 *
+ rnn_layers_enc 1 45.793 419.02 126.61 6.0107 0.017423 *
+ rnn_layers_mod 1 19.667 445.15 130.24 2.4300 0.124771
<none> 464.82 130.84
+ GRU 1 2.411 462.41 132.53 0.2868 0.594466
+ I(log(1 - l2_wd)) 1 0.730 464.09 132.74 0.0865 0.769790
+ I(drop_prob) 1 0.328 464.49 132.80 0.0388 0.844605
+ I(rnn_layers_mod2 * self_att) 1 0.040 464.78 132.83 0.0048 0.945073
- char_embeddings 1 32.830 497.65 132.93 3.9553 0.051620 .
- I(log(1 - l2_wd)^2) 1 158.466 623.28 146.44 19.0916 5.448e-05 ***
- I(drop_prob^2) 1 215.187 680.00 151.66 25.9252 4.309e-06 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Step: AIC=124.29
Dev_F1 ~ char_embeddings + I(drop_prob^2) + I(log(1 - l2_wd)^2) +
lr
Df Sum of Sq RSS AIC F value Pr(>F)
+ rnn_layers_enc 1 34.014 369.11 121.00 4.9762 0.029873 *
+ I(lr^2) 1 20.301 382.82 123.19 2.8636 0.096369 .
+ rnn_layers_mod 1 14.169 388.95 124.15 1.9672 0.166469
<none> 403.12 124.29
+ GRU 1 2.589 400.53 125.91 0.3490 0.557137
+ I(drop_prob) 1 0.840 402.28 126.17 0.1128 0.738259
+ I(log(1 - l2_wd)) 1 0.508 402.61 126.22 0.0682 0.794990
+ I(rnn_layers_mod2 * self_att) 1 0.250 402.87 126.26 0.0335 0.855495
- char_embeddings 1 37.688 440.81 127.66 5.1419 0.027305 *
- lr 1 61.697 464.82 130.84 8.4177 0.005334 **
- I(log(1 - l2_wd)^2) 1 180.594 583.71 144.50 24.6395 7.044e-06 ***
- I(drop_prob^2) 1 250.429 653.55 151.28 34.1675 2.857e-07 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Step: AIC=121
Dev_F1 ~ char_embeddings + I(drop_prob^2) + I(log(1 - l2_wd)^2) +
lr + rnn_layers_enc
Df Sum of Sq RSS AIC F value Pr(>F)
<none> 369.11 121.00
+ I(lr^2) 1 11.944 357.16 121.03 1.7723 0.188790
+ rnn_layers_mod 1 10.970 358.14 121.19 1.6234 0.208174
+ GRU 1 3.388 365.72 122.45 0.4910 0.486535
+ I(log(1 - l2_wd)) 1 3.023 366.08 122.51 0.4377 0.511116
+ I(rnn_layers_mod2 * self_att) 1 2.274 366.83 122.63 0.3286 0.568935
+ I(drop_prob) 1 1.181 367.93 122.81 0.1701 0.681689
- rnn_layers_enc 1 34.014 403.12 124.29 4.9762 0.029873 *
- char_embeddings 1 49.853 418.96 126.61 7.2934 0.009225 **
- lr 1 49.918 419.02 126.61 7.3030 0.009182 **
- I(log(1 - l2_wd)^2) 1 204.836 573.94 145.49 29.9674 1.170e-06 ***
- I(drop_prob^2) 1 226.496 595.60 147.71 33.1362 4.179e-07 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Call:
lm(formula = Dev_F1 ~ char_embeddings + I(drop_prob^2) + I(log(1 -
l2_wd)^2) + lr + rnn_layers_enc, data = df)
Coefficients:
(Intercept) char_embeddingsTRUE I(drop_prob^2) I(log(1 - l2_wd)^2)
6.768e+01 1.882e+00 -4.429e+01 -4.343e+05
lr rnn_layers_enc
-3.440e+00 -1.578e+00
As shown by the results, the most significant predictors of F1 are the drop probability squared and the squared log-transformed L2 weight decay term, log(1 - l2_wd)^2, followed by the learning rate, the use of character embeddings, and, to a lesser degree, the number of encoder RNN layers.
# List the ten best runs by dev F1, best first.
df %>%
  arrange(desc(Dev_F1)) %>%
  top_n(10, Dev_F1)

# Refit the model chosen by the stepwise search and report its coefficients.
fit <- lm(
  formula = Dev_F1 ~ char_embeddings + I(drop_prob^2) +
    I(log(1 - l2_wd)^2) + lr + rnn_layers_enc,
  data = df
)
summary(fit)
Call:
lm(formula = Dev_F1 ~ char_embeddings + I(drop_prob^2) + I(log(1 -
l2_wd)^2) + lr + rnn_layers_enc, data = df)
Residuals:
Min 1Q Median 3Q Max
-7.3423 -1.5781 0.3982 1.4679 5.0064
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 6.768e+01 1.513e+00 44.734 < 2e-16 ***
char_embeddingsTRUE 1.882e+00 6.969e-01 2.701 0.00922 **
I(drop_prob^2) -4.429e+01 7.694e+00 -5.756 4.18e-07 ***
I(log(1 - l2_wd)^2) -4.343e+05 7.934e+04 -5.474 1.17e-06 ***
lr -3.440e+00 1.273e+00 -2.702 0.00918 **
rnn_layers_enc -1.578e+00 7.074e-01 -2.231 0.02987 *
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 2.614 on 54 degrees of freedom
Multiple R-squared: 0.6289, Adjusted R-squared: 0.5945
F-statistic: 18.3 on 5 and 54 DF, p-value: 1.386e-10
# Draw the plots for the fitted model object.
plot(x = fit)