source("Pre-Processing.R")
source("Initial Models.R")
source("Each Variable Models.R")
# Number of CPU cores reported by detectCores(), kept in `cores`.
# NOTE(review): presumably consumed by the parallel model runs defined in the
# sourced scripts -- confirm against those files.
cores <- detectCores()
It’s a tricky and interesting exercise in parallelization methods to build simultaneously running models that use the same series of training and test sets as efficiently as possible. It turns out that the easiest / computationally cheapest way to do it is to:
model, statistic, and value. This result can be subset and averaged very conveniently. It populates the visuals below.
run_models_each_variable(5, "2009-06-30", 100)
run_models_each_variable(10, "2009-06-30", 100)
I’m including the two “toy” examples above, which, frankly, resemble Haldane’s charts more than any other simulations I’ve run. This training set size is actually a good analog for Haldane’s charts, which use a training set of n = 100 for a population of 4000. That’s a little naive, in my opinion.
Here’s a larger number of simulations with sample size = 500:
run_models_each_variable(100, "2009-06-30", 500)
run_models_each_variable(100, "2009-06-30", 1000)
With a larger number of simulations, the models begin to display what I originally expected: that (1) the out-of-sample full model, given a high enough training set size / number of trials, outperforms the individual models, and (2) that the out-of-sample error converges to the in-sample error as the parameters increase.
I ran exactly one trial of n = 1000 outside of this notebook, but I couldn’t paste the results here. At n > 500, the results above don’t change very much at all.
This tests our suspicion that the trials become more accurate when we look at a snapshot of the banks’ variables taken further along in time:
run_models(100, "2007-12-31", 500, "logreg")
in_sample_mse out_of_sample_mse in_sample_rmse out_of_sample_rmse in_sample_auc out_of_sample_auc
0.05988000 0.06143276 0.24367938 0.24782893 0.79321851 0.77510884
run_models(100, "2007-12-31", 500, "lasso")
in_sample_mse out_of_sample_mse in_sample_rmse out_of_sample_rmse in_sample_auc out_of_sample_auc
0.0595800 0.0607876 0.2431528 0.2465367 0.7751210 0.7521541
run_models(100, "2007-12-31", 500, "ridge")
in_sample_mse out_of_sample_mse in_sample_rmse out_of_sample_rmse in_sample_auc out_of_sample_auc
0.05964000 0.06065075 0.24326481 0.24625920 0.76571632 0.74345613
run_models(100, "2007-12-31", 500, "elastic_net", .2)
in_sample_mse out_of_sample_mse in_sample_rmse out_of_sample_rmse in_sample_auc out_of_sample_auc
0.05986000 0.06079179 0.24363308 0.24654302 0.76938985 0.74669381
run_models(100, "2008-06-30", 500, "logreg")
in_sample_mse out_of_sample_mse in_sample_rmse out_of_sample_rmse in_sample_auc out_of_sample_auc
0.05370000 0.05927019 0.23082066 0.24342706 0.85102721 0.83074214
run_models(100, "2008-06-30", 500, "lasso")
in_sample_mse out_of_sample_mse in_sample_rmse out_of_sample_rmse in_sample_auc out_of_sample_auc
0.05464000 0.05973038 0.23282764 0.24438512 0.83875202 0.82408068
run_models(100, "2008-06-30", 500, "ridge")
in_sample_mse out_of_sample_mse in_sample_rmse out_of_sample_rmse in_sample_auc out_of_sample_auc
0.05428000 0.05918831 0.23212989 0.24326930 0.84448234 0.82549120
run_models(100, "2008-06-30", 500, "elastic_net", .2)
in_sample_mse out_of_sample_mse in_sample_rmse out_of_sample_rmse in_sample_auc out_of_sample_auc
0.05370000 0.05957228 0.23094900 0.24405477 0.83574990 0.81645338
run_models(100, "2008-12-31", 500, "logreg")
in_sample_mse out_of_sample_mse in_sample_rmse out_of_sample_rmse in_sample_auc out_of_sample_auc
0.04758000 0.04997121 0.21728776 0.22349931 0.91140927 0.90130256
run_models(100, "2008-12-31", 500, "lasso")
in_sample_mse out_of_sample_mse in_sample_rmse out_of_sample_rmse in_sample_auc out_of_sample_auc
0.05216000 0.05404203 0.22749250 0.23244157 0.90784180 0.89680104
run_models(100, "2008-12-31", 500, "ridge")
in_sample_mse out_of_sample_mse in_sample_rmse out_of_sample_rmse in_sample_auc out_of_sample_auc
0.04940000 0.05122499 0.22140509 0.22626949 0.91362970 0.90276079
run_models(100, "2008-12-31", 500, "elastic_net", .2)
in_sample_mse out_of_sample_mse in_sample_rmse out_of_sample_rmse in_sample_auc out_of_sample_auc
0.04944000 0.05150137 0.22145863 0.22687640 0.91483492 0.90163495
run_models(100, "2009-06-30", 500, "logreg")
in_sample_mse out_of_sample_mse in_sample_rmse out_of_sample_rmse in_sample_auc out_of_sample_auc
0.03370000 0.04223308 0.18207327 0.20542223 0.92982727 0.90754315
run_models(100, "2009-06-30", 500, "lasso")
in_sample_mse out_of_sample_mse in_sample_rmse out_of_sample_rmse in_sample_auc out_of_sample_auc
0.04052000 0.04688594 0.20037015 0.21646890 0.91854289 0.90321161
run_models(100, "2009-06-30", 500, "ridge")
in_sample_mse out_of_sample_mse in_sample_rmse out_of_sample_rmse in_sample_auc out_of_sample_auc
0.03566000 0.04288215 0.18729524 0.20697075 0.92787092 0.91127114
run_models(100, "2009-06-30", 500, "elastic_net", .2)
in_sample_mse out_of_sample_mse in_sample_rmse out_of_sample_rmse in_sample_auc out_of_sample_auc
0.03524000 0.04335764 0.18630001 0.20809903 0.92934812 0.90848856
run_models(100, "2009-12-31", 500, "logreg")
in_sample_mse out_of_sample_mse in_sample_rmse out_of_sample_rmse in_sample_auc out_of_sample_auc
0.02790000 0.03111128 0.16566933 0.17633098 0.94888808 0.93243795
run_models(100, "2009-12-31", 500, "lasso")
in_sample_mse out_of_sample_mse in_sample_rmse out_of_sample_rmse in_sample_auc out_of_sample_auc
0.03560000 0.03671235 0.18770336 0.19150699 0.94246103 0.92961871
run_models(100, "2009-12-31", 500, "ridge")
in_sample_mse out_of_sample_mse in_sample_rmse out_of_sample_rmse in_sample_auc out_of_sample_auc
0.02874000 0.03141814 0.16804757 0.17716310 0.95168336 0.93588930
run_models(100, "2009-12-31", 500, "elastic_net", .2)
in_sample_mse out_of_sample_mse in_sample_rmse out_of_sample_rmse in_sample_auc out_of_sample_auc
0.02944000 0.03214509 0.17022538 0.17919218 0.95149641 0.93461904