Answer:
For the multi-regression example, let’s look at a comparison of all the variables:
library(GGally)
##
## Attaching package: 'GGally'
## The following object is masked from 'package:dplyr':
##
## nasa
drops <- c("Tm")
ggpairs(player_df[, !(names(player_df) %in% drops)], lower = list(combo = wrap("facethist", bins = 10)))

Now, using the deployment and positional statistics in this data, we can create a first attempt at a linear regression. Our dichotomous variable is the player’s position, our dichomotomous/continous interation term is position vs. average time on ice per game, and we have included one quadratic term for each variable.
hockey_lm <- lm(data = train_df, formula = CF. ~ Pos * TOI.EV. + Tm + I(TOI.EV.^2) + Age + I(Age^2) + oiSH. + I(oiSH.^2)+ oiSV. + I(oiSV.^2) + PDO + I(PDO^2) + oZS. + I(oZS.^2) + dZS. + I(dZS.^2) + TOI.60 + I(TOI.60^2) + Thru. + I(Thru.^2))
summary(hockey_lm)
##
## Call:
## lm(formula = CF. ~ Pos * TOI.EV. + Tm + I(TOI.EV.^2) + Age +
## I(Age^2) + oiSH. + I(oiSH.^2) + oiSV. + I(oiSV.^2) + PDO +
## I(PDO^2) + oZS. + I(oZS.^2) + dZS. + I(dZS.^2) + TOI.60 +
## I(TOI.60^2) + Thru. + I(Thru.^2), data = train_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.4683 -1.4121 0.0365 1.4424 6.4856
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -6.453e+02 2.466e+03 -0.262 0.793696
## PosF -4.593e-01 2.601e+00 -0.177 0.859888
## TOI.EV. 1.297e+00 9.789e-01 1.325 0.185600
## TmARI -9.452e-01 9.105e-01 -1.038 0.299645
## TmBOS 3.891e+00 8.618e-01 4.515 7.72e-06 ***
## TmBUF -2.141e+00 8.786e-01 -2.437 0.015119 *
## TmCAR 3.526e+00 8.817e-01 3.999 7.20e-05 ***
## TmCBJ 1.301e+00 8.398e-01 1.549 0.121933
## TmCGY 3.459e+00 8.450e-01 4.094 4.86e-05 ***
## TmCHI 2.930e+00 8.568e-01 3.420 0.000672 ***
## TmCOL -7.952e-01 8.476e-01 -0.938 0.348547
## TmDAL 3.047e+00 8.820e-01 3.454 0.000593 ***
## TmDET -7.063e-01 8.447e-01 -0.836 0.403446
## TmEDM 1.173e+00 8.646e-01 1.357 0.175261
## TmFLA 8.394e-01 8.829e-01 0.951 0.342173
## TmLAK 4.908e-01 8.414e-01 0.583 0.559871
## TmMIN -1.111e+00 8.687e-01 -1.279 0.201345
## TmMTL 4.727e-01 8.516e-01 0.555 0.579072
## TmNJD -3.163e-01 8.513e-01 -0.372 0.710401
## TmNSH 2.279e+00 8.563e-01 2.661 0.008017 **
## TmNYI -1.547e+00 8.520e-01 -1.816 0.069979 .
## TmNYR -3.360e+00 8.657e-01 -3.881 0.000116 ***
## TmOTT -1.805e+00 9.348e-01 -1.931 0.054039 .
## TmPHI 5.600e-01 8.433e-01 0.664 0.506887
## TmPIT 2.267e+00 8.741e-01 2.594 0.009740 **
## TmSJS 1.652e+00 8.432e-01 1.959 0.050561 .
## TmSTL 2.609e+00 8.768e-01 2.976 0.003051 **
## TmTBL 2.724e+00 8.568e-01 3.179 0.001559 **
## TmTOR 2.432e+00 8.401e-01 2.895 0.003936 **
## TmTOT 1.434e-01 7.244e-01 0.198 0.843123
## TmVAN -2.449e-01 8.818e-01 -0.278 0.781347
## TmVEG 1.699e+00 8.657e-01 1.963 0.050132 .
## TmWPG 2.737e+00 8.287e-01 3.302 0.001020 **
## TmWSH -1.114e+00 8.633e-01 -1.291 0.197341
## I(TOI.EV.^2) -4.021e-02 3.184e-02 -1.263 0.207092
## Age 6.249e-02 2.400e-01 0.260 0.794637
## I(Age^2) -2.360e-03 4.232e-03 -0.558 0.577345
## oiSH. 1.435e-01 2.183e+00 0.066 0.947605
## I(oiSH.^2) -1.989e-02 2.999e-02 -0.663 0.507529
## oiSV. -1.595e+00 6.580e+00 -0.242 0.808618
## I(oiSV.^2) 6.950e-03 3.396e-02 0.205 0.837936
## PDO -3.940e+00 4.261e+00 -0.925 0.355471
## I(PDO^2) 1.980e-02 1.827e-02 1.084 0.278835
## oZS. 9.668e+00 2.462e+01 0.393 0.694715
## I(oZS.^2) -6.675e-04 9.575e-04 -0.697 0.485997
## dZS. 9.464e+00 2.462e+01 0.384 0.700855
## I(dZS.^2) NA NA NA NA
## TOI.60 -2.083e-01 4.918e-01 -0.423 0.672100
## I(TOI.60^2) 9.826e-03 1.379e-02 0.712 0.476455
## Thru. 5.121e-01 1.559e-01 3.285 0.001083 **
## I(Thru.^2) -4.837e-03 1.413e-03 -3.423 0.000664 ***
## PosF:TOI.EV. 1.318e-01 1.805e-01 0.730 0.465699
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.354 on 562 degrees of freedom
## Multiple R-squared: 0.612, Adjusted R-squared: 0.5775
## F-statistic: 17.73 on 50 and 562 DF, p-value: < 2.2e-16
So, we can start eliminating variables by removing the one with the highest p-value:
hockey_sv_tmp_lm <- update(hockey_lm, .~. - I(TOI.60^2))
summary(hockey_sv_tmp_lm)
##
## Call:
## lm(formula = CF. ~ Pos + TOI.EV. + Tm + I(TOI.EV.^2) + Age +
## I(Age^2) + oiSH. + I(oiSH.^2) + oiSV. + I(oiSV.^2) + PDO +
## I(PDO^2) + oZS. + I(oZS.^2) + dZS. + I(dZS.^2) + TOI.60 +
## Thru. + I(Thru.^2) + Pos:TOI.EV., data = train_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.419 -1.446 -0.003 1.449 6.488
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -5.832e+02 2.464e+03 -0.237 0.812961
## PosF -7.708e-01 2.563e+00 -0.301 0.763717
## TOI.EV. 7.454e-01 5.981e-01 1.246 0.213180
## TmARI -9.468e-01 9.101e-01 -1.040 0.298636
## TmBOS 3.888e+00 8.615e-01 4.513 7.78e-06 ***
## TmBUF -2.126e+00 8.780e-01 -2.422 0.015753 *
## TmCAR 3.490e+00 8.799e-01 3.967 8.23e-05 ***
## TmCBJ 1.273e+00 8.386e-01 1.518 0.129453
## TmCGY 3.449e+00 8.445e-01 4.085 5.05e-05 ***
## TmCHI 2.911e+00 8.560e-01 3.400 0.000721 ***
## TmCOL -8.166e-01 8.467e-01 -0.964 0.335244
## TmDAL 3.067e+00 8.811e-01 3.480 0.000539 ***
## TmDET -7.584e-01 8.412e-01 -0.902 0.367645
## TmEDM 1.149e+00 8.635e-01 1.331 0.183708
## TmFLA 8.485e-01 8.824e-01 0.962 0.336677
## TmLAK 5.220e-01 8.399e-01 0.622 0.534479
## TmMIN -1.060e+00 8.653e-01 -1.225 0.221061
## TmMTL 4.850e-01 8.510e-01 0.570 0.568979
## TmNJD -3.372e-01 8.505e-01 -0.397 0.691854
## TmNSH 2.314e+00 8.545e-01 2.708 0.006980 **
## TmNYI -1.544e+00 8.517e-01 -1.813 0.070330 .
## TmNYR -3.371e+00 8.652e-01 -3.896 0.000109 ***
## TmOTT -1.829e+00 9.338e-01 -1.959 0.050631 .
## TmPHI 5.501e-01 8.428e-01 0.653 0.514184
## TmPIT 2.231e+00 8.722e-01 2.558 0.010802 *
## TmSJS 1.650e+00 8.428e-01 1.958 0.050701 .
## TmSTL 2.611e+00 8.764e-01 2.979 0.003015 **
## TmTBL 2.731e+00 8.564e-01 3.189 0.001505 **
## TmTOR 2.413e+00 8.393e-01 2.874 0.004200 **
## TmTOT 1.363e-01 7.241e-01 0.188 0.850716
## TmVAN -2.279e-01 8.811e-01 -0.259 0.796018
## TmVEG 1.678e+00 8.648e-01 1.941 0.052768 .
## TmWPG 2.707e+00 8.273e-01 3.272 0.001134 **
## TmWSH -1.116e+00 8.629e-01 -1.293 0.196607
## I(TOI.EV.^2) -2.158e-02 1.815e-02 -1.189 0.234950
## Age 6.625e-02 2.398e-01 0.276 0.782439
## I(Age^2) -2.443e-03 4.228e-03 -0.578 0.563591
## oiSH. 8.368e-02 2.181e+00 0.038 0.969407
## I(oiSH.^2) -2.040e-02 2.997e-02 -0.681 0.496262
## oiSV. -1.725e+00 6.575e+00 -0.262 0.793189
## I(oiSV.^2) 7.300e-03 3.395e-02 0.215 0.829813
## PDO -3.851e+00 4.257e+00 -0.905 0.366095
## I(PDO^2) 1.969e-02 1.826e-02 1.078 0.281406
## oZS. 9.080e+00 2.460e+01 0.369 0.712163
## I(oZS.^2) -7.486e-04 9.503e-04 -0.788 0.431195
## dZS. 8.867e+00 2.460e+01 0.360 0.718631
## I(dZS.^2) NA NA NA NA
## TOI.60 1.312e-01 1.221e-01 1.074 0.283146
## Thru. 5.209e-01 1.553e-01 3.354 0.000849 ***
## I(Thru.^2) -4.915e-03 1.408e-03 -3.490 0.000521 ***
## PosF:TOI.EV. 1.529e-01 1.780e-01 0.859 0.390600
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.353 on 563 degrees of freedom
## Multiple R-squared: 0.6117, Adjusted R-squared: 0.5779
## F-statistic: 18.1 on 49 and 563 DF, p-value: < 2.2e-16
Looking at the Adjusted R-squared value, we are doing a little better. Fortunately, there is a step() function in R which performs backward elimation for us.
step_lm <- step(hockey_lm, trace = FALSE)
summary(step_lm)
##
## Call:
## lm(formula = CF. ~ Pos + TOI.EV. + Tm + I(TOI.EV.^2) + I(Age^2) +
## oiSV. + PDO + oZS. + Thru. + I(Thru.^2), data = train_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.6186 -1.4647 -0.0153 1.4065 6.3409
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 50.2600620 8.3754607 6.001 3.49e-09 ***
## PosF 1.5215843 0.3756951 4.050 5.83e-05 ***
## TOI.EV. 1.3285344 0.2628316 5.055 5.81e-07 ***
## TmARI -1.0593959 0.9034013 -1.173 0.241414
## TmBOS 3.7071383 0.8437427 4.394 1.33e-05 ***
## TmBUF -2.1126077 0.8708676 -2.426 0.015580 *
## TmCAR 3.4188811 0.8696632 3.931 9.49e-05 ***
## TmCBJ 1.2124618 0.8298137 1.461 0.144530
## TmCGY 3.4730941 0.8366954 4.151 3.81e-05 ***
## TmCHI 2.8259918 0.8455499 3.342 0.000885 ***
## TmCOL -0.8032047 0.8340898 -0.963 0.335969
## TmDAL 3.1170512 0.8619819 3.616 0.000325 ***
## TmDET -0.8141039 0.8321727 -0.978 0.328346
## TmEDM 1.1217266 0.8544382 1.313 0.189769
## TmFLA 0.8077483 0.8754360 0.923 0.356562
## TmLAK 0.5591231 0.8324916 0.672 0.502093
## TmMIN -1.1206651 0.8586086 -1.305 0.192346
## TmMTL 0.5580427 0.8379135 0.666 0.505686
## TmNJD -0.3710155 0.8407814 -0.441 0.659181
## TmNSH 2.4085963 0.8400598 2.867 0.004294 **
## TmNYI -1.7341960 0.8372153 -2.071 0.038771 *
## TmNYR -3.3749125 0.8555693 -3.945 8.98e-05 ***
## TmOTT -1.8552669 0.9279591 -1.999 0.046049 *
## TmPHI 0.4964842 0.8365422 0.593 0.553084
## TmPIT 2.2220083 0.8610643 2.581 0.010113 *
## TmSJS 1.5480816 0.8336593 1.857 0.063829 .
## TmSTL 2.4916046 0.8671314 2.873 0.004212 **
## TmTBL 2.7891012 0.8461993 3.296 0.001041 **
## TmTOR 2.2972774 0.8334722 2.756 0.006033 **
## TmTOT 0.0692247 0.7165894 0.097 0.923076
## TmVAN -0.4161058 0.8518932 -0.488 0.625420
## TmVEG 1.6831341 0.8529434 1.973 0.048940 *
## TmWPG 2.7277174 0.8181945 3.334 0.000912 ***
## TmWSH -1.1280278 0.8553104 -1.319 0.187746
## I(TOI.EV.^2) -0.0331391 0.0093640 -3.539 0.000434 ***
## I(Age^2) -0.0011514 0.0004163 -2.766 0.005859 **
## oiSV. -0.2041614 0.1030203 -1.982 0.047985 *
## PDO -0.1450732 0.0729914 -1.988 0.047339 *
## oZS. 0.1404771 0.0129903 10.814 < 2e-16 ***
## Thru. 0.5008858 0.1519488 3.296 0.001040 **
## I(Thru.^2) -0.0047330 0.0013785 -3.433 0.000639 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.344 on 572 degrees of freedom
## Multiple R-squared: 0.6086, Adjusted R-squared: 0.5812
## F-statistic: 22.23 on 40 and 572 DF, p-value: < 2.2e-16
Okay, let’s check the residuals. Typically we would have to check that each variable consistent variability of residuals and is linearly related to the outcome. However, that is a more intensive task for the future. Right now, we can just check the base information.



Time to check our how good our model fits to the test data.
predicted_vals <- predict(step_lm, test_df)
delta <- predicted_vals - test_df$CF.
(tt <- t.test(delta, conf.level = 0.95))
##
## One Sample t-test
##
## data: delta
## t = 0.44111, df = 68, p-value = 0.6605
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
## -0.4431341 0.6946499
## sample estimates:
## mean of x
## 0.1257579
The confidence interval ecompasses zero and has a range of [-0.4431341 - 0.6946499], which is fairly tight considering corsi for percentage ranges from [10 - 80].
Well, we did all the work, let’s have some fun looking at the results. The chart below shows us the comparison of the predicted corsi for percentage against a player’s actual corsi for percentage. The higher their placement above the red line, the more the player has outperformed the prediction.
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
## Warning in RColorBrewer::brewer.pal(N, "Set2"): minimal value for n is 3, returning requested palette with 3 different levels
Finally, here are the top players based on the new difference:
Here is a table of the top players that have outperformed the prediction:
|
Player
|
Tm
|
GP
|
Pos
|
oiSH.
|
oiSV.
|
oZS.
|
TOI.EV.
|
TK
|
E…
|
Thru.
|
CF.
|
predicted
|
CFDiff
|
CF..rel
|
|
Cody Franson
|
CHI
|
23
|
D
|
4.5
|
93.1
|
65.8
|
14.383333
|
0.3043478
|
3.1
|
49.0
|
59.9
|
53.55913
|
6.340868
|
9.2
|
|
Eric Gryba
|
EDM
|
21
|
D
|
9.1
|
89.7
|
57.5
|
13.166667
|
0.4285714
|
0.1
|
33.3
|
55.2
|
49.00840
|
6.191597
|
1.7
|
|
Adam Lowry
|
WPG
|
44
|
F
|
8.2
|
91.1
|
40.9
|
12.066667
|
0.4545455
|
8.3
|
67.0
|
56.2
|
50.09117
|
6.108828
|
4.3
|
|
Brett Ritchie
|
DAL
|
69
|
F
|
6.6
|
93.2
|
50.1
|
9.333333
|
0.1304348
|
8.9
|
56.1
|
56.5
|
50.49871
|
6.001293
|
5.9
|
|
Frank Vatrano
|
TOT
|
39
|
F
|
6.7
|
92.8
|
58.9
|
10.033333
|
0.2564103
|
4.5
|
59.2
|
55.1
|
49.19363
|
5.906365
|
3.9
|
|
Jordan Eberle
|
NYI
|
80
|
F
|
9.9
|
89.7
|
52.2
|
14.016667
|
0.4625000
|
12.5
|
63.5
|
54.5
|
48.61109
|
5.888913
|
9.9
|
|
Ben Lovejoy
|
NJD
|
56
|
D
|
6.7
|
92.5
|
41.1
|
13.450000
|
0.3928571
|
4.7
|
52.9
|
51.8
|
46.25855
|
5.541448
|
3.7
|
|
Andrew Ladd
|
NYI
|
72
|
F
|
9.3
|
92.7
|
45.2
|
12.916667
|
0.4027778
|
-0.2
|
50.7
|
51.8
|
46.35496
|
5.445041
|
5.1
|
|
Richard Panik
|
TOT
|
71
|
F
|
8.4
|
92.7
|
53.9
|
13.083333
|
0.3661972
|
-0.4
|
50.6
|
55.2
|
50.00196
|
5.198040
|
5.5
|
|
Chris Kreider
|
NYR
|
57
|
F
|
8.2
|
91.9
|
51.4
|
13.000000
|
0.2982456
|
5.3
|
58.3
|
51.5
|
46.34994
|
5.150065
|
7.3
|
|
Gemel Smith
|
DAL
|
44
|
F
|
7.7
|
95.1
|
48.2
|
9.233333
|
0.2727273
|
5.8
|
65.1
|
53.7
|
48.72221
|
4.977787
|
2.6
|
|
Nino Niederreiter
|
MIN
|
62
|
F
|
10.0
|
92.9
|
51.0
|
12.683333
|
0.3709677
|
4.4
|
60.7
|
52.6
|
47.71006
|
4.889936
|
6.6
|
|
Nick Seeler
|
MIN
|
21
|
D
|
9.8
|
95.1
|
48.3
|
13.583333
|
0.0952381
|
4.5
|
40.0
|
49.9
|
45.02155
|
4.878449
|
0.0
|
|
Will Butcher
|
NJD
|
80
|
D
|
8.2
|
91.0
|
62.0
|
13.200000
|
0.2500000
|
11.2
|
37.0
|
53.7
|
48.83545
|
4.864547
|
7.0
|
|
Jussi Jokinen
|
TOT
|
59
|
F
|
6.4
|
93.8
|
46.6
|
10.016667
|
0.3050847
|
-0.3
|
52.6
|
51.2
|
46.61358
|
4.586417
|
-0.1
|
|
Mathieu Perreault
|
WPG
|
69
|
F
|
7.5
|
91.9
|
52.9
|
12.183333
|
0.5797101
|
9.9
|
59.8
|
56.6
|
52.01639
|
4.583612
|
6.2
|
|
Tim Heed
|
SJS
|
29
|
D
|
7.6
|
90.0
|
63.0
|
12.966667
|
0.5172414
|
5.6
|
40.2
|
56.0
|
51.42697
|
4.573026
|
4.3
|
|
Travis Dermott
|
TOR
|
36
|
D
|
9.1
|
95.4
|
51.1
|
15.400000
|
0.2500000
|
6.8
|
52.0
|
54.8
|
50.43912
|
4.360883
|
6.1
|
|
Tucker Poolman
|
WPG
|
23
|
D
|
6.1
|
95.1
|
52.0
|
12.283333
|
0.2173913
|
2.1
|
43.5
|
54.0
|
49.66912
|
4.330884
|
2.8
|
|
Craig Smith
|
NSH
|
78
|
F
|
8.4
|
93.7
|
60.8
|
13.066667
|
0.7564103
|
14.8
|
61.5
|
56.8
|
52.49133
|
4.308671
|
8.3
|