R Markdown

This is an R Markdown document that predicts the salaries of National Hockey League players. For more details on using R Markdown see http://rmarkdown.rstudio.com. The code implements a linear regression model on the NHL data set available on Kaggle. I am thankful for the kernel available at https://www.kaggle.com/camnugent/nhl-salary-data-prediction-cleaning-and-modeling, prepared by Cam Nugent.
NOTE: Download the train and test data from the above link.

# install.packages('plyr') install.packages('stringr')
# install.packages('magrittr')
# install.packages('scatterplot3d')

library(tidyverse)
library(plyr)
library(magrittr)
library(stringr)

library(scatterplot3d)

Set working directory in R

# Switch to the project folder only when it exists on this machine; otherwise
# stay in the current working directory so that the relative "./train.csv" and
# "./test.csv" paths read below resolve against wherever the data was saved.
data_dir <- "D:/2018/Upwork/JB125"
if (dir.exists(data_dir)) {
    setwd(data_dir)
} else {
    message("Directory '", data_dir, "' not found; using ", getwd())
}

Load the NHL data

Loading the data requires calling a suitable R function, and which function to use depends on the type of file. Our dataset is in comma-separated (CSV) format; therefore, we can use the R function “read.csv()” as demonstrated below.

# Read the training set; the first row of the CSV supplies the column names.
train.df <- read.csv(file = "./train.csv", header = TRUE)
names(train.df)  # List the column names
##   [1] "Salary"     "Born"       "City"       "Pr.St"      "Cntry"      "Nat"        "Ht"         "Wt"         "DftYr"      "DftRd"      "Ovrl"       "Hand"       "Last.Name"  "First.Name" "Position"  
##  [16] "Team"       "GP"         "G"          "A"          "A1"         "A2"         "PTS"        "X..."       "E..."       "PIM"        "Shifts"     "TOI"        "TOIX"       "TOI.GP"     "TOI.GP.1"  
##  [31] "TOI."       "IPP."       "SH."        "SV."        "PDO"        "F.60"       "A.60"       "Pct."       "Diff"       "Diff.60"    "iCF"        "iCF.1"      "iFF"        "iSF"        "iSF.1"     
##  [46] "iSF.2"      "ixG"        "iSCF"       "iRB"        "iRS"        "iDS"        "sDist"      "sDist.1"    "Pass"       "iHF"        "iHF.1"      "iHA"        "iHDf"       "iMiss"      "iGVA"      
##  [61] "iTKA"       "iBLK"       "iGVA.1"     "iTKA.1"     "iBLK.1"     "BLK."       "iFOW"       "iFOL"       "iFOW.1"     "iFOL.1"     "FO."        "X.FOT"      "dzFOW"      "dzFOL"      "nzFOW"     
##  [76] "nzFOL"      "ozFOW"      "ozFOL"      "FOW.Up"     "FOL.Up"     "FOW.Down"   "FOL.Down"   "FOW.Close"  "FOL.Close"  "OTG"        "X1G"        "GWG"        "ENG"        "PSG"        "PSA"       
##  [91] "G.Bkhd"     "G.Dflct"    "G.Slap"     "G.Snap"     "G.Tip"      "G.Wrap"     "G.Wrst"     "CBar"       "Post"       "Over"       "Wide"       "S.Bkhd"     "S.Dflct"    "S.Slap"     "S.Snap"    
## [106] "S.Tip"      "S.Wrap"     "S.Wrst"     "iPenT"      "iPenD"      "iPENT"      "iPEND"      "iPenDf"     "NPD"        "Min"        "Maj"        "Match"      "Misc"       "Game"       "CF"        
## [121] "CA"         "FF"         "FA"         "SF"         "SA"         "xGF"        "xGA"        "SCF"        "SCA"        "GF"         "GA"         "RBF"        "RBA"        "RSF"        "RSA"       
## [136] "DSF"        "DSA"        "FOW"        "FOL"        "HF"         "HA"         "GVA"        "TKA"        "PENT"       "PEND"       "OPS"        "DPS"        "PS"         "OTOI"       "Grit"      
## [151] "DAP"        "Pace"       "GS"         "GS.G"
head(train.df, n = 5)  # Preview the first five rows
##    Salary     Born         City Pr.St Cntry Nat Ht  Wt DftYr DftRd Ovrl Hand  Last.Name First.Name Position Team GP  G  A A1 A2 PTS X...  E... PIM Shifts    TOI   TOIX TOI.GP TOI.GP.1 TOI. IPP. SH.
## 1  925000 97-01-30 Sainte-Marie    QC   CAN CAN 74 190  2015     1   18    L     Chabot     Thomas        D  OTT  1  0  0  0  0   0   -2   0.0   0     13    429    7.2   7.15     7.16 15.2  0.0 0.0
## 2 2250000 93-12-21       Ottawa    ON   CAN CAN 74 207  2012     1   15    R       Ceci       Cody        D  OTT 79  2 15  6  9  17  -11 -10.4  20   2418 109992 1826.2  23.20    23.17 39.0 30.4 7.4
## 3 8000000 88-04-16     St. Paul    MN   USA USA 72 218  2006     1    7    R     Okposo       Kyle       RW  BUF 65 19 26 13 13  45   -7  -1.4  24   1443  73983 1229.2  18.97    18.95 33.1 63.4 9.7
## 4 3500000 92-01-07       Ottawa    ON   CAN CAN 77 220  2010     1    3    R Gudbranson       Erik        D  VAN 30  1  5  5  0   6  -14  -5.3  18    765  36603  607.9  20.33    20.31 36.1 37.5 6.2
## 5 1750000 94-03-29      Toronto    ON   CAN CAN 76 217  2012     1   16    R     Wilson        Tom       RW  WSH 82  7 12  4  8  19    9   4.1 133   1453  63592 1059.7  12.93    12.93 23.5 61.3 7.8
##     SV.  PDO F.60  A.60 Pct. Diff Diff.60 iCF iCF.1 iFF iSF iSF.1 iSF.2  ixG iSCF iRB iRS iDS sDist sDist.1  Pass iHF iHF.1 iHA iHDf iMiss iGVA iTKA iBLK iGVA.1 iTKA.1 iBLK.1 BLK. iFOW iFOL iFOW.1
## 1 0.750  750 0.00 16.74  0.0   -2  -16.74   2     2   2   1     1     1  0.0    0   0   0   0  43.0    49.3   0.0   1     1   0    1     1    1    0    0      1      0      0  0.0    0    0      0
## 2 0.915  989 1.84  2.79 39.7  -29   -0.95 287   287 197 143   143   143  6.1    7   7   9  16  52.4    46.3 138.1 111   111 154  -43    54   74   22  159     74     22    159  8.0    1    0      1
## 3 0.934 1031 3.47  1.95 64.0   31    1.51 283   283 212 155   156   156 17.4   64  16  20  36  28.4    26.3 196.8  53    53  68  -15    57   36   26   25     36     26     25  2.4   54   45     54
## 4 0.897  959 1.58  3.45 31.4  -19   -1.88  88    88  55  40    40    40  1.4    2   1   4   5  55.1    51.0 153.0  66    66  66    0    15   23    4   44     23      4     44  7.3    0    0      0
## 5 0.917  995 1.76  2.32 43.1  -10   -0.57 166   166 118  95    95    95  9.3   35   8  10  18  30.9    26.4  96.3 239   239 134  105    23   21   36   44     21     36     44  4.4    3    7      3
##   iFOL.1   FO. X.FOT dzFOW dzFOL nzFOW nzFOL ozFOW ozFOL FOW.Up FOL.Up FOW.Down FOL.Down FOW.Close FOL.Close OTG X1G GWG ENG PSG PSA G.Bkhd G.Dflct G.Slap G.Snap G.Tip G.Wrap G.Wrst CBar Post Over
## 1      0   0.0   0.0     0     0     0     0     0     0      0      0        0        0         0         0   0   0   0   0   0   0      0       0      0      0     0      0      0    0    0    0
## 2      0 100.0   0.1     1     0     0     0     0     0      0      0        0        0         1         0   0   0   0   0   0   0      0       0      1      0     0      0      1    0    1    2
## 3     45  54.5   7.4     9     6    10    11    35    28     13     10       21       16        37        33   1   5   2   0   0   0      5       2      0      3     0      0      9    0    2    4
## 4      0   0.0   0.0     0     0     0     0     0     0      0      0        0        0         0         0   0   0   1   0   0   0      0       0      0      0     0      0      1    0    0    0
## 5      7  30.0   1.0     1     1     0     2     2     4      3      1        0        3         2         6   0   2   0   1   0   0      0       0      0      0     1      0      6    0    2    1
##   Wide S.Bkhd S.Dflct S.Slap S.Snap S.Tip S.Wrap S.Wrst iPenT iPenD iPENT iPEND iPenDf   NPD Min Maj Match Misc Game   CF   CA   FF   FA  SF  SA  xGF  xGA SCF SCA GF GA RBF RBA RSF RSA DSF DSA FOW
## 1    1      0       0      1      0     0      0      0     0     0     0     0      0   0.0   0   0     0    0    0    9   12    8   10   5   8  0.5  0.9   2   3  0  2   1   1   0   1   1   2   4
## 2   51      2       0     49     12     0      1     79    10     6    10     5     -4   2.2  10   0     0    0    0 1433 1992 1038 1423 757 997 62.0 88.8 197 280 56 85  68  82  79  94 147 176 949
## 3   51     19       3      3     20     8      2    101    12    10    11     8     -2  -0.5  12   0     0    0    0 1301 1051  986  826 734 606 70.8 46.4 235 133 71 40  60  34  76  52 136  86 739
## 4   15      0       0     18      3     0      0     19     6     7     6     6      1   2.7   4   2     0    0    0  460  605  339  467 259 340 22.0 33.6  80 130 16 35  27  20  29  32  56  52 324
## 5   20      7       2      3     10    11      1     61    44    33    40    29    -11 -14.3  33   9     0    1    1  766  992  546  720 398 495 33.5 47.5 124 159 31 41  30  37  43  53  73  90 528
##   FOL  HF  HA GVA TKA PENT PEND  OPS  DPS   PS    OTOI Grit  DAP  Pace   GS  GS.G
## 1   5   1   2   1   1    1    1  0.0 -0.2 -0.2   40.03    1  0.0 175.7 -0.4 -0.38
## 2 939 749 671 284 197  104   98 -0.2  3.4  3.2 2850.59  290 13.3 112.5 14.1  0.18
## 3 600 340 351 168 129   56   70  3.7  1.3  5.0 2486.75  102  6.6 114.8 36.8  0.57
## 4 328 198 197  86  59   26   22  0.0  0.4  0.5 1074.41  130 17.5 105.1  5.9  0.20
## 5 490 512 422 157 126   88   68 -0.1  1.4  1.3 3459.09  425  8.3  99.5 21.8  0.27
# Read the test set; the first row of the CSV supplies the column names.
test.df <- read.csv(file = "./test.csv", header = TRUE)
names(test.df)  # List the column names
##   [1] "Born"       "City"       "Pr.St"      "Cntry"      "Nat"        "Ht"         "Wt"         "DftYr"      "DftRd"      "Ovrl"       "Hand"       "Last.Name"  "First.Name" "Position"   "Team"      
##  [16] "GP"         "G"          "A"          "A1"         "A2"         "PTS"        "X..."       "E..."       "PIM"        "Shifts"     "TOI"        "TOIX"       "TOI.GP"     "TOI.GP.1"   "TOI."      
##  [31] "IPP."       "SH."        "SV."        "PDO"        "F.60"       "A.60"       "Pct."       "Diff"       "Diff.60"    "iCF"        "iCF.1"      "iFF"        "iSF"        "iSF.1"      "iSF.2"     
##  [46] "ixG"        "iSCF"       "iRB"        "iRS"        "iDS"        "sDist"      "sDist.1"    "Pass"       "iHF"        "iHF.1"      "iHA"        "iHDf"       "iMiss"      "iGVA"       "iTKA"      
##  [61] "iBLK"       "iGVA.1"     "iTKA.1"     "iBLK.1"     "BLK."       "iFOW"       "iFOL"       "iFOW.1"     "iFOL.1"     "FO."        "X.FOT"      "dzFOW"      "dzFOL"      "nzFOW"      "nzFOL"     
##  [76] "ozFOW"      "ozFOL"      "FOW.Up"     "FOL.Up"     "FOW.Down"   "FOL.Down"   "FOW.Close"  "FOL.Close"  "OTG"        "X1G"        "GWG"        "ENG"        "PSG"        "PSA"        "G.Bkhd"    
##  [91] "G.Dflct"    "G.Slap"     "G.Snap"     "G.Tip"      "G.Wrap"     "G.Wrst"     "CBar"       "Post"       "Over"       "Wide"       "S.Bkhd"     "S.Dflct"    "S.Slap"     "S.Snap"     "S.Tip"     
## [106] "S.Wrap"     "S.Wrst"     "iPenT"      "iPenD"      "iPENT"      "iPEND"      "iPenDf"     "NPD"        "Min"        "Maj"        "Match"      "Misc"       "Game"       "CF"         "CA"        
## [121] "FF"         "FA"         "SF"         "SA"         "xGF"        "xGA"        "SCF"        "SCA"        "GF"         "GA"         "RBF"        "RBA"        "RSF"        "RSA"        "DSF"       
## [136] "DSA"        "FOW"        "FOL"        "HF"         "HA"         "GVA"        "TKA"        "PENT"       "PEND"       "OPS"        "DPS"        "PS"         "OTOI"       "Grit"       "DAP"       
## [151] "Pace"       "GS"         "GS.G"
head(test.df, n = 5)  # Preview the first five rows
##       Born          City Pr.St Cntry Nat Ht  Wt DftYr DftRd Ovrl Hand  Last.Name First.Name Position Team GP  G  A A1 A2 PTS X... E... PIM Shifts   TOI   TOIX TOI.GP TOI.GP.1 TOI. IPP. SH.   SV.  PDO
## 1 88-11-05        Ithaca    NY   USA USA 72 216  2003     1   13    R      Brown     Dustin    RW/LW  L.A 80 14 22  9 13  36   -4  8.2  22   1729 76801 1278.5  16.00    15.99 27.2 65.5 8.5 0.898  982
## 2 00-02-29        Prague         CZE CZE 72 195  2014     1   13    L      Vrana      Jakub       LW  WSH 21  3  3  2  1   6    2  0.4   2    291 13997  233.2  11.12    11.11 22.0 66.7 6.7 0.969 1037
## 3 92-04-24     St. Louis    MO   USA USA 75 227  2007     6  161    L     Maroon    Patrick       LW  EDM 81 27 15  9  6  42   13 15.0  95   1715 81345 1351.9  16.73    16.72 30.0 54.5 9.5 0.919 1014
## 4 99-07-05       Piikkio         FIN FIN 72 182  2013     2   55    L   Lehkonen    Artturi       LW  MTL 73 18 10  3  7  28   -1  3.0   8   1488 60702 1010.0  13.87    13.85 24.9 63.6 8.5 0.922 1007
## 5 96-10-27 Niagara Falls    NY   USA USA 72 196  2011     2   36    R Clendening       Adam        D  NYR 31  2  9  2  7  11    3  3.6  17    658 29406  490.2  15.82    15.81 28.7 47.8 9.3 0.909 1002
##   F.60 A.60 Pct. Diff Diff.60 iCF iCF.1 iFF iSF iSF.1 iSF.2  ixG iSCF iRB iRS iDS sDist sDist.1  Pass iHF iHF.1 iHA iHDf iMiss iGVA iTKA iBLK iGVA.1 iTKA.1 iBLK.1 BLK. iFOW iFOL iFOW.1 iFOL.1 FO.
## 1 2.58 2.77 48.2   -4   -0.19 326   326 251 175   175   175 19.7   73  19  19  38  28.2    27.0 198.5 190   190 151   39    76   27   25   31     27     25     31  2.8    2    3      2      3  40
## 2 2.32 0.77 75.0    6    1.54  56    56  49  32    32    32  5.2   23   3   2   5  21.0    23.2  57.7   6     6  11   -5    17    9    4    7      9      4      7  3.9    0    3      0      3   0
## 3 3.42 2.09 62.1   30    1.33 300   300 243 178   178   178 25.1  109  23  17  40  21.8    21.7 154.6 189   189 109   80    65   49   33   23     49     33     23  2.1    6    9      6      9  40
## 4 2.61 2.26 53.7    6    0.36 279   279 208 158   158   158 17.0   58  13  20  33  28.1    26.1  81.4  72    72 113  -41    50   20   25   25     20     25     25  2.9    2    2      2      2  50
## 5 2.82 2.08 57.5    6    0.73  89    89  58  39    38    38  1.7    1   0   2   2  59.1    41.7  24.2  22    22  66  -44    19   20    7   27     20      7     27  7.6    0    0      0      0   0
##   X.FOT dzFOW dzFOL nzFOW nzFOL ozFOW ozFOL FOW.Up FOL.Up FOW.Down FOL.Down FOW.Close FOL.Close OTG X1G GWG ENG PSG PSA G.Bkhd G.Dflct G.Slap G.Snap G.Tip G.Wrap G.Wrst CBar Post Over Wide S.Bkhd
## 1   0.4     2     2     0     0     0     1      2      2        0        1         0         2   0   2   1   0   0   0      1       0      0      4     2      0      7    0    2    3   71     15
## 2   1.7     0     0     0     0     0     3      0      2        0        0         0         1   0   2   2   0   0   0      0       0      0      1     2      0      0    0    1    2   14      3
## 3   1.2     0     0     0     1     6     8      4      3        1        2         3         6   0   8   5   0   0   0      2       1      0      1     2      1     20    0    5    3   57     17
## 4   0.4     0     1     1     0     1     1      1      1        0        0         1         2   0   2   3   0   0   0      3       1      4      2     0      1      7    0    2    3   45     16
## 5   0.0     0     0     0     0     0     0      0      0        0        0         0         0   0   0   0   0   0   0      0       0      1      1     0      0      0    0    0    0   19      1
##   S.Dflct S.Slap S.Snap S.Tip S.Wrap S.Wrst iPenT iPenD iPENT iPEND iPenDf   NPD Min Maj Match Misc Game   CF   CA   FF  FA  SF  SA  xGF  xGA SCF SCA GF GA RBF RBA RSF RSA DSF DSA FOW FOL  HF  HA GVA
## 1       1      5     39    13      2    100    11    25    10    19     14  11.3  11   0     0    0    0 1239 1114  927 821 649 577 60.8 52.4 203 184 55 59  62  39  57  73 119 112 639 652 651 542 194
## 2       0      4      2     7      0     16     1     2     1     1      1   0.5   1   0     0    0    0  282  179  201 132 134  98 14.2  8.0  53  24  9  3  11   8  13  15  24  23  91  85  75  83  42
## 3       2      5     36    16      6     96    34    19    34    19    -15 -13.0  25   9     0    0    0 1500 1091 1116 791 809 581 82.0 53.2 322 186 77 47  55  31  65  77 120 108 555 661 511 486 298
## 4       3     10     17     6      9     97     4     8     4     8      4   3.8   4   0     0    0    0 1006  877  714 655 518 489 47.2 39.0 163 140 44 38  34  29  72  49 106  78 521 461 399 499 160
## 5       0      7     15     0      0     15     7     4     7     4     -3  -0.2   6   1     0    0    0  508  353  372 267 247 187 24.2 16.7  77  44 23 17  14   8  19  21  33  29 239 227 175 211  87
##   TKA PENT PEND OPS DPS  PS    OTOI Grit  DAP  Pace   GS GS.G
## 1  83   72   65 1.9 1.8 3.7 3418.06  243 19.5 110.4 47.3 0.59
## 2  26   16   10 0.3 0.3 0.6  826.49   15 10.0 118.6  8.9 0.42
## 3 184   77   86 3.9 2.0 6.0 3155.13  316  8.9 115.0 52.5 0.65
## 4  95   44   40 2.3 1.1 3.4 3044.03  105 24.3 111.9 38.1 0.52
## 5  60   30   25 0.8 1.1 1.9 1217.17   67  4.8 105.4 15.9 0.51

Data Cleaning

Before cleaning the data, let us combine the two data sets, since they share the same features with the exception of “Salary”, which is present only in the train dataset. “Salary” is the response (dependent) variable: we will build a model to predict the salary of players.

Let us check number of columns in each dataset.

# Compare the shapes (rows, columns) of the two data sets.
dim(train.df)
## [1] 612 154
dim(test.df)
## [1] 262 153

Combine train and test datasets.

# test_x$TrainTest <- 'test' train_y$TrainTest <- 'train'

# Drop the response column by name (rather than by position) so the training
# features still line up with the test set if the column order ever changes.
train_data <- train.df[, names(train.df) != "Salary", drop = FALSE]
dim(train_data)
## [1] 612 153
test_data <- test.df
dim(test_data)
## [1] 262 153
# test <-cbind(test_y, test_x)
# Stack the training rows first, followed by the test rows.
all_data <- rbind(train_data, test_data)

####### 
names(all_data)  # Column names of the combined data
##   [1] "Born"       "City"       "Pr.St"      "Cntry"      "Nat"        "Ht"         "Wt"         "DftYr"      "DftRd"      "Ovrl"       "Hand"       "Last.Name"  "First.Name" "Position"   "Team"      
##  [16] "GP"         "G"          "A"          "A1"         "A2"         "PTS"        "X..."       "E..."       "PIM"        "Shifts"     "TOI"        "TOIX"       "TOI.GP"     "TOI.GP.1"   "TOI."      
##  [31] "IPP."       "SH."        "SV."        "PDO"        "F.60"       "A.60"       "Pct."       "Diff"       "Diff.60"    "iCF"        "iCF.1"      "iFF"        "iSF"        "iSF.1"      "iSF.2"     
##  [46] "ixG"        "iSCF"       "iRB"        "iRS"        "iDS"        "sDist"      "sDist.1"    "Pass"       "iHF"        "iHF.1"      "iHA"        "iHDf"       "iMiss"      "iGVA"       "iTKA"      
##  [61] "iBLK"       "iGVA.1"     "iTKA.1"     "iBLK.1"     "BLK."       "iFOW"       "iFOL"       "iFOW.1"     "iFOL.1"     "FO."        "X.FOT"      "dzFOW"      "dzFOL"      "nzFOW"      "nzFOL"     
##  [76] "ozFOW"      "ozFOL"      "FOW.Up"     "FOL.Up"     "FOW.Down"   "FOL.Down"   "FOW.Close"  "FOL.Close"  "OTG"        "X1G"        "GWG"        "ENG"        "PSG"        "PSA"        "G.Bkhd"    
##  [91] "G.Dflct"    "G.Slap"     "G.Snap"     "G.Tip"      "G.Wrap"     "G.Wrst"     "CBar"       "Post"       "Over"       "Wide"       "S.Bkhd"     "S.Dflct"    "S.Slap"     "S.Snap"     "S.Tip"     
## [106] "S.Wrap"     "S.Wrst"     "iPenT"      "iPenD"      "iPENT"      "iPEND"      "iPenDf"     "NPD"        "Min"        "Maj"        "Match"      "Misc"       "Game"       "CF"         "CA"        
## [121] "FF"         "FA"         "SF"         "SA"         "xGF"        "xGA"        "SCF"        "SCA"        "GF"         "GA"         "RBF"        "RBA"        "RSF"        "RSA"        "DSF"       
## [136] "DSA"        "FOW"        "FOL"        "HF"         "HA"         "GVA"        "TKA"        "PENT"       "PEND"       "OPS"        "DPS"        "PS"         "OTOI"       "Grit"       "DAP"       
## [151] "Pace"       "GS"         "GS.G"
c(nrow(all_data), ncol(all_data))  # Rows and columns of the combined data
## [1] 874 153

Check the columns with missing values.

# Count the NA entries in every column, then keep the names of the columns
# that contain at least one.
na_per_column <- colSums(is.na(all_data))
all_missing_list <- colnames(all_data)[na_per_column > 0]
print(all_missing_list)
##  [1] "DftYr"   "DftRd"   "Ovrl"    "TOIX"    "TOI."    "IPP."    "SH."     "SV."     "PDO"     "F.60"    "A.60"    "Diff.60" "iCF"     "iFF"     "iSF"     "ixG"     "iSCF"    "iRB"     "iRS"    
## [20] "iDS"     "sDist.1" "Pass"    "iHF.1"   "iHA"     "iHDf"    "iGVA.1"  "iTKA.1"  "iBLK.1"  "BLK."    "iFOW.1"  "iFOL.1"  "X.FOT"   "iPENT"   "iPEND"   "CF"      "CA"      "FF"      "FA"     
## [39] "SF"      "SA"      "xGF"     "xGA"     "SCF"     "SCA"     "GF"      "GA"      "RBF"     "RBA"     "RSF"     "RSA"     "FOW"     "FOL"     "HF"      "HA"      "GVA"     "TKA"     "PENT"   
## [58] "PEND"    "OTOI"    "Pace"    "GS"      "GS.G"

Imputation

State

library(plyr)
# International players have an empty Pr.St entry; relabel it as "INT".
all_data[["Pr.St"]] <- mapvalues(all_data[["Pr.St"]], from = "", to = "INT")

Team

The Team column states which team a player played for. Some players played for multiple teams. We will split each team into its own boolean predictor, and players who played for multiple teams are flagged in each of the corresponding columns.

# Build the unique list of team acronyms. A Team entry may hold several
# acronyms separated by "/", so every distinct entry is split and the pieces
# are de-duplicated.
#
# levels() returns NULL for a character column (the read.csv() default since
# R 4.0), which would leave `teams` empty and break the indicator columns
# created further down. Use the factor levels when Team is a factor, and the
# sorted distinct values otherwise.
team_entries <- if (is.factor(all_data$Team)) {
    levels(all_data$Team)
} else {
    sort(unique(as.character(all_data$Team)))
}
# Split all entries in one vectorised call instead of growing a vector with
# c() inside nested loops.
teams <- unique(unlist(strsplit(team_entries, "/", fixed = TRUE)))
print(teams)
##  [1] "ANA" "FLA" "N.J" "VAN" "ARI" "CGY" "MIN" "NYR" "TOR" "BOS" "WPG" "BUF" "CAR" "OTT" "PIT" "STL" "CBJ" "DAL" "CHI" "COL" "MTL" "NSH" "S.J" "DET" "EDM" "L.A" "T.B" "NYI" "PHI" "WSH"
# Create one indicator column per team acronym, filled with 0 for every
# player, and echo each acronym as its column is added.
for (j in teams) {
    all_data[[j]] <- 0
    print(j)
}
## [1] "ANA"
## [1] "FLA"
## [1] "N.J"
## [1] "VAN"
## [1] "ARI"
## [1] "CGY"
## [1] "MIN"
## [1] "NYR"
## [1] "TOR"
## [1] "BOS"
## [1] "WPG"
## [1] "BUF"
## [1] "CAR"
## [1] "OTT"
## [1] "PIT"
## [1] "STL"
## [1] "CBJ"
## [1] "DAL"
## [1] "CHI"
## [1] "COL"
## [1] "MTL"
## [1] "NSH"
## [1] "S.J"
## [1] "DET"
## [1] "EDM"
## [1] "L.A"
## [1] "T.B"
## [1] "NYI"
## [1] "PHI"
## [1] "WSH"
head(all_data, n = 5)  # Inspect the newly added team columns
##       Born         City Pr.St Cntry Nat Ht  Wt DftYr DftRd Ovrl Hand  Last.Name First.Name Position Team GP  G  A A1 A2 PTS X...  E... PIM Shifts    TOI   TOIX TOI.GP TOI.GP.1 TOI. IPP. SH.   SV.
## 1 97-01-30 Sainte-Marie    QC   CAN CAN 74 190  2015     1   18    L     Chabot     Thomas        D  OTT  1  0  0  0  0   0   -2   0.0   0     13    429    7.2   7.15     7.16 15.2  0.0 0.0 0.750
## 2 93-12-21       Ottawa    ON   CAN CAN 74 207  2012     1   15    R       Ceci       Cody        D  OTT 79  2 15  6  9  17  -11 -10.4  20   2418 109992 1826.2  23.20    23.17 39.0 30.4 7.4 0.915
## 3 88-04-16     St. Paul    MN   USA USA 72 218  2006     1    7    R     Okposo       Kyle       RW  BUF 65 19 26 13 13  45   -7  -1.4  24   1443  73983 1229.2  18.97    18.95 33.1 63.4 9.7 0.934
## 4 92-01-07       Ottawa    ON   CAN CAN 77 220  2010     1    3    R Gudbranson       Erik        D  VAN 30  1  5  5  0   6  -14  -5.3  18    765  36603  607.9  20.33    20.31 36.1 37.5 6.2 0.897
## 5 94-03-29      Toronto    ON   CAN CAN 76 217  2012     1   16    R     Wilson        Tom       RW  WSH 82  7 12  4  8  19    9   4.1 133   1453  63592 1059.7  12.93    12.93 23.5 61.3 7.8 0.917
##    PDO F.60  A.60 Pct. Diff Diff.60 iCF iCF.1 iFF iSF iSF.1 iSF.2  ixG iSCF iRB iRS iDS sDist sDist.1  Pass iHF iHF.1 iHA iHDf iMiss iGVA iTKA iBLK iGVA.1 iTKA.1 iBLK.1 BLK. iFOW iFOL iFOW.1 iFOL.1
## 1  750 0.00 16.74  0.0   -2  -16.74   2     2   2   1     1     1  0.0    0   0   0   0  43.0    49.3   0.0   1     1   0    1     1    1    0    0      1      0      0  0.0    0    0      0      0
## 2  989 1.84  2.79 39.7  -29   -0.95 287   287 197 143   143   143  6.1    7   7   9  16  52.4    46.3 138.1 111   111 154  -43    54   74   22  159     74     22    159  8.0    1    0      1      0
## 3 1031 3.47  1.95 64.0   31    1.51 283   283 212 155   156   156 17.4   64  16  20  36  28.4    26.3 196.8  53    53  68  -15    57   36   26   25     36     26     25  2.4   54   45     54     45
## 4  959 1.58  3.45 31.4  -19   -1.88  88    88  55  40    40    40  1.4    2   1   4   5  55.1    51.0 153.0  66    66  66    0    15   23    4   44     23      4     44  7.3    0    0      0      0
## 5  995 1.76  2.32 43.1  -10   -0.57 166   166 118  95    95    95  9.3   35   8  10  18  30.9    26.4  96.3 239   239 134  105    23   21   36   44     21     36     44  4.4    3    7      3      7
##     FO. X.FOT dzFOW dzFOL nzFOW nzFOL ozFOW ozFOL FOW.Up FOL.Up FOW.Down FOL.Down FOW.Close FOL.Close OTG X1G GWG ENG PSG PSA G.Bkhd G.Dflct G.Slap G.Snap G.Tip G.Wrap G.Wrst CBar Post Over Wide
## 1   0.0   0.0     0     0     0     0     0     0      0      0        0        0         0         0   0   0   0   0   0   0      0       0      0      0     0      0      0    0    0    0    1
## 2 100.0   0.1     1     0     0     0     0     0      0      0        0        0         1         0   0   0   0   0   0   0      0       0      1      0     0      0      1    0    1    2   51
## 3  54.5   7.4     9     6    10    11    35    28     13     10       21       16        37        33   1   5   2   0   0   0      5       2      0      3     0      0      9    0    2    4   51
## 4   0.0   0.0     0     0     0     0     0     0      0      0        0        0         0         0   0   0   1   0   0   0      0       0      0      0     0      0      1    0    0    0   15
## 5  30.0   1.0     1     1     0     2     2     4      3      1        0        3         2         6   0   2   0   1   0   0      0       0      0      0     1      0      6    0    2    1   20
##   S.Bkhd S.Dflct S.Slap S.Snap S.Tip S.Wrap S.Wrst iPenT iPenD iPENT iPEND iPenDf   NPD Min Maj Match Misc Game   CF   CA   FF   FA  SF  SA  xGF  xGA SCF SCA GF GA RBF RBA RSF RSA DSF DSA FOW FOL  HF
## 1      0       0      1      0     0      0      0     0     0     0     0      0   0.0   0   0     0    0    0    9   12    8   10   5   8  0.5  0.9   2   3  0  2   1   1   0   1   1   2   4   5   1
## 2      2       0     49     12     0      1     79    10     6    10     5     -4   2.2  10   0     0    0    0 1433 1992 1038 1423 757 997 62.0 88.8 197 280 56 85  68  82  79  94 147 176 949 939 749
## 3     19       3      3     20     8      2    101    12    10    11     8     -2  -0.5  12   0     0    0    0 1301 1051  986  826 734 606 70.8 46.4 235 133 71 40  60  34  76  52 136  86 739 600 340
## 4      0       0     18      3     0      0     19     6     7     6     6      1   2.7   4   2     0    0    0  460  605  339  467 259 340 22.0 33.6  80 130 16 35  27  20  29  32  56  52 324 328 198
## 5      7       2      3     10    11      1     61    44    33    40    29    -11 -14.3  33   9     0    1    1  766  992  546  720 398 495 33.5 47.5 124 159 31 41  30  37  43  53  73  90 528 490 512
##    HA GVA TKA PENT PEND  OPS  DPS   PS    OTOI Grit  DAP  Pace   GS  GS.G ANA FLA N.J VAN ARI CGY MIN NYR TOR BOS WPG BUF CAR OTT PIT STL CBJ DAL CHI COL MTL NSH S.J DET EDM L.A T.B NYI PHI WSH
## 1   2   1   1    1    1  0.0 -0.2 -0.2   40.03    1  0.0 175.7 -0.4 -0.38   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
## 2 671 284 197  104   98 -0.2  3.4  3.2 2850.59  290 13.3 112.5 14.1  0.18   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
## 3 351 168 129   56   70  3.7  1.3  5.0 2486.75  102  6.6 114.8 36.8  0.57   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
## 4 197  86  59   26   22  0.0  0.4  0.5 1074.41  130 17.5 105.1  5.9  0.20   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
## 5 422 157 126   88   68 -0.1  1.4  1.3 3459.09  425  8.3  99.5 21.8  0.27   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
# Column names are case-sensitive: the column is `Team`, so `all_data$team`
# returns NULL. Show the first few entries of the real column instead.
print(head(as.character(all_data$Team)))
## [1] "OTT" "OTT" "BUF" "VAN" "WSH" "CHI"
# Record the teams for each player: split every Team entry on "/" once up
# front, then set the indicator column of each listed team to 1 for that row.
# seq_along() (rather than 1:length()) keeps the loop from running on an
# empty data set, where 1:0 would iterate over c(1, 0).
team_lists <- strsplit(as.character(all_data$Team), "/", fixed = TRUE)
for (i in seq_along(team_lists)) {
    teams_of_person <- team_lists[[i]]
    print(teams_of_person)
    for (x in teams_of_person) {
        all_data[[x]][i] <- 1  # Flag player i as a member of team `x`
    }
}
## [1] "OTT"
## [1] "OTT"
## [1] "BUF"
## [1] "VAN"
## [1] "WSH"
## [1] "CHI"
## [1] "VAN"
## [1] "BUF"
## [1] "N.J"
## [1] "T.B"
## [1] "CHI"
## [1] "VAN"
## [1] "ARI" "MIN"
## [1] "CGY"
## [1] "FLA"
## [1] "CGY"
## [1] "PIT"
## [1] "EDM" "NYR"
## [1] "TOR"
## [1] "N.J"
## [1] "TOR"
## [1] "STL"
## [1] "TOR"
## [1] "T.B"
## [1] "BOS"
## [1] "BOS"
## [1] "COL"
## [1] "STL"
## [1] "PIT"
## [1] "BOS"
## [1] "PHI"
## [1] "CAR"
## [1] "ANA" "VAN"
## [1] "MTL"
## [1] "PHI"
## [1] "COL"
## [1] "NSH"
## [1] "CAR"
## [1] "DAL" "MTL"
## [1] "T.B"
## [1] "EDM"
## [1] "L.A"
## [1] "DAL"
## [1] "DET"
## [1] "MTL"
## [1] "CBJ"
## [1] "ANA"
## [1] "TOR"
## [1] "PHI"
## [1] "ARI"
## [1] "CAR"
## [1] "PHI"
## [1] "CAR"
## [1] "MIN"
## [1] "BOS"
## [1] "MIN"
## [1] "COL"
## [1] "EDM"
## [1] "NYI"
## [1] "BUF"
## [1] "DAL"
## [1] "CBJ"
## [1] "NYI"
## [1] "CHI"
## [1] "EDM"
## [1] "MIN"
## [1] "OTT"
## [1] "VAN"
## [1] "CGY"
## [1] "S.J"
## [1] "WPG"
## [1] "ANA"
## [1] "S.J"
## [1] "VAN"
## [1] "L.A" "MTL"
## [1] "MTL"
## [1] "WSH"
## [1] "NSH"
## [1] "L.A"
## [1] "ARI"
## [1] "DET"
## [1] "WSH"
## [1] "N.J"
## [1] "WSH"
## [1] "T.B"
## [1] "DET"
## [1] "PHI"
## [1] "CHI"
## [1] "N.J" "NSH" "VAN"
## [1] "ARI"
## [1] "ANA"
## [1] "FLA"
## [1] "ANA"
## [1] "CGY"
## [1] "CBJ"
## [1] "CBJ"
## [1] "OTT"
## [1] "N.J"
## [1] "MTL"
## [1] "WPG"
## [1] "VAN"
## [1] "COL"
## [1] "CHI"
## [1] "CHI"
## [1] "FLA"
## [1] "L.A"
## [1] "S.J"
## [1] "CGY"
## [1] "CAR"
## [1] "CAR"
## [1] "ARI"
## [1] "N.J"
## [1] "PHI"
## [1] "S.J"
## [1] "TOR"
## [1] "L.A"
## [1] "BUF"
## [1] "WPG"
## [1] "NYI"
## [1] "MIN"
## [1] "MIN"
## [1] "OTT" "VAN"
## [1] "BUF"
## [1] "CHI"
## [1] "PHI"
## [1] "NSH"
## [1] "PHI"
## [1] "BOS"
## [1] "T.B"
## [1] "CGY"
## [1] "MIN"
## [1] "FLA"
## [1] "L.A"
## [1] "N.J"
## [1] "COL" "S.J"
## [1] "NYI"
## [1] "COL" "MTL"
## [1] "NYI"
## [1] "COL"
## [1] "FLA"
## [1] "CBJ"
## [1] "BOS"
## [1] "PIT"
## [1] "NSH"
## [1] "PIT"
## [1] "PIT"
## [1] "N.J"
## [1] "EDM"
## [1] "STL"
## [1] "BOS"
## [1] "MIN"
## [1] "CGY"
## [1] "MTL"
## [1] "L.A"
## [1] "FLA"
## [1] "BOS"
## [1] "NYR"
## [1] "ARI" "CGY"
## [1] "WSH"
## [1] "S.J"
## [1] "S.J"
## [1] "BUF"
## [1] "N.J"
## [1] "T.B"
## [1] "EDM"
## [1] "CGY"
## [1] "OTT"
## [1] "ANA"
## [1] "PIT"
## [1] "MTL"
## [1] "CAR"
## [1] "CBJ"
## [1] "PHI"
## [1] "T.B"
## [1] "BOS"
## [1] "DAL"
## [1] "TOR"
## [1] "T.B"
## [1] "STL" "WSH"
## [1] "FLA" "TOR"
## [1] "CAR"
## [1] "FLA" "NYR"
## [1] "PHI"
## [1] "NYI"
## [1] "DET"
## [1] "DAL"
## [1] "EDM"
## [1] "BOS"
## [1] "VAN"
## [1] "OTT"
## [1] "DAL"
## [1] "MIN"
## [1] "VAN"
## [1] "OTT"
## [1] "CBJ"
## [1] "WPG"
## [1] "DET"
## [1] "PIT"
## [1] "BOS"
## [1] "S.J"
## [1] "BUF"
## [1] "PHI"
## [1] "L.A"
## [1] "ARI"
## [1] "L.A"
## [1] "CGY"
## [1] "S.J"
## [1] "WPG"
## [1] "NSH"
## [1] "MIN"
## [1] "COL"
## [1] "CAR"
## [1] "ANA"
## [1] "VAN"
## [1] "DET"
## [1] "ANA"
## [1] "MIN"
## [1] "ANA"
## [1] "BOS"
## [1] "DAL"
## [1] "WSH"
## [1] "NSH"
## [1] "COL"
## [1] "OTT"
## [1] "CBJ"
## [1] "DAL"
## [1] "PHI"
## [1] "CBJ"
## [1] "CGY"
## [1] "CGY"
## [1] "VAN"
## [1] "BOS"
## [1] "CHI"
## [1] "WSH"
## [1] "N.J"
## [1] "COL" "NSH"
## [1] "CGY"
## [1] "OTT"
## [1] "NYR"
## [1] "N.J"
## [1] "NYI"
## [1] "FLA"
## [1] "MIN"
## [1] "BOS"
## [1] "CBJ"
## [1] "N.J"
## [1] "ANA"
## [1] "EDM"
## [1] "S.J"
## [1] "CAR"
## [1] "MIN"
## [1] "T.B"
## [1] "BUF"
## [1] "ANA"
## [1] "MTL" "T.B"
## [1] "DET"
## [1] "MIN"
## [1] "CBJ"
## [1] "EDM"
## [1] "DET" "MTL"
## [1] "WSH"
## [1] "BUF"
## [1] "DET"
## [1] "PHI"
## [1] "NSH" "STL"
## [1] "PHI"
## [1] "ANA"
## [1] "DET"
## [1] "STL"
## [1] "WPG"
## [1] "BUF"
## [1] "DET"
## [1] "L.A"
## [1] "DET"
## [1] "S.J"
## [1] "BOS" "WPG"
## [1] "FLA"
## [1] "S.J"
## [1] "MIN"
## [1] "FLA"
## [1] "DET" "TOR"
## [1] "COL"
## [1] "WPG"
## [1] "N.J"
## [1] "S.J"
## [1] "WSH"
## [1] "T.B"
## [1] "N.J"
## [1] "ANA"
## [1] "NYR"
## [1] "DET"
## [1] "BOS"
## [1] "TOR"
## [1] "PIT"
## [1] "DET"
## [1] "CAR" "STL"
## [1] "FLA"
## [1] "L.A"
## [1] "BUF"
## [1] "CHI"
## [1] "NSH"
## [1] "CAR"
## [1] "OTT"
## [1] "MTL"
## [1] "CHI"
## [1] "ARI"
## [1] "CAR"
## [1] "CAR"
## [1] "DET"
## [1] "CBJ" "DAL"
## [1] "PHI" "PIT"
## [1] "S.J"
## [1] "NSH"
## [1] "WPG"
## [1] "ANA"
## [1] "PIT"
## [1] "T.B"
## [1] "EDM"
## [1] "CBJ"
## [1] "NYR"
## [1] "COL"
## [1] "CAR"
## [1] "BUF"
## [1] "EDM" "MTL"
## [1] "NSH"
## [1] "NYR"
## [1] "ARI" "MIN"
## [1] "ARI"
## [1] "T.B"
## [1] "PIT"
## [1] "BOS"
## [1] "OTT"
## [1] "N.J"
## [1] "OTT"
## [1] "NSH"
## [1] "STL"
## [1] "DAL"
## [1] "NYI"
## [1] "N.J"
## [1] "TOR"
## [1] "TOR"
## [1] "VAN"
## [1] "OTT" "S.J"
## [1] "BOS"
## [1] "T.B"
## [1] "EDM"
## [1] "STL"
## [1] "ARI"
## [1] "BUF"
## [1] "NSH"
## [1] "MTL"
## [1] "ARI" "NYR"
## [1] "N.J" "NSH"
## [1] "DAL"
## [1] "ANA"
## [1] "MTL"
## [1] "ANA"
## [1] "DAL"
## [1] "TOR"
## [1] "L.A"
## [1] "TOR"
## [1] "PIT"
## [1] "PHI"
## [1] "BOS"
## [1] "DAL"
## [1] "NSH"
## [1] "S.J"
## [1] "ARI"
## [1] "PIT"
## [1] "NSH"
## [1] "PHI"
## [1] "NSH"
## [1] "PIT"
## [1] "TOR"
## [1] "DAL"
## [1] "CBJ"
## [1] "VAN"
## [1] "MTL"
## [1] "CHI"
## [1] "NYI"
## [1] "ARI"
## [1] "L.A"
## [1] "BUF"
## [1] "OTT"
## [1] "ARI"
## [1] "DAL"
## [1] "L.A"
## [1] "WSH"
## [1] "CAR"
## [1] "CHI"
## [1] "NYR"
## [1] "EDM"
## [1] "MTL"
## [1] "VAN"
## [1] "WSH"
## [1] "BUF"
## [1] "ANA"
## [1] "COL"
## [1] "DAL"
## [1] "FLA"
## [1] "STL"
## [1] "FLA" "NYR"
## [1] "FLA"
## [1] "TOR"
## [1] "N.J"
## [1] "NYI"
## [1] "PIT"
## [1] "L.A"
## [1] "ARI"
## [1] "NSH"
## [1] "PIT"
## [1] "CAR" "PIT"
## [1] "COL" "MTL"
## [1] "ANA"
## [1] "BUF"
## [1] "NSH"
## [1] "OTT"
## [1] "NSH"
## [1] "CHI"
## [1] "STL"
## [1] "MTL"
## [1] "ARI"
## [1] "MIN"
## [1] "PIT"
## [1] "BUF"
## [1] "WSH"
## [1] "CAR"
## [1] "T.B"
## [1] "PHI"
## [1] "BUF"
## [1] "N.J"
## [1] "WPG"
## [1] "CBJ"
## [1] "MTL"
## [1] "ARI" "TOR"
## [1] "DAL" "MTL"
## [1] "NYI"
## [1] "BUF"
## [1] "MIN"
## [1] "VAN"
## [1] "DET" "FLA"
## [1] "PIT"
## [1] "BOS"
## [1] "TOR"
## [1] "CBJ" "N.J"
## [1] "COL" "TOR"
## [1] "N.J"
## [1] "VAN"
## [1] "VAN"
## [1] "PIT"
## [1] "CBJ"
## [1] "PHI"
## [1] "BOS"
## [1] "PIT"
## [1] "ANA"
## [1] "BOS"
## [1] "FLA"
## [1] "STL"
## [1] "STL"
## [1] "OTT"
## [1] "BUF"
## [1] "COL"
## [1] "ANA"
## [1] "COL"
## [1] "EDM"
## [1] "OTT"
## [1] "NSH"
## [1] "CGY"
## [1] "N.J" "NSH"
## [1] "BUF"
## [1] "NYI"
## [1] "CGY"
## [1] "FLA"
## [1] "S.J"
## [1] "ANA" "N.J"
## [1] "WPG"
## [1] "PIT"
## [1] "TOR"
## [1] "N.J"
## [1] "CAR"
## [1] "FLA"
## [1] "NYR"
## [1] "T.B" "TOR"
## [1] "FLA"
## [1] "ANA" "FLA"
## [1] "CGY" "OTT"
## [1] "NSH"
## [1] "CBJ"
## [1] "N.J"
## [1] "MTL"
## [1] "STL"
## [1] "T.B"
## [1] "WSH"
## [1] "FLA"
## [1] "NYR"
## [1] "VAN"
## [1] "CHI"
## [1] "NYI"
## [1] "N.J"
## [1] "TOR"
## [1] "PIT" "TOR"
## [1] "COL"
## [1] "NYR"
## [1] "NYI"
## [1] "MTL"
## [1] "NSH"
## [1] "PHI"
## [1] "DET"
## [1] "N.J"
## [1] "PHI"
## [1] "BOS"
## [1] "COL"
## [1] "CAR"
## [1] "MTL"
## [1] "T.B"
## [1] "WPG"
## [1] "WSH"
## [1] "CBJ"
## [1] "VAN"
## [1] "ARI"
## [1] "CGY"
## [1] "WSH"
## [1] "NYR"
## [1] "EDM"
## [1] "OTT"
## [1] "EDM"
## [1] "T.B"
## [1] "MIN"
## [1] "DET"
## [1] "L.A"
## [1] "COL"
## [1] "BOS"
## [1] "ARI"
## [1] "CBJ"
## [1] "VAN"
## [1] "BUF"
## [1] "CGY"
## [1] "WPG"
## [1] "MTL"
## [1] "CGY"
## [1] "STL"
## [1] "STL"
## [1] "MTL"
## [1] "COL"
## [1] "MIN"
## [1] "WPG"
## [1] "PHI" "T.B"
## [1] "CAR" "OTT"
## [1] "STL"
## [1] "DAL"
## [1] "WSH"
## [1] "BUF"
## [1] "NYI"
## [1] "MIN"
## [1] "S.J"
## [1] "DAL"
## [1] "NYI"
## [1] "EDM"
## [1] "NYR"
## [1] "NSH"
## [1] "MTL"
## [1] "T.B"
## [1] "STL"
## [1] "T.B"
## [1] "T.B"
## [1] "STL"
## [1] "PHI"
## [1] "NYR"
## [1] "VAN"
## [1] "WPG"
## [1] "NYR"
## [1] "WSH"
## [1] "N.J"
## [1] "STL"
## [1] "CHI"
## [1] "T.B"
## [1] "CAR"
## [1] "OTT"
## [1] "PHI"
## [1] "PIT"
## [1] "VAN"
## [1] "VAN"
## [1] "WPG"
## [1] "N.J"
## [1] "FLA"
## [1] "CGY"
## [1] "CAR"
## [1] "STL"
## [1] "NSH"
## [1] "ARI"
## [1] "ANA"
## [1] "NYI"
## [1] "ANA"
## [1] "OTT"
## [1] "PIT"
## [1] "CAR"
## [1] "NYI"
## [1] "NSH"
## [1] "NSH"
## [1] "WPG"
## [1] "CBJ"
## [1] "NSH"
## [1] "DET"
## [1] "NYI"
## [1] "S.J"
## [1] "NYI"
## [1] "VAN"
## [1] "L.A"
## [1] "MIN"
## [1] "S.J"
## [1] "MTL"
## [1] "STL"
## [1] "L.A"
## [1] "WSH"
## [1] "EDM"
## [1] "MTL"
## [1] "NYR"
## [1] "L.A"
## [1] "ARI" "WPG"
## [1] "DAL"
## [1] "OTT"
## [1] "NSH"
## [1] "ARI"
## [1] "N.J"
## [1] "MTL"
## [1] "PHI"
## [1] "L.A"
## [1] "STL" "WSH"
## [1] "CGY"
## [1] "PIT"
## [1] "NYI"
## [1] "N.J"
## [1] "DAL"
## [1] "COL" "MTL"
## [1] "BUF"
## [1] "EDM"
## [1] "FLA"
## [1] "COL"
## [1] "BOS"
## [1] "WSH"
## [1] "NYR"
## [1] "S.J"
## [1] "NYR"
## [1] "DAL"
## [1] "WPG"
## [1] "TOR"
## [1] "VAN"
## [1] "BUF"
## [1] "ANA"
## [1] "NSH"
## [1] "N.J"
## [1] "NSH"
## [1] "DET"
## [1] "BOS"
## [1] "DET"
## [1] "EDM"
## [1] "STL"
## [1] "L.A"
## [1] "DET" "NYR"
## [1] "CHI"
## [1] "WSH"
## [1] "ARI"
## [1] "CHI"
## [1] "STL"
## [1] "CBJ"
## [1] "PIT"
## [1] "ARI"
## [1] "BOS"
## [1] "MTL"
## [1] "S.J"
## [1] "COL" "L.A"
## [1] "CHI"
## [1] "MIN"
## [1] "T.B"
## [1] "CHI"
## [1] "T.B"
## [1] "CBJ"
## [1] "DAL"
## [1] "T.B"
## [1] "PIT"
## [1] "FLA" "T.B"
## [1] "ANA"
## [1] "OTT"
## [1] "PHI"
## [1] "MIN"
## [1] "EDM"
## [1] "WSH"
## [1] "NYI"
## [1] "S.J"
## [1] "DAL"
## [1] "DET"
## [1] "BUF"
## [1] "BUF" "NSH"
## [1] "ARI"
## [1] "EDM"
## [1] "BUF"
## [1] "S.J"
## [1] "DET"
## [1] "OTT"
## [1] "OTT"
## [1] "NYR"
## [1] "MTL"
## [1] "BUF"
## [1] "ANA"
## [1] "STL"
## [1] "DET"
## [1] "PIT"
## [1] "BOS"
## [1] "WPG"
## [1] "COL"
## [1] "NYR"
## [1] "EDM"
## [1] "DAL"
## [1] "NYR"
## [1] "L.A"
## [1] "STL"
## [1] "DAL"
## [1] "ANA"
## [1] "CAR"
## [1] "TOR"
## [1] "WSH"
## [1] "CGY"
## [1] "OTT"
## [1] "BUF"
## [1] "N.J"
## [1] "NYI"
## [1] "COL"
## [1] "TOR"
## [1] "CHI"
## [1] "ARI"
## [1] "WSH"
## [1] "L.A"
## [1] "T.B" "TOR"
## [1] "EDM"
## [1] "PHI"
## [1] "MTL"
## [1] "MTL"
## [1] "COL"
## [1] "ANA"
## [1] "CHI"
## [1] "MIN"
## [1] "OTT"
## [1] "STL"
## [1] "NYR" "OTT"
## [1] "EDM" "MTL"
## [1] "VAN"
## [1] "L.A"
## [1] "CAR"
## [1] "CGY"
## [1] "MTL"
## [1] "ANA"
## [1] "COL"
## [1] "COL"
## [1] "PIT"
## [1] "CGY"
## [1] "VAN"
## [1] "ANA"
## [1] "OTT"
## [1] "T.B"
## [1] "CGY" "OTT"
## [1] "WPG"
## [1] "NSH"
## [1] "TOR"
## [1] "PIT"
## [1] "S.J"
## [1] "COL"
## [1] "ARI"
## [1] "EDM"
## [1] "MIN"
## [1] "NYI"
## [1] "TOR"
## [1] "COL"
## [1] "DAL"
## [1] "CHI" "DET"
## [1] "FLA"
## [1] "EDM"
## [1] "CGY"
## [1] "PIT"
## [1] "ANA"
## [1] "EDM"
## [1] "EDM"
## [1] "N.J"
## [1] "VAN"
## [1] "WPG"
## [1] "S.J"
## [1] "L.A"
## [1] "WPG"
## [1] "NYR"
## [1] "WPG"
## [1] "STL"
## [1] "WPG"
## [1] "NYR"
## [1] "FLA"
## [1] "ARI" "MIN"
## [1] "T.B"
## [1] "BOS"
## [1] "S.J" "VAN"
## [1] "ARI"
## [1] "COL"
## [1] "WPG"
## [1] "DET"
## [1] "BUF"
## [1] "FLA"
## [1] "BUF"
## [1] "FLA"
## [1] "DET"
## [1] "DAL"
## [1] "COL"
## [1] "CBJ" "N.J"
## [1] "DET"
## [1] "CGY"
## [1] "NSH"
## [1] "PIT" "TOR"
## [1] "EDM"
## [1] "MIN"
## [1] "ANA"
## [1] "CHI"
## [1] "T.B"
## [1] "CHI" "DAL"
## [1] "MTL"
## [1] "NYR"
## [1] "DET"
## [1] "STL"
## [1] "L.A"
## [1] "S.J" "VAN"
## [1] "MIN"
## [1] "NYR"
## [1] "OTT"
## [1] "TOR"
## [1] "MIN"
## [1] "ANA" "DAL"
## [1] "BUF"
## [1] "BOS"
## [1] "ARI"
## [1] "N.J"
## [1] "WSH"
## [1] "T.B"
## [1] "BUF"
## [1] "CAR"
## [1] "NYI"
## [1] "PHI"
## [1] "CGY"
## [1] "NYI"
## [1] "NYR"
## [1] "WSH"
## [1] "WPG"
## [1] "NYI"
## [1] "OTT"
## [1] "OTT"
## [1] "CBJ"
## [1] "CAR"
## [1] "MIN"
## [1] "FLA"
## [1] "VAN"
## [1] "VAN"
## [1] "CBJ"
## [1] "S.J"
## [1] "WPG"
## [1] "WPG"
## [1] "CHI"
## [1] "PHI"
## [1] "ANA"
## [1] "WSH"
## [1] "PIT"
## [1] "DET"
## [1] "CAR"
## [1] "ARI"
## [1] "PIT"
## [1] "VAN"
## [1] "CBJ"
## [1] "VAN"
## [1] "CGY"
## [1] "CHI"
## [1] "OTT"
# Show the first six rows of all_data (head()'s default) as a quick check.
print(head(all_data, n = 6L))
##       Born         City Pr.St Cntry Nat Ht  Wt DftYr DftRd Ovrl Hand  Last.Name First.Name Position Team GP  G  A A1 A2 PTS X...  E... PIM Shifts    TOI   TOIX TOI.GP TOI.GP.1 TOI. IPP. SH.   SV.
## 1 97-01-30 Sainte-Marie    QC   CAN CAN 74 190  2015     1   18    L     Chabot     Thomas        D  OTT  1  0  0  0  0   0   -2   0.0   0     13    429    7.2   7.15     7.16 15.2  0.0 0.0 0.750
## 2 93-12-21       Ottawa    ON   CAN CAN 74 207  2012     1   15    R       Ceci       Cody        D  OTT 79  2 15  6  9  17  -11 -10.4  20   2418 109992 1826.2  23.20    23.17 39.0 30.4 7.4 0.915
## 3 88-04-16     St. Paul    MN   USA USA 72 218  2006     1    7    R     Okposo       Kyle       RW  BUF 65 19 26 13 13  45   -7  -1.4  24   1443  73983 1229.2  18.97    18.95 33.1 63.4 9.7 0.934
## 4 92-01-07       Ottawa    ON   CAN CAN 77 220  2010     1    3    R Gudbranson       Erik        D  VAN 30  1  5  5  0   6  -14  -5.3  18    765  36603  607.9  20.33    20.31 36.1 37.5 6.2 0.897
## 5 94-03-29      Toronto    ON   CAN CAN 76 217  2012     1   16    R     Wilson        Tom       RW  WSH 82  7 12  4  8  19    9   4.1 133   1453  63592 1059.7  12.93    12.93 23.5 61.3 7.8 0.917
## 6 79-05-23    Strathroy    ON   CAN CAN 70 192  1997     6  156    L   Campbell      Brian        D  CHI 80  5 12  6  6  17   12   0.7  24   1896  88462 1473.7  18.43    18.43 32.4 23.9 9.8 0.936
##    PDO F.60  A.60 Pct. Diff Diff.60 iCF iCF.1 iFF iSF iSF.1 iSF.2  ixG iSCF iRB iRS iDS sDist sDist.1  Pass iHF iHF.1 iHA iHDf iMiss iGVA iTKA iBLK iGVA.1 iTKA.1 iBLK.1 BLK. iFOW iFOL iFOW.1 iFOL.1
## 1  750 0.00 16.74  0.0   -2  -16.74   2     2   2   1     1     1  0.0    0   0   0   0  43.0    49.3   0.0   1     1   0    1     1    1    0    0      1      0      0  0.0    0    0      0      0
## 2  989 1.84  2.79 39.7  -29   -0.95 287   287 197 143   143   143  6.1    7   7   9  16  52.4    46.3 138.1 111   111 154  -43    54   74   22  159     74     22    159  8.0    1    0      1      0
## 3 1031 3.47  1.95 64.0   31    1.51 283   283 212 155   156   156 17.4   64  16  20  36  28.4    26.3 196.8  53    53  68  -15    57   36   26   25     36     26     25  2.4   54   45     54     45
## 4  959 1.58  3.45 31.4  -19   -1.88  88    88  55  40    40    40  1.4    2   1   4   5  55.1    51.0 153.0  66    66  66    0    15   23    4   44     23      4     44  7.3    0    0      0      0
## 5  995 1.76  2.32 43.1  -10   -0.57 166   166 118  95    95    95  9.3   35   8  10  18  30.9    26.4  96.3 239   239 134  105    23   21   36   44     21     36     44  4.4    3    7      3      7
## 6 1033 2.89  1.91 60.2   24    0.98 171   171 110  75    74    75  4.5    7   2   3   5  46.1    41.9  95.7  43    43 157 -114    35   59   11   83     59     11     83  6.5    0    0      0      0
##     FO. X.FOT dzFOW dzFOL nzFOW nzFOL ozFOW ozFOL FOW.Up FOL.Up FOW.Down FOL.Down FOW.Close FOL.Close OTG X1G GWG ENG PSG PSA G.Bkhd G.Dflct G.Slap G.Snap G.Tip G.Wrap G.Wrst CBar Post Over Wide
## 1   0.0   0.0     0     0     0     0     0     0      0      0        0        0         0         0   0   0   0   0   0   0      0       0      0      0     0      0      0    0    0    0    1
## 2 100.0   0.1     1     0     0     0     0     0      0      0        0        0         1         0   0   0   0   0   0   0      0       0      1      0     0      0      1    0    1    2   51
## 3  54.5   7.4     9     6    10    11    35    28     13     10       21       16        37        33   1   5   2   0   0   0      5       2      0      3     0      0      9    0    2    4   51
## 4   0.0   0.0     0     0     0     0     0     0      0      0        0        0         0         0   0   0   1   0   0   0      0       0      0      0     0      0      1    0    0    0   15
## 5  30.0   1.0     1     1     0     2     2     4      3      1        0        3         2         6   0   2   0   1   0   0      0       0      0      0     1      0      6    0    2    1   20
## 6   0.0   0.0     0     0     0     0     0     0      0      0        0        0         0         0   0   1   1   0   0   0      0       0      2      0     0      0      2    0    1    6   28
##   S.Bkhd S.Dflct S.Slap S.Snap S.Tip S.Wrap S.Wrst iPenT iPenD iPENT iPEND iPenDf   NPD Min Maj Match Misc Game   CF   CA   FF   FA  SF  SA  xGF  xGA SCF SCA GF GA RBF RBA RSF RSA DSF DSA FOW FOL  HF
## 1      0       0      1      0     0      0      0     0     0     0     0      0   0.0   0   0     0    0    0    9   12    8   10   5   8  0.5  0.9   2   3  0  2   1   1   0   1   1   2   4   5   1
## 2      2       0     49     12     0      1     79    10     6    10     5     -4   2.2  10   0     0    0    0 1433 1992 1038 1423 757 997 62.0 88.8 197 280 56 85  68  82  79  94 147 176 949 939 749
## 3     19       3      3     20     8      2    101    12    10    11     8     -2  -0.5  12   0     0    0    0 1301 1051  986  826 734 606 70.8 46.4 235 133 71 40  60  34  76  52 136  86 739 600 340
## 4      0       0     18      3     0      0     19     6     7     6     6      1   2.7   4   2     0    0    0  460  605  339  467 259 340 22.0 33.6  80 130 16 35  27  20  29  32  56  52 324 328 198
## 5      7       2      3     10    11      1     61    44    33    40    29    -11 -14.3  33   9     0    1    1  766  992  546  720 398 495 33.5 47.5 124 159 31 41  30  37  43  53  73  90 528 490 512
## 6      2       0     32      9     1      0     30    12    11    12     8     -1   5.4  12   0     0    0    0 1356 1281  971  972 728 730 62.9 59.9 210 197 71 47  30  56  58  85  88 141 570 667 348
##    HA GVA TKA PENT PEND  OPS  DPS   PS    OTOI Grit  DAP  Pace   GS  GS.G ANA FLA N.J VAN ARI CGY MIN NYR TOR BOS WPG BUF CAR OTT PIT STL CBJ DAL CHI COL MTL NSH S.J DET EDM L.A T.B NYI PHI WSH
## 1   2   1   1    1    1  0.0 -0.2 -0.2   40.03    1  0.0 175.7 -0.4 -0.38   0   0   0   0   0   0   0   0   0   0   0   0   0   1   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
## 2 671 284 197  104   98 -0.2  3.4  3.2 2850.59  290 13.3 112.5 14.1  0.18   0   0   0   0   0   0   0   0   0   0   0   0   0   1   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
## 3 351 168 129   56   70  3.7  1.3  5.0 2486.75  102  6.6 114.8 36.8  0.57   0   0   0   0   0   0   0   0   0   0   0   1   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
## 4 197  86  59   26   22  0.0  0.4  0.5 1074.41  130 17.5 105.1  5.9  0.20   0   0   0   1   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
## 5 422 157 126   88   68 -0.1  1.4  1.3 3459.09  425  8.3  99.5 21.8  0.27   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   1
## 6 707 223 168   76   60  0.6  3.7  4.3 3069.81  150  4.5 107.4 20.8  0.26   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   1   0   0   0   0   0   0   0   0   0   0   0

Positions played

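Split the categorical Position column into one 0/1 indicator column per position (C, D, LW, RW). A player listed at several positions (separated by "/") gets a 1 in each of the matching columns.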

# Make position boolean columns
# Step 1: collect the distinct single positions found in the Position column.
# A Position value may combine several positions separated by "/".
#
# Use the factor levels when Position is a factor; otherwise fall back to the
# sorted unique values. levels() returns NULL for a character column, which is
# what read.csv() produces by default since R 4.0.
position_values <- if (is.factor(all_data$Position)) {
    levels(all_data$Position)
} else {
    sort(unique(as.character(all_data$Position)))
}

# Split every combined value on "/" and keep each position once, in order of
# first appearance.
pos <- unique(unlist(strsplit(position_values, "/", fixed = TRUE)))
print(pos)
## [1] "C"  "D"  "LW" "RW"
# Add one numeric 0/1 column per position name in `pos`: 1 when the player's
# Position value contains that position, 0 otherwise.

# Split each player's Position on "/" once, up front; a player may be listed
# at several positions.
player_positions <- strsplit(as.character(all_data$Position), "/",
    fixed = TRUE)

# Fill each indicator column in a single vectorised assignment instead of
# modifying the data frame cell by cell. Rows with a missing Position get 0
# in every column.
for (position in pos) {
    plays_position <- vapply(player_positions, function(p) position %in% p,
        logical(1))
    all_data[[position]] <- as.numeric(plays_position)
}
print(head(all_data))
##       Born         City Pr.St Cntry Nat Ht  Wt DftYr DftRd Ovrl Hand  Last.Name First.Name Position Team GP  G  A A1 A2 PTS X...  E... PIM Shifts    TOI   TOIX TOI.GP TOI.GP.1 TOI. IPP. SH.   SV.
## 1 97-01-30 Sainte-Marie    QC   CAN CAN 74 190  2015     1   18    L     Chabot     Thomas        D  OTT  1  0  0  0  0   0   -2   0.0   0     13    429    7.2   7.15     7.16 15.2  0.0 0.0 0.750
## 2 93-12-21       Ottawa    ON   CAN CAN 74 207  2012     1   15    R       Ceci       Cody        D  OTT 79  2 15  6  9  17  -11 -10.4  20   2418 109992 1826.2  23.20    23.17 39.0 30.4 7.4 0.915
## 3 88-04-16     St. Paul    MN   USA USA 72 218  2006     1    7    R     Okposo       Kyle       RW  BUF 65 19 26 13 13  45   -7  -1.4  24   1443  73983 1229.2  18.97    18.95 33.1 63.4 9.7 0.934
## 4 92-01-07       Ottawa    ON   CAN CAN 77 220  2010     1    3    R Gudbranson       Erik        D  VAN 30  1  5  5  0   6  -14  -5.3  18    765  36603  607.9  20.33    20.31 36.1 37.5 6.2 0.897
## 5 94-03-29      Toronto    ON   CAN CAN 76 217  2012     1   16    R     Wilson        Tom       RW  WSH 82  7 12  4  8  19    9   4.1 133   1453  63592 1059.7  12.93    12.93 23.5 61.3 7.8 0.917
## 6 79-05-23    Strathroy    ON   CAN CAN 70 192  1997     6  156    L   Campbell      Brian        D  CHI 80  5 12  6  6  17   12   0.7  24   1896  88462 1473.7  18.43    18.43 32.4 23.9 9.8 0.936
##    PDO F.60  A.60 Pct. Diff Diff.60 iCF iCF.1 iFF iSF iSF.1 iSF.2  ixG iSCF iRB iRS iDS sDist sDist.1  Pass iHF iHF.1 iHA iHDf iMiss iGVA iTKA iBLK iGVA.1 iTKA.1 iBLK.1 BLK. iFOW iFOL iFOW.1 iFOL.1
## 1  750 0.00 16.74  0.0   -2  -16.74   2     2   2   1     1     1  0.0    0   0   0   0  43.0    49.3   0.0   1     1   0    1     1    1    0    0      1      0      0  0.0    0    0      0      0
## 2  989 1.84  2.79 39.7  -29   -0.95 287   287 197 143   143   143  6.1    7   7   9  16  52.4    46.3 138.1 111   111 154  -43    54   74   22  159     74     22    159  8.0    1    0      1      0
## 3 1031 3.47  1.95 64.0   31    1.51 283   283 212 155   156   156 17.4   64  16  20  36  28.4    26.3 196.8  53    53  68  -15    57   36   26   25     36     26     25  2.4   54   45     54     45
## 4  959 1.58  3.45 31.4  -19   -1.88  88    88  55  40    40    40  1.4    2   1   4   5  55.1    51.0 153.0  66    66  66    0    15   23    4   44     23      4     44  7.3    0    0      0      0
## 5  995 1.76  2.32 43.1  -10   -0.57 166   166 118  95    95    95  9.3   35   8  10  18  30.9    26.4  96.3 239   239 134  105    23   21   36   44     21     36     44  4.4    3    7      3      7
## 6 1033 2.89  1.91 60.2   24    0.98 171   171 110  75    74    75  4.5    7   2   3   5  46.1    41.9  95.7  43    43 157 -114    35   59   11   83     59     11     83  6.5    0    0      0      0
##     FO. X.FOT dzFOW dzFOL nzFOW nzFOL ozFOW ozFOL FOW.Up FOL.Up FOW.Down FOL.Down FOW.Close FOL.Close OTG X1G GWG ENG PSG PSA G.Bkhd G.Dflct G.Slap G.Snap G.Tip G.Wrap G.Wrst CBar Post Over Wide
## 1   0.0   0.0     0     0     0     0     0     0      0      0        0        0         0         0   0   0   0   0   0   0      0       0      0      0     0      0      0    0    0    0    1
## 2 100.0   0.1     1     0     0     0     0     0      0      0        0        0         1         0   0   0   0   0   0   0      0       0      1      0     0      0      1    0    1    2   51
## 3  54.5   7.4     9     6    10    11    35    28     13     10       21       16        37        33   1   5   2   0   0   0      5       2      0      3     0      0      9    0    2    4   51
## 4   0.0   0.0     0     0     0     0     0     0      0      0        0        0         0         0   0   0   1   0   0   0      0       0      0      0     0      0      1    0    0    0   15
## 5  30.0   1.0     1     1     0     2     2     4      3      1        0        3         2         6   0   2   0   1   0   0      0       0      0      0     1      0      6    0    2    1   20
## 6   0.0   0.0     0     0     0     0     0     0      0      0        0        0         0         0   0   1   1   0   0   0      0       0      2      0     0      0      2    0    1    6   28
##   S.Bkhd S.Dflct S.Slap S.Snap S.Tip S.Wrap S.Wrst iPenT iPenD iPENT iPEND iPenDf   NPD Min Maj Match Misc Game   CF   CA   FF   FA  SF  SA  xGF  xGA SCF SCA GF GA RBF RBA RSF RSA DSF DSA FOW FOL  HF
## 1      0       0      1      0     0      0      0     0     0     0     0      0   0.0   0   0     0    0    0    9   12    8   10   5   8  0.5  0.9   2   3  0  2   1   1   0   1   1   2   4   5   1
## 2      2       0     49     12     0      1     79    10     6    10     5     -4   2.2  10   0     0    0    0 1433 1992 1038 1423 757 997 62.0 88.8 197 280 56 85  68  82  79  94 147 176 949 939 749
## 3     19       3      3     20     8      2    101    12    10    11     8     -2  -0.5  12   0     0    0    0 1301 1051  986  826 734 606 70.8 46.4 235 133 71 40  60  34  76  52 136  86 739 600 340
## 4      0       0     18      3     0      0     19     6     7     6     6      1   2.7   4   2     0    0    0  460  605  339  467 259 340 22.0 33.6  80 130 16 35  27  20  29  32  56  52 324 328 198
## 5      7       2      3     10    11      1     61    44    33    40    29    -11 -14.3  33   9     0    1    1  766  992  546  720 398 495 33.5 47.5 124 159 31 41  30  37  43  53  73  90 528 490 512
## 6      2       0     32      9     1      0     30    12    11    12     8     -1   5.4  12   0     0    0    0 1356 1281  971  972 728 730 62.9 59.9 210 197 71 47  30  56  58  85  88 141 570 667 348
##    HA GVA TKA PENT PEND  OPS  DPS   PS    OTOI Grit  DAP  Pace   GS  GS.G ANA FLA N.J VAN ARI CGY MIN NYR TOR BOS WPG BUF CAR OTT PIT STL CBJ DAL CHI COL MTL NSH S.J DET EDM L.A T.B NYI PHI WSH C D
## 1   2   1   1    1    1  0.0 -0.2 -0.2   40.03    1  0.0 175.7 -0.4 -0.38   0   0   0   0   0   0   0   0   0   0   0   0   0   1   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0 0 1
## 2 671 284 197  104   98 -0.2  3.4  3.2 2850.59  290 13.3 112.5 14.1  0.18   0   0   0   0   0   0   0   0   0   0   0   0   0   1   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0 0 1
## 3 351 168 129   56   70  3.7  1.3  5.0 2486.75  102  6.6 114.8 36.8  0.57   0   0   0   0   0   0   0   0   0   0   0   1   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0 0 0
## 4 197  86  59   26   22  0.0  0.4  0.5 1074.41  130 17.5 105.1  5.9  0.20   0   0   0   1   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0 0 1
## 5 422 157 126   88   68 -0.1  1.4  1.3 3459.09  425  8.3  99.5 21.8  0.27   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   1 0 0
## 6 707 223 168   76   60  0.6  3.7  4.3 3069.81  150  4.5 107.4 20.8  0.26   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   1   0   0   0   0   0   0   0   0   0   0   0 0 1
##   LW RW
## 1  0  0
## 2  0  0
## 3  0  1
## 4  0  0
## 5  0  1
## 6  0  0

Year, day and month of birth

The date of birth is stored as a single YY-MM-DD string. We split it into three numeric columns: year of birth, month of birth, and day of birth.

# Turn the Born column ("YY-MM-DD" strings) into three numeric columns:
# birth_year, birth_month and birth_day.
library(stringr)


# Objective: standardize year, month and day and create
# separate columns for each.
# bday_parts is a character matrix with one row per player and the columns
# year, month, day.
bday_parts <- str_split_fixed(as.character(all_data$Born), "-", 3)

# The year has only two digits, so the century has to be inferred: years
# below the cutoff are treated as 20xx (players born in the 21st century),
# all other years as 19xx.
century_cutoff <- 10
short_year <- as.numeric(bday_parts[, 1])

# Vectorised over all players; a missing or blank birth date yields NA
# instead of stopping with an error.
birth_year <- short_year + ifelse(short_year < century_cutoff, 2000, 1900)

all_data$birth_year <- birth_year  # Create separate column for YEAR & add to all_data
all_data$birth_month <- as.numeric(bday_parts[, 2])  # Create separate column for MONTH & add to all_data
all_data$birth_day <- as.numeric(bday_parts[, 3])  # Create separate column for DAY & add to all_data
head(all_data)
##       Born         City Pr.St Cntry Nat Ht  Wt DftYr DftRd Ovrl Hand  Last.Name First.Name Position Team GP  G  A A1 A2 PTS X...  E... PIM Shifts    TOI   TOIX TOI.GP TOI.GP.1 TOI. IPP. SH.   SV.
## 1 97-01-30 Sainte-Marie    QC   CAN CAN 74 190  2015     1   18    L     Chabot     Thomas        D  OTT  1  0  0  0  0   0   -2   0.0   0     13    429    7.2   7.15     7.16 15.2  0.0 0.0 0.750
## 2 93-12-21       Ottawa    ON   CAN CAN 74 207  2012     1   15    R       Ceci       Cody        D  OTT 79  2 15  6  9  17  -11 -10.4  20   2418 109992 1826.2  23.20    23.17 39.0 30.4 7.4 0.915
## 3 88-04-16     St. Paul    MN   USA USA 72 218  2006     1    7    R     Okposo       Kyle       RW  BUF 65 19 26 13 13  45   -7  -1.4  24   1443  73983 1229.2  18.97    18.95 33.1 63.4 9.7 0.934
## 4 92-01-07       Ottawa    ON   CAN CAN 77 220  2010     1    3    R Gudbranson       Erik        D  VAN 30  1  5  5  0   6  -14  -5.3  18    765  36603  607.9  20.33    20.31 36.1 37.5 6.2 0.897
## 5 94-03-29      Toronto    ON   CAN CAN 76 217  2012     1   16    R     Wilson        Tom       RW  WSH 82  7 12  4  8  19    9   4.1 133   1453  63592 1059.7  12.93    12.93 23.5 61.3 7.8 0.917
## 6 79-05-23    Strathroy    ON   CAN CAN 70 192  1997     6  156    L   Campbell      Brian        D  CHI 80  5 12  6  6  17   12   0.7  24   1896  88462 1473.7  18.43    18.43 32.4 23.9 9.8 0.936
##    PDO F.60  A.60 Pct. Diff Diff.60 iCF iCF.1 iFF iSF iSF.1 iSF.2  ixG iSCF iRB iRS iDS sDist sDist.1  Pass iHF iHF.1 iHA iHDf iMiss iGVA iTKA iBLK iGVA.1 iTKA.1 iBLK.1 BLK. iFOW iFOL iFOW.1 iFOL.1
## 1  750 0.00 16.74  0.0   -2  -16.74   2     2   2   1     1     1  0.0    0   0   0   0  43.0    49.3   0.0   1     1   0    1     1    1    0    0      1      0      0  0.0    0    0      0      0
## 2  989 1.84  2.79 39.7  -29   -0.95 287   287 197 143   143   143  6.1    7   7   9  16  52.4    46.3 138.1 111   111 154  -43    54   74   22  159     74     22    159  8.0    1    0      1      0
## 3 1031 3.47  1.95 64.0   31    1.51 283   283 212 155   156   156 17.4   64  16  20  36  28.4    26.3 196.8  53    53  68  -15    57   36   26   25     36     26     25  2.4   54   45     54     45
## 4  959 1.58  3.45 31.4  -19   -1.88  88    88  55  40    40    40  1.4    2   1   4   5  55.1    51.0 153.0  66    66  66    0    15   23    4   44     23      4     44  7.3    0    0      0      0
## 5  995 1.76  2.32 43.1  -10   -0.57 166   166 118  95    95    95  9.3   35   8  10  18  30.9    26.4  96.3 239   239 134  105    23   21   36   44     21     36     44  4.4    3    7      3      7
## 6 1033 2.89  1.91 60.2   24    0.98 171   171 110  75    74    75  4.5    7   2   3   5  46.1    41.9  95.7  43    43 157 -114    35   59   11   83     59     11     83  6.5    0    0      0      0
##     FO. X.FOT dzFOW dzFOL nzFOW nzFOL ozFOW ozFOL FOW.Up FOL.Up FOW.Down FOL.Down FOW.Close FOL.Close OTG X1G GWG ENG PSG PSA G.Bkhd G.Dflct G.Slap G.Snap G.Tip G.Wrap G.Wrst CBar Post Over Wide
## 1   0.0   0.0     0     0     0     0     0     0      0      0        0        0         0         0   0   0   0   0   0   0      0       0      0      0     0      0      0    0    0    0    1
## 2 100.0   0.1     1     0     0     0     0     0      0      0        0        0         1         0   0   0   0   0   0   0      0       0      1      0     0      0      1    0    1    2   51
## 3  54.5   7.4     9     6    10    11    35    28     13     10       21       16        37        33   1   5   2   0   0   0      5       2      0      3     0      0      9    0    2    4   51
## 4   0.0   0.0     0     0     0     0     0     0      0      0        0        0         0         0   0   0   1   0   0   0      0       0      0      0     0      0      1    0    0    0   15
## 5  30.0   1.0     1     1     0     2     2     4      3      1        0        3         2         6   0   2   0   1   0   0      0       0      0      0     1      0      6    0    2    1   20
## 6   0.0   0.0     0     0     0     0     0     0      0      0        0        0         0         0   0   1   1   0   0   0      0       0      2      0     0      0      2    0    1    6   28
##   S.Bkhd S.Dflct S.Slap S.Snap S.Tip S.Wrap S.Wrst iPenT iPenD iPENT iPEND iPenDf   NPD Min Maj Match Misc Game   CF   CA   FF   FA  SF  SA  xGF  xGA SCF SCA GF GA RBF RBA RSF RSA DSF DSA FOW FOL  HF
## 1      0       0      1      0     0      0      0     0     0     0     0      0   0.0   0   0     0    0    0    9   12    8   10   5   8  0.5  0.9   2   3  0  2   1   1   0   1   1   2   4   5   1
## 2      2       0     49     12     0      1     79    10     6    10     5     -4   2.2  10   0     0    0    0 1433 1992 1038 1423 757 997 62.0 88.8 197 280 56 85  68  82  79  94 147 176 949 939 749
## 3     19       3      3     20     8      2    101    12    10    11     8     -2  -0.5  12   0     0    0    0 1301 1051  986  826 734 606 70.8 46.4 235 133 71 40  60  34  76  52 136  86 739 600 340
## 4      0       0     18      3     0      0     19     6     7     6     6      1   2.7   4   2     0    0    0  460  605  339  467 259 340 22.0 33.6  80 130 16 35  27  20  29  32  56  52 324 328 198
## 5      7       2      3     10    11      1     61    44    33    40    29    -11 -14.3  33   9     0    1    1  766  992  546  720 398 495 33.5 47.5 124 159 31 41  30  37  43  53  73  90 528 490 512
## 6      2       0     32      9     1      0     30    12    11    12     8     -1   5.4  12   0     0    0    0 1356 1281  971  972 728 730 62.9 59.9 210 197 71 47  30  56  58  85  88 141 570 667 348
##    HA GVA TKA PENT PEND  OPS  DPS   PS    OTOI Grit  DAP  Pace   GS  GS.G ANA FLA N.J VAN ARI CGY MIN NYR TOR BOS WPG BUF CAR OTT PIT STL CBJ DAL CHI COL MTL NSH S.J DET EDM L.A T.B NYI PHI WSH C D
## 1   2   1   1    1    1  0.0 -0.2 -0.2   40.03    1  0.0 175.7 -0.4 -0.38   0   0   0   0   0   0   0   0   0   0   0   0   0   1   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0 0 1
## 2 671 284 197  104   98 -0.2  3.4  3.2 2850.59  290 13.3 112.5 14.1  0.18   0   0   0   0   0   0   0   0   0   0   0   0   0   1   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0 0 1
## 3 351 168 129   56   70  3.7  1.3  5.0 2486.75  102  6.6 114.8 36.8  0.57   0   0   0   0   0   0   0   0   0   0   0   1   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0 0 0
## 4 197  86  59   26   22  0.0  0.4  0.5 1074.41  130 17.5 105.1  5.9  0.20   0   0   0   1   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0 0 1
## 5 422 157 126   88   68 -0.1  1.4  1.3 3459.09  425  8.3  99.5 21.8  0.27   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   1 0 0
## 6 707 223 168   76   60  0.6  3.7  4.3 3069.81  150  4.5 107.4 20.8  0.26   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   1   0   0   0   0   0   0   0   0   0   0   0 0 1
##   LW RW birth_year birth_month birth_day
## 1  0  0       1997           1        30
## 2  0  0       1993          12        21
## 3  0  1       1988           4        16
## 4  0  0       1992           1         7
## 5  0  1       1994           3        29
## 6  0  0       1979           5        23

Country and Nationality of players

# split Cntry and Nat to boolean columns

# Append one numeric 0/1 indicator column per distinct value of
# data[[source_col]], named "<prefix>_<value>", and return the extended data
# frame.
#
# data:       data frame to extend.
# source_col: name of the categorical column to expand.
# prefix:     text placed before "_<value>" in each new column name.
add_indicator_columns <- function(data, source_col, prefix) {
    values <- as.character(data[[source_col]])
    # Use the factor levels when available; otherwise the sorted unique
    # values. levels() is NULL for character columns, the read.csv() default
    # since R 4.0.
    labels <- if (is.factor(data[[source_col]])) {
        levels(data[[source_col]])
    } else {
        sort(unique(values))
    }
    for (label in labels) {
        # %in% never returns NA, so rows with a missing value get 0.
        data[[paste(prefix, label, sep = "_")]] <- as.numeric(values %in% label)
    }
    data
}

# add columns with the country of birth options note the
# Estonia for Uncle Leo
all_data <- add_indicator_columns(all_data, "Cntry", "born")

# add columns with the nationality options
all_data <- add_indicator_columns(all_data, "Nat", "nation")

head(all_data)
##       Born         City Pr.St Cntry Nat Ht  Wt DftYr DftRd Ovrl Hand  Last.Name First.Name Position Team GP  G  A A1 A2 PTS X...  E... PIM Shifts    TOI   TOIX TOI.GP TOI.GP.1 TOI. IPP. SH.   SV.
## 1 97-01-30 Sainte-Marie    QC   CAN CAN 74 190  2015     1   18    L     Chabot     Thomas        D  OTT  1  0  0  0  0   0   -2   0.0   0     13    429    7.2   7.15     7.16 15.2  0.0 0.0 0.750
## 2 93-12-21       Ottawa    ON   CAN CAN 74 207  2012     1   15    R       Ceci       Cody        D  OTT 79  2 15  6  9  17  -11 -10.4  20   2418 109992 1826.2  23.20    23.17 39.0 30.4 7.4 0.915
## 3 88-04-16     St. Paul    MN   USA USA 72 218  2006     1    7    R     Okposo       Kyle       RW  BUF 65 19 26 13 13  45   -7  -1.4  24   1443  73983 1229.2  18.97    18.95 33.1 63.4 9.7 0.934
## 4 92-01-07       Ottawa    ON   CAN CAN 77 220  2010     1    3    R Gudbranson       Erik        D  VAN 30  1  5  5  0   6  -14  -5.3  18    765  36603  607.9  20.33    20.31 36.1 37.5 6.2 0.897
## 5 94-03-29      Toronto    ON   CAN CAN 76 217  2012     1   16    R     Wilson        Tom       RW  WSH 82  7 12  4  8  19    9   4.1 133   1453  63592 1059.7  12.93    12.93 23.5 61.3 7.8 0.917
## 6 79-05-23    Strathroy    ON   CAN CAN 70 192  1997     6  156    L   Campbell      Brian        D  CHI 80  5 12  6  6  17   12   0.7  24   1896  88462 1473.7  18.43    18.43 32.4 23.9 9.8 0.936
##    PDO F.60  A.60 Pct. Diff Diff.60 iCF iCF.1 iFF iSF iSF.1 iSF.2  ixG iSCF iRB iRS iDS sDist sDist.1  Pass iHF iHF.1 iHA iHDf iMiss iGVA iTKA iBLK iGVA.1 iTKA.1 iBLK.1 BLK. iFOW iFOL iFOW.1 iFOL.1
## 1  750 0.00 16.74  0.0   -2  -16.74   2     2   2   1     1     1  0.0    0   0   0   0  43.0    49.3   0.0   1     1   0    1     1    1    0    0      1      0      0  0.0    0    0      0      0
## 2  989 1.84  2.79 39.7  -29   -0.95 287   287 197 143   143   143  6.1    7   7   9  16  52.4    46.3 138.1 111   111 154  -43    54   74   22  159     74     22    159  8.0    1    0      1      0
## 3 1031 3.47  1.95 64.0   31    1.51 283   283 212 155   156   156 17.4   64  16  20  36  28.4    26.3 196.8  53    53  68  -15    57   36   26   25     36     26     25  2.4   54   45     54     45
## 4  959 1.58  3.45 31.4  -19   -1.88  88    88  55  40    40    40  1.4    2   1   4   5  55.1    51.0 153.0  66    66  66    0    15   23    4   44     23      4     44  7.3    0    0      0      0
## 5  995 1.76  2.32 43.1  -10   -0.57 166   166 118  95    95    95  9.3   35   8  10  18  30.9    26.4  96.3 239   239 134  105    23   21   36   44     21     36     44  4.4    3    7      3      7
## 6 1033 2.89  1.91 60.2   24    0.98 171   171 110  75    74    75  4.5    7   2   3   5  46.1    41.9  95.7  43    43 157 -114    35   59   11   83     59     11     83  6.5    0    0      0      0
##     FO. X.FOT dzFOW dzFOL nzFOW nzFOL ozFOW ozFOL FOW.Up FOL.Up FOW.Down FOL.Down FOW.Close FOL.Close OTG X1G GWG ENG PSG PSA G.Bkhd G.Dflct G.Slap G.Snap G.Tip G.Wrap G.Wrst CBar Post Over Wide
## 1   0.0   0.0     0     0     0     0     0     0      0      0        0        0         0         0   0   0   0   0   0   0      0       0      0      0     0      0      0    0    0    0    1
## 2 100.0   0.1     1     0     0     0     0     0      0      0        0        0         1         0   0   0   0   0   0   0      0       0      1      0     0      0      1    0    1    2   51
## 3  54.5   7.4     9     6    10    11    35    28     13     10       21       16        37        33   1   5   2   0   0   0      5       2      0      3     0      0      9    0    2    4   51
## 4   0.0   0.0     0     0     0     0     0     0      0      0        0        0         0         0   0   0   1   0   0   0      0       0      0      0     0      0      1    0    0    0   15
## 5  30.0   1.0     1     1     0     2     2     4      3      1        0        3         2         6   0   2   0   1   0   0      0       0      0      0     1      0      6    0    2    1   20
## 6   0.0   0.0     0     0     0     0     0     0      0      0        0        0         0         0   0   1   1   0   0   0      0       0      2      0     0      0      2    0    1    6   28
##   S.Bkhd S.Dflct S.Slap S.Snap S.Tip S.Wrap S.Wrst iPenT iPenD iPENT iPEND iPenDf   NPD Min Maj Match Misc Game   CF   CA   FF   FA  SF  SA  xGF  xGA SCF SCA GF GA RBF RBA RSF RSA DSF DSA FOW FOL  HF
## 1      0       0      1      0     0      0      0     0     0     0     0      0   0.0   0   0     0    0    0    9   12    8   10   5   8  0.5  0.9   2   3  0  2   1   1   0   1   1   2   4   5   1
## 2      2       0     49     12     0      1     79    10     6    10     5     -4   2.2  10   0     0    0    0 1433 1992 1038 1423 757 997 62.0 88.8 197 280 56 85  68  82  79  94 147 176 949 939 749
## 3     19       3      3     20     8      2    101    12    10    11     8     -2  -0.5  12   0     0    0    0 1301 1051  986  826 734 606 70.8 46.4 235 133 71 40  60  34  76  52 136  86 739 600 340
## 4      0       0     18      3     0      0     19     6     7     6     6      1   2.7   4   2     0    0    0  460  605  339  467 259 340 22.0 33.6  80 130 16 35  27  20  29  32  56  52 324 328 198
## 5      7       2      3     10    11      1     61    44    33    40    29    -11 -14.3  33   9     0    1    1  766  992  546  720 398 495 33.5 47.5 124 159 31 41  30  37  43  53  73  90 528 490 512
## 6      2       0     32      9     1      0     30    12    11    12     8     -1   5.4  12   0     0    0    0 1356 1281  971  972 728 730 62.9 59.9 210 197 71 47  30  56  58  85  88 141 570 667 348
##    HA GVA TKA PENT PEND  OPS  DPS   PS    OTOI Grit  DAP  Pace   GS  GS.G ANA FLA N.J VAN ARI CGY MIN NYR TOR BOS WPG BUF CAR OTT PIT STL CBJ DAL CHI COL MTL NSH S.J DET EDM L.A T.B NYI PHI WSH C D
## 1   2   1   1    1    1  0.0 -0.2 -0.2   40.03    1  0.0 175.7 -0.4 -0.38   0   0   0   0   0   0   0   0   0   0   0   0   0   1   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0 0 1
## 2 671 284 197  104   98 -0.2  3.4  3.2 2850.59  290 13.3 112.5 14.1  0.18   0   0   0   0   0   0   0   0   0   0   0   0   0   1   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0 0 1
## 3 351 168 129   56   70  3.7  1.3  5.0 2486.75  102  6.6 114.8 36.8  0.57   0   0   0   0   0   0   0   0   0   0   0   1   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0 0 0
## 4 197  86  59   26   22  0.0  0.4  0.5 1074.41  130 17.5 105.1  5.9  0.20   0   0   0   1   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0 0 1
## 5 422 157 126   88   68 -0.1  1.4  1.3 3459.09  425  8.3  99.5 21.8  0.27   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   1 0 0
## 6 707 223 168   76   60  0.6  3.7  4.3 3069.81  150  4.5 107.4 20.8  0.26   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   1   0   0   0   0   0   0   0   0   0   0   0 0 1
##   LW RW birth_year birth_month birth_day born_AUT born_CAN born_CHE born_CZE born_DEU born_DNK born_EST born_FIN born_FRA born_GBR born_HRV born_ITA born_LVA born_NOR born_RUS born_SVK born_SWE
## 1  0  0       1997           1        30        0        1        0        0        0        0        0        0        0        0        0        0        0        0        0        0        0
## 2  0  0       1993          12        21        0        1        0        0        0        0        0        0        0        0        0        0        0        0        0        0        0
## 3  0  1       1988           4        16        0        0        0        0        0        0        0        0        0        0        0        0        0        0        0        0        0
## 4  0  0       1992           1         7        0        1        0        0        0        0        0        0        0        0        0        0        0        0        0        0        0
## 5  0  1       1994           3        29        0        1        0        0        0        0        0        0        0        0        0        0        0        0        0        0        0
## 6  0  0       1979           5        23        0        1        0        0        0        0        0        0        0        0        0        0        0        0        0        0        0
##   born_USA born_SVN nation_AUT nation_CAN nation_CHE nation_CZE nation_DEU nation_DNK nation_FIN nation_FRA nation_GBR nation_HRV nation_LVA nation_NOR nation_RUS nation_SVK nation_SWE nation_USA
## 1        0        0          0          1          0          0          0          0          0          0          0          0          0          0          0          0          0          0
## 2        0        0          0          1          0          0          0          0          0          0          0          0          0          0          0          0          0          0
## 3        1        0          0          0          0          0          0          0          0          0          0          0          0          0          0          0          0          1
## 4        0        0          0          1          0          0          0          0          0          0          0          0          0          0          0          0          0          0
## 5        0        0          0          1          0          0          0          0          0          0          0          0          0          0          0          0          0          0
## 6        0        0          0          1          0          0          0          0          0          0          0          0          0          0          0          0          0          0
##   nation_SVN
## 1          0
## 2          0
## 3          0
## 4          0
## 5          0
## 6          0

Numerical columns imputation

# Record which rows have a missing DftRd before the loop below fills the
# missing values in.
all_data$undrafted <- is.na(all_data$DftRd)

# Median imputation: every column that contains at least one NA gets its
# missing entries replaced by the median of that column's observed values,
# computed over the whole of all_data.
all_missing_list <- colnames(all_data)[colSums(is.na(all_data)) > 0]

length(all_missing_list) == 0  # Flag to check NA values
## [1] FALSE
# if above true all values are imputed!

# Iterating over the names themselves (rather than 1:length(...)) means the
# loop body is simply skipped when no column has missing values.
for (column in all_missing_list) {
    # get the global median, ignoring the NA entries
    median_all <- median(all_data[[column]], na.rm = TRUE)
    print(median_all)
    # impute the missing values with the column's median
    missing_rows <- is.na(all_data[[column]])
    all_data[[column]][missing_rows] <- median_all
}
## [1] 2010
## [1] 2
## [1] 47
## [1] 866.5
## [1] 26.9
## [1] 54.5
## [1] 8.1
## [1] 0.917
## [1] 999
## [1] 2.3
## [1] 2.43
## [1] 0
## [1] 137
## [1] 105
## [1] 74
## [1] 5.3
## [1] 12
## [1] 4
## [1] 6
## [1] 10
## [1] 29.2
## [1] 109.4
## [1] 49
## [1] 59
## [1] -1
## [1] 19
## [1] 15
## [1] 27
## [1] 4.3
## [1] 2
## [1] 2
## [1] 0.9
## [1] 9
## [1] 6
## [1] 749
## [1] 790
## [1] 552
## [1] 591
## [1] 397
## [1] 425
## [1] 33.4
## [1] 36.1
## [1] 111
## [1] 117
## [1] 30
## [1] 35
## [1] 25
## [1] 27
## [1] 35
## [1] 36
## [1] 387
## [1] 397
## [1] 318
## [1] 319
## [1] 116
## [1] 85
## [1] 51
## [1] 45
## [1] 2436.61
## [1] 109.2
## [1] 15.7
## [1] 0.31
# Re-list the columns that still contain NA after imputation. The `> 0`
# turns the per-column NA counts into a logical mask; without it the counts
# themselves are used as positional indices and select unrelated columns.
all_missing_list <- colnames(all_data)[colSums(is.na(all_data)) > 0]

length(all_missing_list) == 0  # Flag to check NA values
## [1] TRUE

EDA (Exploratory Data Analysis)

How many players from each country?

# Horizontal bar chart of the number of players per nationality (Nat),
# with the counts sorted in decreasing order.
barplot(
    sort(table(all_data$Nat), decreasing = TRUE),
    main = "Number of NHL players from each country",
    xlab = "count",
    ylab = "Country",
    col = c("red", "blue4", "blue", "red3", "skyblue"),
    horiz = TRUE,
    las = 1
)

Age Breakdown

# Number of players in all_data for each distinct birth year.
table(all_data$birth_year)
## 
## 1972 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 
##    1    2    2    6   11   12    9   21   22   27   29   43   44   59   63   59   77   71   77   74   51   50   34   15   11    4

Age Histogram

# Histogram of birth years for every player in all_data.
hist(
    all_data$birth_year,
    main = "Distribution of NHL players by birth year (2016/2017 season)\nA.K.A. Jaromir Jagr the ageless one",
    xlab = "Year of birth",
    col = "skyblue",
    breaks = 28
)

Salary Distribution

# Quartiles, mean and range of the salaries in the training data.
summary(train.df[["Salary"]])
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##   575000   742500   925000  2264509  3500000 13800000
# Histogram of the same salary column.
hist(
    train.df[["Salary"]],
    main = "NHL Salary Distribution: 2016/2017",
    xlab = "Salary",
    ylab = "Number of players",
    col = "salmon",
    breaks = 52
)

Compare Salary with On-Ice Performance (Number of Goals)

# Scatter plot of salary against goals scored, with the least-squares
# line of Salary on G drawn on top.
plot(
    train.df$G, train.df$Salary,
    xlab = "No. Goals", ylab = "Money Earned",
    pch = 20
)

abline(lm(Salary ~ G, data = train.df), col = "blue")

# Same scatter plot and fitted line, this time with every point labelled
# by the player's last name so that the outliers can be identified.
plot(
    train.df$G, train.df$Salary,
    main = "Who are the outliers?",
    xlab = "goals scored", ylab = "money earned",
    pch = 20
)
abline(lm(Salary ~ G, data = train.df), col = "red")
text(
    train.df$G, train.df$Salary,
    labels = train.df$Last.Name,
    pos = 3, cex = 0.7
)

Final train and test dataset

# train.df2$Salary <- train.df$Salary train.df3 <-
# cbind(train.df[,1], all_data[1:612,])

# Split the feature-engineered all_data back into its training and test
# parts. The row counts are derived from the data instead of being
# hard-coded (612 training rows in the run shown in this document).
# NOTE(review): this relies on all_data holding the train.df rows first,
# followed by the test rows -- confirm against the code that builds all_data.
n_train <- nrow(train.df)
train_rows <- seq_len(n_train)

train.df2 <- all_data[train_rows, ]

# Training features plus the target column taken from the raw training data.
train.final <- all_data[train_rows, ]
train.final$Salary <- train.df$Salary

# Everything after the training rows is the test set.
test.final <- all_data[-train_rows, ]

Effects of goals and age on salary

Before plotting the 3D plot, let us prepare a new dataset from all_data so that we have a clean training data set.

Correlation

# Scatterplot matrix of a handful of columns from the training rows.
pairs(
    ~ Born + Ht + Wt + DftRd + Position + Team + GP + G,
    main = "Simple Scatterplot Matrix",
    data = train.df2
)

# Column-by-column structure of the final training set.
str(train.final)
## 'data.frame':    612 obs. of  228 variables:
##  $ Born       : Factor w/ 806 levels "72-02-15","77-03-18",..: 555 435 169 326 458 8 273 423 357 410 ...
##  $ City       : Factor w/ 490 levels "Aalborg","Albany",..: 269 225 305 225 324 312 310 324 277 235 ...
##  $ Pr.St      : Factor w/ 39 levels "INT","AB","AK",..: 32 29 17 29 29 29 1 29 29 29 ...
##  $ Cntry      : Factor w/ 19 levels "AUT","CAN","CHE",..: 2 2 18 2 2 2 17 2 2 2 ...
##  $ Nat        : Factor w/ 17 levels "AUT","CAN","CHE",..: 2 2 16 2 2 2 15 2 2 2 ...
##  $ Ht         : int  74 74 72 77 76 70 71 70 72 68 ...
##  $ Wt         : int  190 207 218 220 217 192 185 183 214 178 ...
##  $ DftYr      : int  2015 2012 2006 2010 2012 1997 2009 2010 2010 2011 ...
##  $ DftRd      : int  1 1 1 1 1 6 2 2 2 7 ...
##  $ Ovrl       : int  18 15 7 3 16 156 53 47 42 201 ...
##  $ Hand       : Factor w/ 2 levels "L","R": 1 2 2 2 2 1 1 2 2 1 ...
##  $ Last.Name  : Factor w/ 801 levels "Abdelkader","Aberg",..: 78 76 373 192 562 67 434 435 483 390 ...
##  $ First.Name : Factor w/ 385 levels "A.J.","Aaron",..: 276 55 163 95 278 39 18 96 79 187 ...
##  $ Position   : Factor w/ 20 levels "C","C/D","C/LW",..: 7 7 14 7 14 7 14 10 16 1 ...
##  $ Team       : Factor w/ 78 levels "ANA","ANA/FLA",..: 52 52 12 66 68 22 66 12 45 63 ...
##  $ GP         : int  1 79 65 30 82 80 3 30 53 10 ...
##  $ G          : int  0 2 19 1 7 5 0 4 4 1 ...
##  $ A          : int  0 15 26 5 12 12 1 2 5 1 ...
##  $ A1         : int  0 6 13 5 4 6 0 2 2 1 ...
##  $ A2         : int  0 9 13 0 8 6 1 0 3 0 ...
##  $ PTS        : int  0 17 45 6 19 17 1 6 9 2 ...
##  $ X...       : int  -2 -11 -7 -14 9 12 1 -7 -19 -3 ...
##  $ E...       : num  0 -10.4 -1.4 -5.3 4.1 0.7 0.3 -4.1 -7.9 -2.6 ...
##  $ PIM        : int  0 20 24 18 133 24 0 4 12 2 ...
##  $ Shifts     : int  13 2418 1443 765 1453 1896 40 506 975 159 ...
##  $ TOI        : int  429 109992 73983 36603 63592 88462 1604 23265 42950 6867 ...
##  $ TOIX       : num  7.2 1826.2 1229.2 607.9 1059.7 ...
##  $ TOI.GP     : num  7.15 23.2 18.97 20.33 12.93 ...
##  $ TOI.GP.1   : num  7.16 23.17 18.95 20.31 12.93 ...
##  $ TOI.       : num  15.2 39 33.1 36.1 23.5 32.4 19.1 23.5 24.2 23.4 ...
##  $ IPP.       : num  0 30.4 63.4 37.5 61.3 23.9 100 75 81.8 100 ...
##  $ SH.        : num  0 7.4 9.7 6.2 7.8 9.8 14.3 4.7 4 4.4 ...
##  $ SV.        : num  0.75 0.915 0.934 0.897 0.917 0.936 1 0.925 0.924 0.907 ...
##  $ PDO        : num  750 989 1031 959 995 ...
##  $ F.60       : num  0 1.84 3.47 1.58 1.76 2.89 2.24 1.24 0.92 1.05 ...
##  $ A.60       : num  16.74 2.79 1.95 3.45 2.32 ...
##  $ Pct.       : num  0 39.7 64 31.4 43.1 60.2 100 32 27.5 28.6 ...
##  $ Diff       : int  -2 -29 31 -19 -10 24 1 -9 -18 -3 ...
##  $ Diff.60    : num  -16.74 -0.95 1.51 -1.88 -0.57 ...
##  $ iCF        : int  2 287 283 88 166 171 5 94 108 24 ...
##  $ iCF.1      : int  2 287 283 88 166 171 5 94 109 24 ...
##  $ iFF        : int  2 197 212 55 118 110 3 74 97 17 ...
##  $ iSF        : int  1 143 155 40 95 75 2 51 74 9 ...
##  $ iSF.1      : int  1 143 156 40 95 74 2 51 76 9 ...
##  $ iSF.2      : int  1 143 156 40 95 75 2 51 76 9 ...
##  $ ixG        : num  0 6.1 17.4 1.4 9.3 4.5 0.1 2.8 7.8 0.9 ...
##  $ iSCF       : int  0 7 64 2 35 7 0 9 31 3 ...
##  $ iRB        : int  0 7 16 1 8 2 0 1 4 0 ...
##  $ iRS        : int  0 9 20 4 10 3 0 6 1 1 ...
##  $ iDS        : num  0 16 36 5 18 5 0 7 5 1 ...
##  $ sDist      : num  43 52.4 28.4 55.1 30.9 46.1 33.5 37.7 26.6 24.9 ...
##  $ sDist.1    : num  49.3 46.3 26.3 51 26.4 41.9 38.2 37.6 25.4 28.4 ...
##  $ Pass       : num  0 138.1 196.8 153 96.3 ...
##  $ iHF        : int  1 111 53 66 239 43 0 12 94 9 ...
##  $ iHF.1      : int  1 111 53 66 239 43 0 12 94 9 ...
##  $ iHA        : int  0 154 68 66 134 157 2 20 53 16 ...
##  $ iHDf       : num  1 -43 -15 0 105 -114 -2 -8 41 -7 ...
##  $ iMiss      : int  1 54 57 15 23 35 1 23 24 8 ...
##  $ iGVA       : int  1 74 36 23 21 59 1 9 14 2 ...
##  $ iTKA       : int  0 22 26 4 36 11 2 12 17 0 ...
##  $ iBLK       : int  0 159 25 44 44 83 1 12 32 2 ...
##  $ iGVA.1     : int  1 74 36 23 21 59 1 9 14 2 ...
##  $ iTKA.1     : int  0 22 26 4 36 11 2 12 17 0 ...
##  $ iBLK.1     : int  0 159 25 44 44 83 1 12 32 2 ...
##  $ BLK.       : num  0 8 2.4 7.3 4.4 6.5 4.3 3.2 4.4 2.2 ...
##  $ iFOW       : int  0 1 54 0 3 0 0 104 29 32 ...
##  $ iFOL       : int  0 0 45 0 7 0 0 146 42 40 ...
##  $ iFOW.1     : int  0 1 54 0 3 0 0 104 29 32 ...
##  $ iFOL.1     : int  0 0 45 0 7 0 0 146 42 40 ...
##  $ FO.        : num  0 100 54.5 0 30 0 0 41.6 40.8 44.4 ...
##  $ X.FOT      : num  0 0.1 7.4 0 1 0 0 79.4 11.5 74.2 ...
##  $ dzFOW      : int  0 1 9 0 1 0 0 27 7 8 ...
##  $ dzFOL      : int  0 0 6 0 1 0 0 37 9 10 ...
##  $ nzFOW      : int  0 0 10 0 0 0 0 37 9 13 ...
##  $ nzFOL      : int  0 0 11 0 2 0 0 53 12 17 ...
##  $ ozFOW      : int  0 0 35 0 2 0 0 40 13 11 ...
##  $ ozFOL      : int  0 0 28 0 4 0 0 56 21 13 ...
##  $ FOW.Up     : int  0 0 13 0 3 0 0 36 10 16 ...
##  $ FOL.Up     : int  0 0 10 0 1 0 0 46 18 14 ...
##  $ FOW.Down   : int  0 0 21 0 0 0 0 39 10 7 ...
##  $ FOL.Down   : int  0 0 16 0 3 0 0 56 11 10 ...
##  $ FOW.Close  : int  0 1 37 0 2 0 0 59 17 18 ...
##  $ FOL.Close  : int  0 0 33 0 6 0 0 90 26 23 ...
##  $ OTG        : int  0 0 1 0 0 0 0 0 0 0 ...
##  $ X1G        : int  0 0 5 0 2 1 0 1 2 0 ...
##  $ GWG        : int  0 0 2 1 0 1 0 0 0 0 ...
##  $ ENG        : int  0 0 0 0 1 0 0 0 0 0 ...
##  $ PSG        : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ PSA        : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ G.Bkhd     : int  0 0 5 0 0 0 0 0 2 0 ...
##  $ G.Dflct    : int  0 0 2 0 0 0 0 0 0 0 ...
##  $ G.Slap     : int  0 1 0 0 0 2 0 0 0 0 ...
##  $ G.Snap     : int  0 0 3 0 0 0 0 1 0 0 ...
##  $ G.Tip      : int  0 0 0 0 1 0 0 0 2 0 ...
##  $ G.Wrap     : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ G.Wrst     : int  0 1 9 1 6 2 0 3 0 1 ...
##  $ CBar       : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Post       : int  0 1 2 0 2 1 0 0 0 0 ...
##  $ Over       : int  0 2 4 0 1 6 0 2 3 0 ...
##   [list output truncated]
# Column-by-column structure of the final test set.
str(test.final)
## 'data.frame':    262 obs. of  227 variables:
##  $ Born       : Factor w/ 806 levels "72-02-15","77-03-18",..: 616 578 346 799 736 606 712 144 199 781 ...
##  $ City       : Factor w/ 490 levels "Aalborg","Albany",..: 421 250 303 460 450 23 428 324 437 457 ...
##  $ Pr.St      : Factor w/ 39 levels "INT","AB","AK",..: 26 1 18 1 26 17 1 29 2 1 ...
##  $ Cntry      : Factor w/ 19 levels "AUT","CAN","CHE",..: 18 4 18 8 18 18 15 2 2 17 ...
##  $ Nat        : Factor w/ 17 levels "AUT","CAN","CHE",..: 16 4 16 7 16 16 13 2 2 15 ...
##  $ Ht         : int  72 72 75 72 72 74 73 75 72 73 ...
##  $ Wt         : int  216 195 227 182 196 210 180 210 192 205 ...
##  $ DftYr      : int  2003 2014 2007 2013 2011 2002 2010 2001 2003 2012 ...
##  $ DftRd      : int  1 1 6 2 2 4 1 1 3 1 ...
##  $ Ovrl       : int  13 13 161 55 36 129 8 2 74 11 ...
##  $ Hand       : Factor w/ 2 levels "L","R": 2 1 1 1 2 2 1 2 1 2 ...
##  $ Last.Name  : Factor w/ 801 levels "Abdelkader","Aberg",..: 59 792 716 707 605 650 594 777 713 642 ...
##  $ First.Name : Factor w/ 385 levels "A.J.","Aaron",..: 331 124 227 314 3 278 8 129 325 334 ...
##  $ Position   : Factor w/ 20 levels "C","C/D","C/LW",..: 17 9 9 9 7 7 5 1 9 11 ...
##  $ Team       : Factor w/ 78 levels "ANA","ANA/FLA",..: 40 68 34 43 51 40 70 28 52 48 ...
##  $ GP         : int  80 21 81 73 31 18 49 68 4 82 ...
##  $ G          : int  14 3 27 18 2 1 5 15 0 31 ...
##  $ A          : int  22 3 15 10 9 4 11 35 0 27 ...
##  $ A1         : int  9 2 9 3 2 4 8 15 0 17 ...
##  $ A2         : int  13 1 6 7 7 0 3 20 0 10 ...
##  $ PTS        : int  36 6 42 28 11 5 16 50 0 58 ...
##  $ X...       : int  -4 2 13 -1 3 -4 -7 -18 -1 -4 ...
##  $ E...       : num  8.2 0.4 15 3 3.6 0.8 -10.6 -3.7 0.4 2.7 ...
##  $ PIM        : int  22 2 95 8 17 6 12 29 0 32 ...
##  $ Shifts     : int  1729 291 1715 1488 658 393 834 1530 69 2080 ...
##  $ TOI        : int  76801 13997 81345 60702 29406 16693 39266 65977 3237 91109 ...
##  $ TOIX       : num  1278 233 1352 1010 490 ...
##  $ TOI.GP     : num  16 11.1 16.7 13.9 15.8 ...
##  $ TOI.GP.1   : num  16 11.1 16.7 13.8 15.8 ...
##  $ TOI.       : num  27.2 22 30 24.9 28.7 26.7 24.1 27.9 23.5 30.9 ...
##  $ IPP.       : num  65.5 66.7 54.5 63.6 47.8 83.3 66.7 66.7 0 69.9 ...
##  $ SH.        : num  8.5 6.7 9.5 8.5 9.3 4.6 9.3 11.6 0 9.6 ...
##  $ SV.        : num  0.898 0.969 0.919 0.922 0.909 0.917 0.919 0.871 0.95 0.905 ...
##  $ PDO        : num  982 1037 1014 1007 1002 ...
##  $ F.60       : num  2.58 2.32 3.42 2.61 2.82 1.31 2.21 4.1 0 3.3 ...
##  $ A.60       : num  2.77 0.77 2.09 2.26 2.08 1.96 2.58 3.55 1.11 2.75 ...
##  $ Pct.       : num  48.2 75 62.1 53.7 57.5 40 46.2 53.6 0 54.6 ...
##  $ Diff       : int  -4 6 30 6 6 -3 -4 10 -1 14 ...
##  $ Diff.60    : num  -0.19 1.54 1.33 0.36 0.73 -0.65 -0.37 0.55 -1.11 0.56 ...
##  $ iCF        : int  326 56 300 279 89 30 89 285 13 467 ...
##  $ iCF.1      : int  326 56 300 279 89 30 89 285 13 467 ...
##  $ iFF        : int  251 49 243 208 58 21 64 220 9 362 ...
##  $ iSF        : int  175 32 178 158 39 12 45 149 5 235 ...
##  $ iSF.1      : int  175 32 178 158 38 12 45 149 5 234 ...
##  $ iSF.2      : int  175 32 178 158 38 12 45 149 5 234 ...
##  $ ixG        : num  19.7 5.2 25.1 17 1.7 0.6 4.9 14.9 0.4 25.4 ...
##  $ iSCF       : int  73 23 109 58 1 2 17 51 0 76 ...
##  $ iRB        : int  19 3 23 13 0 0 8 9 0 18 ...
##  $ iRS        : int  19 2 17 20 2 1 7 12 0 30 ...
##  $ iDS        : num  38 5 40 33 2 1 15 21 0 48 ...
##  $ sDist      : num  28.2 21 21.8 28.1 59.1 47.9 35.3 29.7 34.8 31.3 ...
##  $ sDist.1    : num  27 23.2 21.7 26.1 41.7 50.7 31.3 29.7 30.1 28 ...
##  $ Pass       : num  198.5 57.7 154.6 81.4 24.2 ...
##  $ iHF        : int  190 6 189 72 22 10 60 13 4 97 ...
##  $ iHF.1      : int  190 6 189 72 22 10 60 13 4 97 ...
##  $ iHA        : int  151 11 109 113 66 34 38 43 3 88 ...
##  $ iHDf       : num  39 -5 80 -41 -44 -24 22 -30 1 9 ...
##  $ iMiss      : int  76 17 65 50 19 9 19 71 4 127 ...
##  $ iGVA       : int  27 9 49 20 20 7 14 49 1 62 ...
##  $ iTKA       : int  25 4 33 25 7 1 12 32 1 76 ...
##  $ iBLK       : int  31 7 23 25 27 28 19 31 1 26 ...
##  $ iGVA.1     : int  27 9 49 20 20 7 14 49 1 62 ...
##  $ iTKA.1     : int  25 4 33 25 7 1 12 32 1 76 ...
##  $ iBLK.1     : int  31 7 23 25 27 28 19 31 1 26 ...
##  $ BLK.       : num  2.8 3.9 2.1 2.9 7.6 12.7 3 3.2 2.7 2 ...
##  $ iFOW       : int  2 0 6 2 0 0 178 354 0 5 ...
##  $ iFOL       : int  3 3 9 2 0 0 259 324 1 13 ...
##  $ iFOW.1     : int  2 0 6 2 0 0 178 354 0 5 ...
##  $ iFOL.1     : int  3 3 9 2 0 0 259 324 1 13 ...
##  $ FO.        : num  40 0 40 50 0 0 40.7 52.2 0 27.8 ...
##  $ X.FOT      : num  0.4 1.7 1.2 0.4 0 0 78.9 59.1 2.3 1.1 ...
##  $ dzFOW      : int  2 0 0 0 0 0 52 81 0 1 ...
##  $ dzFOL      : int  2 0 0 1 0 0 99 105 0 1 ...
##  $ nzFOW      : int  0 0 0 1 0 0 71 101 0 0 ...
##  $ nzFOL      : int  0 0 1 0 0 0 96 89 0 2 ...
##  $ ozFOW      : int  0 0 6 1 0 0 55 172 0 4 ...
##  $ ozFOL      : int  1 3 8 1 0 0 64 130 1 10 ...
##  $ FOW.Up     : int  2 0 4 1 0 0 52 106 0 1 ...
##  $ FOL.Up     : int  2 2 3 1 0 0 75 101 0 3 ...
##  $ FOW.Down   : int  0 0 1 0 0 0 62 125 0 0 ...
##  $ FOL.Down   : int  1 0 2 0 0 0 111 105 1 7 ...
##  $ FOW.Close  : int  0 0 3 1 0 0 100 213 0 5 ...
##  $ FOL.Close  : int  2 1 6 2 0 0 135 211 0 5 ...
##  $ OTG        : int  0 0 0 0 0 0 0 0 0 1 ...
##  $ X1G        : int  2 2 8 2 0 0 1 2 0 3 ...
##  $ GWG        : int  1 2 5 3 0 0 0 4 0 9 ...
##  $ ENG        : int  0 0 0 0 0 0 0 0 0 1 ...
##  $ PSG        : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ PSA        : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ G.Bkhd     : int  1 0 2 3 0 0 1 2 0 2 ...
##  $ G.Dflct    : int  0 0 1 1 0 0 0 0 0 0 ...
##  $ G.Slap     : int  0 0 0 4 1 0 0 1 0 3 ...
##  $ G.Snap     : int  4 1 1 2 1 1 0 5 0 8 ...
##  $ G.Tip      : int  2 2 2 0 0 0 0 0 0 1 ...
##  $ G.Wrap     : int  0 0 1 1 0 0 0 0 0 0 ...
##  $ G.Wrst     : int  7 0 20 7 0 0 4 7 0 17 ...
##  $ CBar       : int  0 0 0 0 0 0 0 2 0 1 ...
##  $ Post       : int  2 1 5 2 0 1 0 6 0 6 ...
##  $ Over       : int  3 2 3 3 0 0 4 13 2 23 ...
##   [list output truncated]
# List every column of the training set together with its position.
colnames(train.final)
##   [1] "Born"        "City"        "Pr.St"       "Cntry"       "Nat"         "Ht"          "Wt"          "DftYr"       "DftRd"       "Ovrl"        "Hand"        "Last.Name"   "First.Name" 
##  [14] "Position"    "Team"        "GP"          "G"           "A"           "A1"          "A2"          "PTS"         "X..."        "E..."        "PIM"         "Shifts"      "TOI"        
##  [27] "TOIX"        "TOI.GP"      "TOI.GP.1"    "TOI."        "IPP."        "SH."         "SV."         "PDO"         "F.60"        "A.60"        "Pct."        "Diff"        "Diff.60"    
##  [40] "iCF"         "iCF.1"       "iFF"         "iSF"         "iSF.1"       "iSF.2"       "ixG"         "iSCF"        "iRB"         "iRS"         "iDS"         "sDist"       "sDist.1"    
##  [53] "Pass"        "iHF"         "iHF.1"       "iHA"         "iHDf"        "iMiss"       "iGVA"        "iTKA"        "iBLK"        "iGVA.1"      "iTKA.1"      "iBLK.1"      "BLK."       
##  [66] "iFOW"        "iFOL"        "iFOW.1"      "iFOL.1"      "FO."         "X.FOT"       "dzFOW"       "dzFOL"       "nzFOW"       "nzFOL"       "ozFOW"       "ozFOL"       "FOW.Up"     
##  [79] "FOL.Up"      "FOW.Down"    "FOL.Down"    "FOW.Close"   "FOL.Close"   "OTG"         "X1G"         "GWG"         "ENG"         "PSG"         "PSA"         "G.Bkhd"      "G.Dflct"    
##  [92] "G.Slap"      "G.Snap"      "G.Tip"       "G.Wrap"      "G.Wrst"      "CBar"        "Post"        "Over"        "Wide"        "S.Bkhd"      "S.Dflct"     "S.Slap"      "S.Snap"     
## [105] "S.Tip"       "S.Wrap"      "S.Wrst"      "iPenT"       "iPenD"       "iPENT"       "iPEND"       "iPenDf"      "NPD"         "Min"         "Maj"         "Match"       "Misc"       
## [118] "Game"        "CF"          "CA"          "FF"          "FA"          "SF"          "SA"          "xGF"         "xGA"         "SCF"         "SCA"         "GF"          "GA"         
## [131] "RBF"         "RBA"         "RSF"         "RSA"         "DSF"         "DSA"         "FOW"         "FOL"         "HF"          "HA"          "GVA"         "TKA"         "PENT"       
## [144] "PEND"        "OPS"         "DPS"         "PS"          "OTOI"        "Grit"        "DAP"         "Pace"        "GS"          "GS.G"        "ANA"         "FLA"         "N.J"        
## [157] "VAN"         "ARI"         "CGY"         "MIN"         "NYR"         "TOR"         "BOS"         "WPG"         "BUF"         "CAR"         "OTT"         "PIT"         "STL"        
## [170] "CBJ"         "DAL"         "CHI"         "COL"         "MTL"         "NSH"         "S.J"         "DET"         "EDM"         "L.A"         "T.B"         "NYI"         "PHI"        
## [183] "WSH"         "C"           "D"           "LW"          "RW"          "birth_year"  "birth_month" "birth_day"   "born_AUT"    "born_CAN"    "born_CHE"    "born_CZE"    "born_DEU"   
## [196] "born_DNK"    "born_EST"    "born_FIN"    "born_FRA"    "born_GBR"    "born_HRV"    "born_ITA"    "born_LVA"    "born_NOR"    "born_RUS"    "born_SVK"    "born_SWE"    "born_USA"   
## [209] "born_SVN"    "nation_AUT"  "nation_CAN"  "nation_CHE"  "nation_CZE"  "nation_DEU"  "nation_DNK"  "nation_FIN"  "nation_FRA"  "nation_GBR"  "nation_HRV"  "nation_LVA"  "nation_NOR" 
## [222] "nation_RUS"  "nation_SVK"  "nation_SWE"  "nation_USA"  "nation_SVN"  "undrafted"   "Salary"
# Correlation between salary and birth year in the training set.
with(train.final, cor(Salary, birth_year))
## [1] -0.425583

Removing Outliers

First, determine the interquartile range (IQR) of the feature from which you want to remove outliers. Here we treat the outliers in the birth year by replacing each outlying point with the boundary value defined below.

First, draw a boxplot to check for outliers.

# Boxplot of birth year in the training set, drawn before the outliers
# are replaced.
# NOTE(review): the first argument is already a plain vector, so `data =`
# looks redundant here -- confirm it has no effect before removing it.
boxplot(train.final$birth_year, data = train.final)

Note the couple of points at the lower end of the boxplot. Such outliers can either be removed or replaced with a suitable value; here we replace them with a lower boundary computed from the quartiles, where Q1 is the value of the 1st quartile and Q3 is the value of the 3rd quartile:

new_value_birth_year = Q1 - 1.5×IQR

If the outliers were at the upper end instead, we would use new_value_birth_year = Q3 + 1.5×IQR.

Let us find the values of IQR, Q1, Q3, and new_value_birth_year

summary(train.final$birth_year)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1972    1987    1991    1990    1993    1998
# Take the quartiles from the data instead of copying them by hand from the
# printed summary (1987 and 1993 in the run shown above), so the fence stays
# correct if the data changes. quantile() uses the same default method as
# summary().
birth_year_quartiles <- quantile(train.final$birth_year,
    probs = c(0.25, 0.75), names = FALSE, na.rm = TRUE)
Q1 <- birth_year_quartiles[1]
Q3 <- birth_year_quartiles[2]
# NOTE: this variable shares its name with the stats::IQR() function; the
# name is kept so that any later code reading `IQR` keeps working.
IQR <- Q3 - Q1

# Lower fence: birth years below this value are treated as outliers.
new_value_birth_year <- Q1 - (1.5 * IQR)
new_value_birth_year
## [1] 1978

Now replace the outliers with new_value

# Raise every birth year below the lower fence up to the fence value in a
# single vectorised assignment. which() drops NA comparisons, so rows with
# a missing birth_year are left untouched instead of stopping the script.
is_low_outlier <- train.final$birth_year < new_value_birth_year
train.final$birth_year[which(is_low_outlier)] <- new_value_birth_year

Plot the boxplot again to check whether any outliers remain. Ideally the outliers should be gone.

# Same boxplot as before, redrawn after the low birth years were replaced.
# NOTE(review): `data =` looks redundant with a vector first argument --
# confirm it has no effect before removing it.
boxplot(train.final$birth_year, data = train.final)

Linear Regression (1st Approach)

STEPS to model and predict dependent Variable

*3.1 Split the training data into train and test data for validations

*3.2 Run the linear regression model.

*3.3 Predict using the new model.

*3.4 Check accuracy.

Train and test data

# Drop the columns previously removed by position (1, 2, 12, 13, 14, 15).
# Selecting them by name keeps the step correct if the column order changes
# and makes it safe to re-run.
drop_cols <- c("Born", "City", "Last.Name", "First.Name", "Position", "Team")
train.final <- train.final[, !(colnames(train.final) %in% drop_cols),
    drop = FALSE]
test.final <- test.final[, !(colnames(test.final) %in% drop_cols),
    drop = FALSE]

# The previous version then ran `test.final <- train.final[613:874, ]`.
# train.final only has the training rows, so that replaced the real test
# set with rows made up entirely of NA. test.final is now left as created
# from all_data; it has one column fewer than train.final (no Salary).
dim(train.final)
## [1] 612 222
dim(test.final)
## [1] 262 221

Normalization

  • Data normalization can be achieved using the Z-transform. Please note that there are many ways to normalize numerical data; the Z-transform is only one of them.

Normalized value of Height

# Normalize Height (Ht)

mean_ht <- mean(train.final$Ht)  # Store mean value of the column
std_ht <- sd(train.final$Ht)  # Store the standard deviation of the column
mean_ht
## [1] 72.98366
std_ht
## [1] 2.08016
# z-score of the whole column in one vectorised step, reusing the mean and
# standard deviation computed above instead of recomputing both per row.
train.final$Ht_n <- (train.final$Ht - mean_ht) / std_ht
head(train.final$Ht_n)
## [1]  0.4885874  0.4885874 -0.4728772  1.9307843  1.4500520 -1.4343418
# mean(train.final$Ht_n)

Normalized value of Weight

# Normalize Weight (Wt)

mean_wt <- mean(train.final$Wt)  # Store mean value of the column
std_wt <- sd(train.final$Wt)  # Store the standard deviation of the column
mean_wt
## [1] 200.7451
std_wt
## [1] 14.95242
# z-score of the whole column in one vectorised step, reusing the mean and
# standard deviation computed above instead of recomputing both per row.
train.final$Wt_n <- (train.final$Wt - mean_wt) / std_wt
head(train.final$Wt_n)
## [1] -0.7186193  0.4183204  1.1539873  1.2877449  1.0871085 -0.5848617
# mean(train.final$Wt_n)

Normalized value of birth_year

# Normalize birth year (birth_year)
mean_birth_year <- mean(train.final$birth_year)  # Store mean value of the column
std_birth_year <- sd(train.final$birth_year)  # Store the standard deviation of the column
mean_birth_year
## [1] 1990.052
std_birth_year
## [1] 4.430649
# z-score of the whole column in one vectorised step, reusing the mean and
# standard deviation computed above instead of recomputing both per row.
train.final$birth_year_n <- (train.final$birth_year - mean_birth_year) /
    std_birth_year
head(train.final$birth_year_n)
## [1]  1.5681027  0.6653004 -0.4632025  0.4395998  0.8910010 -2.4945077
# mean(train.final$birth_year_n)

Model

Based on the initial EDA, we have identified 5 independent variables for this linear regression model, namely Height, Weight, Ovrl, Goals (G), and Birth Year. Height, Weight, and Birth Year are passed to the linear model as normalized values; Ovrl and G are passed on their original scale.

# Fit a linear model of Salary on five predictors using train.final.
# Ht_n, Wt_n and birth_year_n are the z-scored columns; Ovrl and G are
# used on their original scale.
lm_model <- lm(
    formula = Salary ~ Ht_n + Wt_n + Ovrl + G + birth_year_n,
    data = train.final
)
# Show the call and the fitted coefficients.
print(lm_model)
## 
## Call:
## lm(formula = Salary ~ Ht_n + Wt_n + Ovrl + G + birth_year_n, 
##     data = train.final)
## 
## Coefficients:
##  (Intercept)          Ht_n          Wt_n          Ovrl             G  birth_year_n  
##      1715122         33198        225698         -6197        121757       -878801
# Show residual quantiles, per-coefficient tests and overall fit statistics.
print(summary(lm_model))
## 
## Call:
## lm(formula = Salary ~ Ht_n + Wt_n + Ovrl + G + birth_year_n, 
##     data = train.final)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -4538603  -980546  -155020   810226  9119438 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   1715122     125844  13.629  < 2e-16 ***
## Ht_n            33198      98116   0.338   0.7352    
## Wt_n           225698      98702   2.287   0.0226 *  
## Ovrl            -6197       1205  -5.141 3.69e-07 ***
## G              121757       7881  15.450  < 2e-16 ***
## birth_year_n  -878801      70309 -12.499  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1636000 on 606 degrees of freedom
## Multiple R-squared:  0.4689, Adjusted R-squared:  0.4646 
## F-statistic:   107 on 5 and 606 DF,  p-value: < 2.2e-16

Predict

# Predict Salary for the rows of train.final. This is the same data frame
# the model was fitted on, so these are in-sample predictions.
predicted_salary <- predict(object = lm_model, newdata = train.final)
# predicted_salary

Check Accuracy

# Put the observed salaries next to the model predictions, one row per player.
actual_preds <- data.frame(
    cbind(
        actuals = train.final$Salary,
        predicteds = predicted_salary
    )
)
# Correlation matrix of observed vs. predicted salary; the off-diagonal
# entry is the figure used here as the accuracy measure.
corrleation_accuracy <- cor(actual_preds)
print(corrleation_accuracy)
##              actuals predicteds
## actuals    1.0000000  0.6847967
## predicteds 0.6847967  1.0000000
# First rows of the observed/predicted pairs.
print(head(actual_preds))
##   actuals predicteds
## 1  925000    79559.3
## 2 2250000  1391652.5
## 3 8000000  4636946.7
## 4 3500000  1786708.4
## 5 1750000  1978758.3
## 6 1500000  3369764.9

Linear Regression (2nd Approach)

Train and test data (80:20 ratio)

# train.final <- train.final[,-c(1,2,12,13,14,15)] test.final
# <- test.final[,-c(1,2,12,13,14,15)]

data <- train.final  # Store the train data before splitting

# Fix the random seed so the 80:20 split, and every number derived from it,
# is the same each time the document is run.
set.seed(2018)

# Draw 20% of the row numbers (rounded down) without replacement for the
# test partition.
split_indexes <- sample(seq_len(nrow(data)), size = floor(0.2 * nrow(data)))

test <- data[split_indexes, ]
# Select the remaining rows explicitly. Negative indexing with an empty
# index vector (data[-integer(0), ]) would return zero rows instead of all.
train <- data[setdiff(seq_len(nrow(data)), split_indexes), ]

dim(train)
## [1] 490 225
dim(test)
## [1] 122 225

Model

# Refit the same five-predictor linear model of Salary, this time on the
# `train` partition only. Ht_n, Wt_n and birth_year_n are the z-scored
# columns; Ovrl and G are used on their original scale.
lm_model <- lm(
    formula = Salary ~ Ht_n + Wt_n + Ovrl + G + birth_year_n,
    data = train
)
# Show the call and the fitted coefficients.
print(lm_model)
## 
## Call:
## lm(formula = Salary ~ Ht_n + Wt_n + Ovrl + G + birth_year_n, 
##     data = train)
## 
## Coefficients:
##  (Intercept)          Ht_n          Wt_n          Ovrl             G  birth_year_n  
##      1755235         35542        226425         -6381        119534       -886782
# Show residual quantiles, per-coefficient tests and overall fit statistics.
print(summary(lm_model))
## 
## Call:
## lm(formula = Salary ~ Ht_n + Wt_n + Ovrl + G + birth_year_n, 
##     data = train)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -4480124  -942693  -147147   848789  7989836 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   1755235     138631  12.661  < 2e-16 ***
## Ht_n            35542     109377   0.325    0.745    
## Wt_n           226425     108845   2.080    0.038 *  
## Ovrl            -6381       1272  -5.015 7.46e-07 ***
## G              119535       8660  13.802  < 2e-16 ***
## birth_year_n  -886782      75427 -11.757  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1609000 on 484 degrees of freedom
## Multiple R-squared:  0.4819, Adjusted R-squared:  0.4765 
## F-statistic: 90.02 on 5 and 484 DF,  p-value: < 2.2e-16

Predict

# Predict Salary for the rows of the `test` partition.
predicted_salary <- predict(object = lm_model, newdata = test)

Check Accuracy

# Put the observed test-set salaries next to the model predictions.
actual_preds <- data.frame(
    cbind(
        actuals = test$Salary,
        predicteds = predicted_salary
    )
)
# Correlation matrix of observed vs. predicted salary; the off-diagonal
# entry is the figure used here as the accuracy measure.
corrleation_accuracy <- cor(actual_preds)
print(corrleation_accuracy)
##              actuals predicteds
## actuals    1.0000000  0.6482224
## predicteds 0.6482224  1.0000000
# First rows of the observed/predicted pairs.
print(head(actual_preds))
##     actuals predicteds
## 518  700000  2732206.3
## 93   825000  1392633.5
## 194  700000   659478.5
## 158 4000000  1858637.2
## 490  875000  2436304.0
## 8    842500  1023810.6

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.