This is an R Markdown document that predicts the salaries of National Hockey League (NHL) players. For more details on using R Markdown see http://rmarkdown.rstudio.com. The code implements a linear regression model on the NHL data set available on Kaggle. I am thankful for the kernel prepared by Cam Nugent, available at https://www.kaggle.com/camnugent/nhl-salary-data-prediction-cleaning-and-modeling
NOTE: Download the train and test data from the above link.
# install.packages('plyr')
# install.packages('stringr')
# install.packages('magrittr')
# install.packages('scatterplot3d')
library(tidyverse)
library(plyr)
library(magrittr)
library(stringr)
library(scatterplot3d)
# NOTE(review): hard-coded, machine-specific working directory. The relative
# paths passed to read.csv() further down are resolved against it, so adjust
# this to the folder holding train.csv and test.csv on your machine.
setwd("D:/2018/Upwork/JB125")
Loading the data requires calling a suitable R function, which depends on the file type. Our dataset is in comma-separated values (CSV) format; therefore, we can use the R function “read.csv()”, as demonstrated below.
# Read the training set; the first row of the file holds the column names.
train.df <- read.csv(file = "./train.csv", header = TRUE)
names(train.df) # Check column names
## [1] "Salary" "Born" "City" "Pr.St" "Cntry" "Nat" "Ht" "Wt" "DftYr" "DftRd" "Ovrl" "Hand" "Last.Name" "First.Name" "Position"
## [16] "Team" "GP" "G" "A" "A1" "A2" "PTS" "X..." "E..." "PIM" "Shifts" "TOI" "TOIX" "TOI.GP" "TOI.GP.1"
## [31] "TOI." "IPP." "SH." "SV." "PDO" "F.60" "A.60" "Pct." "Diff" "Diff.60" "iCF" "iCF.1" "iFF" "iSF" "iSF.1"
## [46] "iSF.2" "ixG" "iSCF" "iRB" "iRS" "iDS" "sDist" "sDist.1" "Pass" "iHF" "iHF.1" "iHA" "iHDf" "iMiss" "iGVA"
## [61] "iTKA" "iBLK" "iGVA.1" "iTKA.1" "iBLK.1" "BLK." "iFOW" "iFOL" "iFOW.1" "iFOL.1" "FO." "X.FOT" "dzFOW" "dzFOL" "nzFOW"
## [76] "nzFOL" "ozFOW" "ozFOL" "FOW.Up" "FOL.Up" "FOW.Down" "FOL.Down" "FOW.Close" "FOL.Close" "OTG" "X1G" "GWG" "ENG" "PSG" "PSA"
## [91] "G.Bkhd" "G.Dflct" "G.Slap" "G.Snap" "G.Tip" "G.Wrap" "G.Wrst" "CBar" "Post" "Over" "Wide" "S.Bkhd" "S.Dflct" "S.Slap" "S.Snap"
## [106] "S.Tip" "S.Wrap" "S.Wrst" "iPenT" "iPenD" "iPENT" "iPEND" "iPenDf" "NPD" "Min" "Maj" "Match" "Misc" "Game" "CF"
## [121] "CA" "FF" "FA" "SF" "SA" "xGF" "xGA" "SCF" "SCA" "GF" "GA" "RBF" "RBA" "RSF" "RSA"
## [136] "DSF" "DSA" "FOW" "FOL" "HF" "HA" "GVA" "TKA" "PENT" "PEND" "OPS" "DPS" "PS" "OTOI" "Grit"
## [151] "DAP" "Pace" "GS" "GS.G"
# Preview the first five training rows.
head(train.df, n = 5)
## Salary Born City Pr.St Cntry Nat Ht Wt DftYr DftRd Ovrl Hand Last.Name First.Name Position Team GP G A A1 A2 PTS X... E... PIM Shifts TOI TOIX TOI.GP TOI.GP.1 TOI. IPP. SH.
## 1 925000 97-01-30 Sainte-Marie QC CAN CAN 74 190 2015 1 18 L Chabot Thomas D OTT 1 0 0 0 0 0 -2 0.0 0 13 429 7.2 7.15 7.16 15.2 0.0 0.0
## 2 2250000 93-12-21 Ottawa ON CAN CAN 74 207 2012 1 15 R Ceci Cody D OTT 79 2 15 6 9 17 -11 -10.4 20 2418 109992 1826.2 23.20 23.17 39.0 30.4 7.4
## 3 8000000 88-04-16 St. Paul MN USA USA 72 218 2006 1 7 R Okposo Kyle RW BUF 65 19 26 13 13 45 -7 -1.4 24 1443 73983 1229.2 18.97 18.95 33.1 63.4 9.7
## 4 3500000 92-01-07 Ottawa ON CAN CAN 77 220 2010 1 3 R Gudbranson Erik D VAN 30 1 5 5 0 6 -14 -5.3 18 765 36603 607.9 20.33 20.31 36.1 37.5 6.2
## 5 1750000 94-03-29 Toronto ON CAN CAN 76 217 2012 1 16 R Wilson Tom RW WSH 82 7 12 4 8 19 9 4.1 133 1453 63592 1059.7 12.93 12.93 23.5 61.3 7.8
## SV. PDO F.60 A.60 Pct. Diff Diff.60 iCF iCF.1 iFF iSF iSF.1 iSF.2 ixG iSCF iRB iRS iDS sDist sDist.1 Pass iHF iHF.1 iHA iHDf iMiss iGVA iTKA iBLK iGVA.1 iTKA.1 iBLK.1 BLK. iFOW iFOL iFOW.1
## 1 0.750 750 0.00 16.74 0.0 -2 -16.74 2 2 2 1 1 1 0.0 0 0 0 0 43.0 49.3 0.0 1 1 0 1 1 1 0 0 1 0 0 0.0 0 0 0
## 2 0.915 989 1.84 2.79 39.7 -29 -0.95 287 287 197 143 143 143 6.1 7 7 9 16 52.4 46.3 138.1 111 111 154 -43 54 74 22 159 74 22 159 8.0 1 0 1
## 3 0.934 1031 3.47 1.95 64.0 31 1.51 283 283 212 155 156 156 17.4 64 16 20 36 28.4 26.3 196.8 53 53 68 -15 57 36 26 25 36 26 25 2.4 54 45 54
## 4 0.897 959 1.58 3.45 31.4 -19 -1.88 88 88 55 40 40 40 1.4 2 1 4 5 55.1 51.0 153.0 66 66 66 0 15 23 4 44 23 4 44 7.3 0 0 0
## 5 0.917 995 1.76 2.32 43.1 -10 -0.57 166 166 118 95 95 95 9.3 35 8 10 18 30.9 26.4 96.3 239 239 134 105 23 21 36 44 21 36 44 4.4 3 7 3
## iFOL.1 FO. X.FOT dzFOW dzFOL nzFOW nzFOL ozFOW ozFOL FOW.Up FOL.Up FOW.Down FOL.Down FOW.Close FOL.Close OTG X1G GWG ENG PSG PSA G.Bkhd G.Dflct G.Slap G.Snap G.Tip G.Wrap G.Wrst CBar Post Over
## 1 0 0.0 0.0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 2 0 100.0 0.1 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 1 2
## 3 45 54.5 7.4 9 6 10 11 35 28 13 10 21 16 37 33 1 5 2 0 0 0 5 2 0 3 0 0 9 0 2 4
## 4 0 0.0 0.0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0
## 5 7 30.0 1.0 1 1 0 2 2 4 3 1 0 3 2 6 0 2 0 1 0 0 0 0 0 0 1 0 6 0 2 1
## Wide S.Bkhd S.Dflct S.Slap S.Snap S.Tip S.Wrap S.Wrst iPenT iPenD iPENT iPEND iPenDf NPD Min Maj Match Misc Game CF CA FF FA SF SA xGF xGA SCF SCA GF GA RBF RBA RSF RSA DSF DSA FOW
## 1 1 0 0 1 0 0 0 0 0 0 0 0 0 0.0 0 0 0 0 0 9 12 8 10 5 8 0.5 0.9 2 3 0 2 1 1 0 1 1 2 4
## 2 51 2 0 49 12 0 1 79 10 6 10 5 -4 2.2 10 0 0 0 0 1433 1992 1038 1423 757 997 62.0 88.8 197 280 56 85 68 82 79 94 147 176 949
## 3 51 19 3 3 20 8 2 101 12 10 11 8 -2 -0.5 12 0 0 0 0 1301 1051 986 826 734 606 70.8 46.4 235 133 71 40 60 34 76 52 136 86 739
## 4 15 0 0 18 3 0 0 19 6 7 6 6 1 2.7 4 2 0 0 0 460 605 339 467 259 340 22.0 33.6 80 130 16 35 27 20 29 32 56 52 324
## 5 20 7 2 3 10 11 1 61 44 33 40 29 -11 -14.3 33 9 0 1 1 766 992 546 720 398 495 33.5 47.5 124 159 31 41 30 37 43 53 73 90 528
## FOL HF HA GVA TKA PENT PEND OPS DPS PS OTOI Grit DAP Pace GS GS.G
## 1 5 1 2 1 1 1 1 0.0 -0.2 -0.2 40.03 1 0.0 175.7 -0.4 -0.38
## 2 939 749 671 284 197 104 98 -0.2 3.4 3.2 2850.59 290 13.3 112.5 14.1 0.18
## 3 600 340 351 168 129 56 70 3.7 1.3 5.0 2486.75 102 6.6 114.8 36.8 0.57
## 4 328 198 197 86 59 26 22 0.0 0.4 0.5 1074.41 130 17.5 105.1 5.9 0.20
## 5 490 512 422 157 126 88 68 -0.1 1.4 1.3 3459.09 425 8.3 99.5 21.8 0.27
# Read the test set; the first row of the file holds the column names.
test.df <- read.csv(file = "./test.csv", header = TRUE)
names(test.df) # Check column names
## [1] "Born" "City" "Pr.St" "Cntry" "Nat" "Ht" "Wt" "DftYr" "DftRd" "Ovrl" "Hand" "Last.Name" "First.Name" "Position" "Team"
## [16] "GP" "G" "A" "A1" "A2" "PTS" "X..." "E..." "PIM" "Shifts" "TOI" "TOIX" "TOI.GP" "TOI.GP.1" "TOI."
## [31] "IPP." "SH." "SV." "PDO" "F.60" "A.60" "Pct." "Diff" "Diff.60" "iCF" "iCF.1" "iFF" "iSF" "iSF.1" "iSF.2"
## [46] "ixG" "iSCF" "iRB" "iRS" "iDS" "sDist" "sDist.1" "Pass" "iHF" "iHF.1" "iHA" "iHDf" "iMiss" "iGVA" "iTKA"
## [61] "iBLK" "iGVA.1" "iTKA.1" "iBLK.1" "BLK." "iFOW" "iFOL" "iFOW.1" "iFOL.1" "FO." "X.FOT" "dzFOW" "dzFOL" "nzFOW" "nzFOL"
## [76] "ozFOW" "ozFOL" "FOW.Up" "FOL.Up" "FOW.Down" "FOL.Down" "FOW.Close" "FOL.Close" "OTG" "X1G" "GWG" "ENG" "PSG" "PSA" "G.Bkhd"
## [91] "G.Dflct" "G.Slap" "G.Snap" "G.Tip" "G.Wrap" "G.Wrst" "CBar" "Post" "Over" "Wide" "S.Bkhd" "S.Dflct" "S.Slap" "S.Snap" "S.Tip"
## [106] "S.Wrap" "S.Wrst" "iPenT" "iPenD" "iPENT" "iPEND" "iPenDf" "NPD" "Min" "Maj" "Match" "Misc" "Game" "CF" "CA"
## [121] "FF" "FA" "SF" "SA" "xGF" "xGA" "SCF" "SCA" "GF" "GA" "RBF" "RBA" "RSF" "RSA" "DSF"
## [136] "DSA" "FOW" "FOL" "HF" "HA" "GVA" "TKA" "PENT" "PEND" "OPS" "DPS" "PS" "OTOI" "Grit" "DAP"
## [151] "Pace" "GS" "GS.G"
# Preview the first five test rows.
head(test.df, n = 5)
## Born City Pr.St Cntry Nat Ht Wt DftYr DftRd Ovrl Hand Last.Name First.Name Position Team GP G A A1 A2 PTS X... E... PIM Shifts TOI TOIX TOI.GP TOI.GP.1 TOI. IPP. SH. SV. PDO
## 1 88-11-05 Ithaca NY USA USA 72 216 2003 1 13 R Brown Dustin RW/LW L.A 80 14 22 9 13 36 -4 8.2 22 1729 76801 1278.5 16.00 15.99 27.2 65.5 8.5 0.898 982
## 2 00-02-29 Prague CZE CZE 72 195 2014 1 13 L Vrana Jakub LW WSH 21 3 3 2 1 6 2 0.4 2 291 13997 233.2 11.12 11.11 22.0 66.7 6.7 0.969 1037
## 3 92-04-24 St. Louis MO USA USA 75 227 2007 6 161 L Maroon Patrick LW EDM 81 27 15 9 6 42 13 15.0 95 1715 81345 1351.9 16.73 16.72 30.0 54.5 9.5 0.919 1014
## 4 99-07-05 Piikkio FIN FIN 72 182 2013 2 55 L Lehkonen Artturi LW MTL 73 18 10 3 7 28 -1 3.0 8 1488 60702 1010.0 13.87 13.85 24.9 63.6 8.5 0.922 1007
## 5 96-10-27 Niagara Falls NY USA USA 72 196 2011 2 36 R Clendening Adam D NYR 31 2 9 2 7 11 3 3.6 17 658 29406 490.2 15.82 15.81 28.7 47.8 9.3 0.909 1002
## F.60 A.60 Pct. Diff Diff.60 iCF iCF.1 iFF iSF iSF.1 iSF.2 ixG iSCF iRB iRS iDS sDist sDist.1 Pass iHF iHF.1 iHA iHDf iMiss iGVA iTKA iBLK iGVA.1 iTKA.1 iBLK.1 BLK. iFOW iFOL iFOW.1 iFOL.1 FO.
## 1 2.58 2.77 48.2 -4 -0.19 326 326 251 175 175 175 19.7 73 19 19 38 28.2 27.0 198.5 190 190 151 39 76 27 25 31 27 25 31 2.8 2 3 2 3 40
## 2 2.32 0.77 75.0 6 1.54 56 56 49 32 32 32 5.2 23 3 2 5 21.0 23.2 57.7 6 6 11 -5 17 9 4 7 9 4 7 3.9 0 3 0 3 0
## 3 3.42 2.09 62.1 30 1.33 300 300 243 178 178 178 25.1 109 23 17 40 21.8 21.7 154.6 189 189 109 80 65 49 33 23 49 33 23 2.1 6 9 6 9 40
## 4 2.61 2.26 53.7 6 0.36 279 279 208 158 158 158 17.0 58 13 20 33 28.1 26.1 81.4 72 72 113 -41 50 20 25 25 20 25 25 2.9 2 2 2 2 50
## 5 2.82 2.08 57.5 6 0.73 89 89 58 39 38 38 1.7 1 0 2 2 59.1 41.7 24.2 22 22 66 -44 19 20 7 27 20 7 27 7.6 0 0 0 0 0
## X.FOT dzFOW dzFOL nzFOW nzFOL ozFOW ozFOL FOW.Up FOL.Up FOW.Down FOL.Down FOW.Close FOL.Close OTG X1G GWG ENG PSG PSA G.Bkhd G.Dflct G.Slap G.Snap G.Tip G.Wrap G.Wrst CBar Post Over Wide S.Bkhd
## 1 0.4 2 2 0 0 0 1 2 2 0 1 0 2 0 2 1 0 0 0 1 0 0 4 2 0 7 0 2 3 71 15
## 2 1.7 0 0 0 0 0 3 0 2 0 0 0 1 0 2 2 0 0 0 0 0 0 1 2 0 0 0 1 2 14 3
## 3 1.2 0 0 0 1 6 8 4 3 1 2 3 6 0 8 5 0 0 0 2 1 0 1 2 1 20 0 5 3 57 17
## 4 0.4 0 1 1 0 1 1 1 1 0 0 1 2 0 2 3 0 0 0 3 1 4 2 0 1 7 0 2 3 45 16
## 5 0.0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 19 1
## S.Dflct S.Slap S.Snap S.Tip S.Wrap S.Wrst iPenT iPenD iPENT iPEND iPenDf NPD Min Maj Match Misc Game CF CA FF FA SF SA xGF xGA SCF SCA GF GA RBF RBA RSF RSA DSF DSA FOW FOL HF HA GVA
## 1 1 5 39 13 2 100 11 25 10 19 14 11.3 11 0 0 0 0 1239 1114 927 821 649 577 60.8 52.4 203 184 55 59 62 39 57 73 119 112 639 652 651 542 194
## 2 0 4 2 7 0 16 1 2 1 1 1 0.5 1 0 0 0 0 282 179 201 132 134 98 14.2 8.0 53 24 9 3 11 8 13 15 24 23 91 85 75 83 42
## 3 2 5 36 16 6 96 34 19 34 19 -15 -13.0 25 9 0 0 0 1500 1091 1116 791 809 581 82.0 53.2 322 186 77 47 55 31 65 77 120 108 555 661 511 486 298
## 4 3 10 17 6 9 97 4 8 4 8 4 3.8 4 0 0 0 0 1006 877 714 655 518 489 47.2 39.0 163 140 44 38 34 29 72 49 106 78 521 461 399 499 160
## 5 0 7 15 0 0 15 7 4 7 4 -3 -0.2 6 1 0 0 0 508 353 372 267 247 187 24.2 16.7 77 44 23 17 14 8 19 21 33 29 239 227 175 211 87
## TKA PENT PEND OPS DPS PS OTOI Grit DAP Pace GS GS.G
## 1 83 72 65 1.9 1.8 3.7 3418.06 243 19.5 110.4 47.3 0.59
## 2 26 16 10 0.3 0.3 0.6 826.49 15 10.0 118.6 8.9 0.42
## 3 184 77 86 3.9 2.0 6.0 3155.13 316 8.9 115.0 52.5 0.65
## 4 95 44 40 2.3 1.1 3.4 3044.03 105 24.3 111.9 38.1 0.52
## 5 60 30 25 0.8 1.1 1.9 1217.17 67 4.8 105.4 15.9 0.51
Before cleaning the data, let us combine the two data sets, since their features are the same except for “Salary”, which appears only in the train data set. “Salary” is the response (dependent) variable, i.e. the value to be predicted. We will build a model to predict the salary of players.
Let us check the number of rows and columns in each data set.
# Rows and columns of the training set (dispatches to the data-frame method).
dim(train.df)
## [1] 612 154
# Rows and columns of the test set (dispatches to the data-frame method).
dim(test.df)
## [1] 262 153
Combine train and test datasets.
# Drop the response column by name rather than by position, so the training
# predictors still line up with the test set if the column order ever changes.
# drop = FALSE guarantees the result stays a data frame.
train_data <- train.df[, names(train.df) != "Salary", drop = FALSE]
dim(train_data)
## [1] 612 153
test_data <- test.df
dim(test_data)
## [1] 262 153
# rbind() on data frames matches columns by name; fail early if the two sets
# do not share exactly the same predictors.
stopifnot(setequal(names(train_data), names(test_data)))
all_data <- rbind(train_data, test_data)
#######
colnames(all_data)
## [1] "Born" "City" "Pr.St" "Cntry" "Nat" "Ht" "Wt" "DftYr" "DftRd" "Ovrl" "Hand" "Last.Name" "First.Name" "Position" "Team"
## [16] "GP" "G" "A" "A1" "A2" "PTS" "X..." "E..." "PIM" "Shifts" "TOI" "TOIX" "TOI.GP" "TOI.GP.1" "TOI."
## [31] "IPP." "SH." "SV." "PDO" "F.60" "A.60" "Pct." "Diff" "Diff.60" "iCF" "iCF.1" "iFF" "iSF" "iSF.1" "iSF.2"
## [46] "ixG" "iSCF" "iRB" "iRS" "iDS" "sDist" "sDist.1" "Pass" "iHF" "iHF.1" "iHA" "iHDf" "iMiss" "iGVA" "iTKA"
## [61] "iBLK" "iGVA.1" "iTKA.1" "iBLK.1" "BLK." "iFOW" "iFOL" "iFOW.1" "iFOL.1" "FO." "X.FOT" "dzFOW" "dzFOL" "nzFOW" "nzFOL"
## [76] "ozFOW" "ozFOL" "FOW.Up" "FOL.Up" "FOW.Down" "FOL.Down" "FOW.Close" "FOL.Close" "OTG" "X1G" "GWG" "ENG" "PSG" "PSA" "G.Bkhd"
## [91] "G.Dflct" "G.Slap" "G.Snap" "G.Tip" "G.Wrap" "G.Wrst" "CBar" "Post" "Over" "Wide" "S.Bkhd" "S.Dflct" "S.Slap" "S.Snap" "S.Tip"
## [106] "S.Wrap" "S.Wrst" "iPenT" "iPenD" "iPENT" "iPEND" "iPenDf" "NPD" "Min" "Maj" "Match" "Misc" "Game" "CF" "CA"
## [121] "FF" "FA" "SF" "SA" "xGF" "xGA" "SCF" "SCA" "GF" "GA" "RBF" "RBA" "RSF" "RSA" "DSF"
## [136] "DSA" "FOW" "FOL" "HF" "HA" "GVA" "TKA" "PENT" "PEND" "OPS" "DPS" "PS" "OTOI" "Grit" "DAP"
## [151] "Pace" "GS" "GS.G"
# Rows and columns of the combined data set.
c(nrow(all_data), ncol(all_data))
## [1] 874 153
Check the columns with missing values.
# Names of the columns that contain at least one missing value:
# count the NAs per column, keep the positive counts, and take their names.
all_missing_list <- names(which(colSums(is.na(all_data)) > 0))
print(all_missing_list)
## [1] "DftYr" "DftRd" "Ovrl" "TOIX" "TOI." "IPP." "SH." "SV." "PDO" "F.60" "A.60" "Diff.60" "iCF" "iFF" "iSF" "ixG" "iSCF" "iRB" "iRS"
## [20] "iDS" "sDist.1" "Pass" "iHF.1" "iHA" "iHDf" "iGVA.1" "iTKA.1" "iBLK.1" "BLK." "iFOW.1" "iFOL.1" "X.FOT" "iPENT" "iPEND" "CF" "CA" "FF" "FA"
## [39] "SF" "SA" "xGF" "xGA" "SCF" "SCA" "GF" "GA" "RBF" "RBA" "RSF" "RSA" "FOW" "FOL" "HF" "HA" "GVA" "TKA" "PENT"
## [58] "PEND" "OTOI" "Pace" "GS" "GS.G"
library(plyr)
# Players with an empty province/state value are international players;
# relabel that empty value as "INT".
all_data[["Pr.St"]] <- mapvalues(all_data[["Pr.St"]], from = "", to = "INT")
The Team column states which team a player played for. Some players played for multiple teams. We will split each team into its own boolean predictor; players who played for multiple teams are flagged in every corresponding team column.
# Build the unique list of team acronyms used for the boolean team columns.
# A `Team` entry may list several teams separated by "/", so each distinct
# entry is split before de-duplicating.
# factor() is applied first so this works whether `Team` was read as a factor
# or as a character column (the read.csv() default since R 4.0.0); calling
# levels() on a character column returns NULL, which would silently leave
# `teams` empty.
team_combos <- levels(factor(all_data$Team))
# fixed = TRUE treats "/" as a literal separator rather than a regex.
teams <- unique(unlist(strsplit(team_combos, "/", fixed = TRUE)))
print(teams)
## [1] "ANA" "FLA" "N.J" "VAN" "ARI" "CGY" "MIN" "NYR" "TOR" "BOS" "WPG" "BUF" "CAR" "OTT" "PIT" "STL" "CBJ" "DAL" "CHI" "COL" "MTL" "NSH" "S.J" "DET" "EDM" "L.A" "T.B" "NYI" "PHI" "WSH"
# Create one column per team acronym, initialised to 0 in every row,
# and echo each acronym as its column is added.
for (team_code in teams) {
  all_data[[team_code]] <- 0
  print(team_code)
}
## [1] "ANA"
## [1] "FLA"
## [1] "N.J"
## [1] "VAN"
## [1] "ARI"
## [1] "CGY"
## [1] "MIN"
## [1] "NYR"
## [1] "TOR"
## [1] "BOS"
## [1] "WPG"
## [1] "BUF"
## [1] "CAR"
## [1] "OTT"
## [1] "PIT"
## [1] "STL"
## [1] "CBJ"
## [1] "DAL"
## [1] "CHI"
## [1] "COL"
## [1] "MTL"
## [1] "NSH"
## [1] "S.J"
## [1] "DET"
## [1] "EDM"
## [1] "L.A"
## [1] "T.B"
## [1] "NYI"
## [1] "PHI"
## [1] "WSH"
# Check the newly created team columns on the first five rows.
head(all_data, n = 5)
## Born City Pr.St Cntry Nat Ht Wt DftYr DftRd Ovrl Hand Last.Name First.Name Position Team GP G A A1 A2 PTS X... E... PIM Shifts TOI TOIX TOI.GP TOI.GP.1 TOI. IPP. SH. SV.
## 1 97-01-30 Sainte-Marie QC CAN CAN 74 190 2015 1 18 L Chabot Thomas D OTT 1 0 0 0 0 0 -2 0.0 0 13 429 7.2 7.15 7.16 15.2 0.0 0.0 0.750
## 2 93-12-21 Ottawa ON CAN CAN 74 207 2012 1 15 R Ceci Cody D OTT 79 2 15 6 9 17 -11 -10.4 20 2418 109992 1826.2 23.20 23.17 39.0 30.4 7.4 0.915
## 3 88-04-16 St. Paul MN USA USA 72 218 2006 1 7 R Okposo Kyle RW BUF 65 19 26 13 13 45 -7 -1.4 24 1443 73983 1229.2 18.97 18.95 33.1 63.4 9.7 0.934
## 4 92-01-07 Ottawa ON CAN CAN 77 220 2010 1 3 R Gudbranson Erik D VAN 30 1 5 5 0 6 -14 -5.3 18 765 36603 607.9 20.33 20.31 36.1 37.5 6.2 0.897
## 5 94-03-29 Toronto ON CAN CAN 76 217 2012 1 16 R Wilson Tom RW WSH 82 7 12 4 8 19 9 4.1 133 1453 63592 1059.7 12.93 12.93 23.5 61.3 7.8 0.917
## PDO F.60 A.60 Pct. Diff Diff.60 iCF iCF.1 iFF iSF iSF.1 iSF.2 ixG iSCF iRB iRS iDS sDist sDist.1 Pass iHF iHF.1 iHA iHDf iMiss iGVA iTKA iBLK iGVA.1 iTKA.1 iBLK.1 BLK. iFOW iFOL iFOW.1 iFOL.1
## 1 750 0.00 16.74 0.0 -2 -16.74 2 2 2 1 1 1 0.0 0 0 0 0 43.0 49.3 0.0 1 1 0 1 1 1 0 0 1 0 0 0.0 0 0 0 0
## 2 989 1.84 2.79 39.7 -29 -0.95 287 287 197 143 143 143 6.1 7 7 9 16 52.4 46.3 138.1 111 111 154 -43 54 74 22 159 74 22 159 8.0 1 0 1 0
## 3 1031 3.47 1.95 64.0 31 1.51 283 283 212 155 156 156 17.4 64 16 20 36 28.4 26.3 196.8 53 53 68 -15 57 36 26 25 36 26 25 2.4 54 45 54 45
## 4 959 1.58 3.45 31.4 -19 -1.88 88 88 55 40 40 40 1.4 2 1 4 5 55.1 51.0 153.0 66 66 66 0 15 23 4 44 23 4 44 7.3 0 0 0 0
## 5 995 1.76 2.32 43.1 -10 -0.57 166 166 118 95 95 95 9.3 35 8 10 18 30.9 26.4 96.3 239 239 134 105 23 21 36 44 21 36 44 4.4 3 7 3 7
## FO. X.FOT dzFOW dzFOL nzFOW nzFOL ozFOW ozFOL FOW.Up FOL.Up FOW.Down FOL.Down FOW.Close FOL.Close OTG X1G GWG ENG PSG PSA G.Bkhd G.Dflct G.Slap G.Snap G.Tip G.Wrap G.Wrst CBar Post Over Wide
## 1 0.0 0.0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1
## 2 100.0 0.1 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 1 2 51
## 3 54.5 7.4 9 6 10 11 35 28 13 10 21 16 37 33 1 5 2 0 0 0 5 2 0 3 0 0 9 0 2 4 51
## 4 0.0 0.0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 15
## 5 30.0 1.0 1 1 0 2 2 4 3 1 0 3 2 6 0 2 0 1 0 0 0 0 0 0 1 0 6 0 2 1 20
## S.Bkhd S.Dflct S.Slap S.Snap S.Tip S.Wrap S.Wrst iPenT iPenD iPENT iPEND iPenDf NPD Min Maj Match Misc Game CF CA FF FA SF SA xGF xGA SCF SCA GF GA RBF RBA RSF RSA DSF DSA FOW FOL HF
## 1 0 0 1 0 0 0 0 0 0 0 0 0 0.0 0 0 0 0 0 9 12 8 10 5 8 0.5 0.9 2 3 0 2 1 1 0 1 1 2 4 5 1
## 2 2 0 49 12 0 1 79 10 6 10 5 -4 2.2 10 0 0 0 0 1433 1992 1038 1423 757 997 62.0 88.8 197 280 56 85 68 82 79 94 147 176 949 939 749
## 3 19 3 3 20 8 2 101 12 10 11 8 -2 -0.5 12 0 0 0 0 1301 1051 986 826 734 606 70.8 46.4 235 133 71 40 60 34 76 52 136 86 739 600 340
## 4 0 0 18 3 0 0 19 6 7 6 6 1 2.7 4 2 0 0 0 460 605 339 467 259 340 22.0 33.6 80 130 16 35 27 20 29 32 56 52 324 328 198
## 5 7 2 3 10 11 1 61 44 33 40 29 -11 -14.3 33 9 0 1 1 766 992 546 720 398 495 33.5 47.5 124 159 31 41 30 37 43 53 73 90 528 490 512
## HA GVA TKA PENT PEND OPS DPS PS OTOI Grit DAP Pace GS GS.G ANA FLA N.J VAN ARI CGY MIN NYR TOR BOS WPG BUF CAR OTT PIT STL CBJ DAL CHI COL MTL NSH S.J DET EDM L.A T.B NYI PHI WSH
## 1 2 1 1 1 1 0.0 -0.2 -0.2 40.03 1 0.0 175.7 -0.4 -0.38 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 2 671 284 197 104 98 -0.2 3.4 3.2 2850.59 290 13.3 112.5 14.1 0.18 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 3 351 168 129 56 70 3.7 1.3 5.0 2486.75 102 6.6 114.8 36.8 0.57 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 4 197 86 59 26 22 0.0 0.4 0.5 1074.41 130 17.5 105.1 5.9 0.20 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 5 422 157 126 88 68 -0.1 1.4 1.3 3459.09 425 8.3 99.5 21.8 0.27 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
# NOTE(review): column names are case-sensitive and the data frame has `Team`,
# not `team`, so this prints NULL. Presumably `all_data$Team` was intended.
print(all_data$team)
## NULL
# Iterate through the players and record the team(s) each one played for.
# seq_along() is used instead of 1:length() so that an empty data set yields
# zero iterations instead of looping over the indices 1 and 0.
for (i in seq_along(all_data$Team)) {
  # as.character() makes the split work for factor and character columns;
  # fixed = TRUE treats "/" as a literal separator.
  teams_of_person <- strsplit(as.character(all_data$Team[i]), "/",
                              fixed = TRUE)[[1]]
  print(teams_of_person)
  for (team_code in teams_of_person) {
    # Set the indicator to 1 in the column of each team the player is
    # associated with.
    all_data[i, team_code] <- 1
  }
}
## [1] "OTT"
## [1] "OTT"
## [1] "BUF"
## [1] "VAN"
## [1] "WSH"
## [1] "CHI"
## [1] "VAN"
## [1] "BUF"
## [1] "N.J"
## [1] "T.B"
## [1] "CHI"
## [1] "VAN"
## [1] "ARI" "MIN"
## [1] "CGY"
## [1] "FLA"
## [1] "CGY"
## [1] "PIT"
## [1] "EDM" "NYR"
## [1] "TOR"
## [1] "N.J"
## [1] "TOR"
## [1] "STL"
## [1] "TOR"
## [1] "T.B"
## [1] "BOS"
## [1] "BOS"
## [1] "COL"
## [1] "STL"
## [1] "PIT"
## [1] "BOS"
## [1] "PHI"
## [1] "CAR"
## [1] "ANA" "VAN"
## [1] "MTL"
## [1] "PHI"
## [1] "COL"
## [1] "NSH"
## [1] "CAR"
## [1] "DAL" "MTL"
## [1] "T.B"
## [1] "EDM"
## [1] "L.A"
## [1] "DAL"
## [1] "DET"
## [1] "MTL"
## [1] "CBJ"
## [1] "ANA"
## [1] "TOR"
## [1] "PHI"
## [1] "ARI"
## [1] "CAR"
## [1] "PHI"
## [1] "CAR"
## [1] "MIN"
## [1] "BOS"
## [1] "MIN"
## [1] "COL"
## [1] "EDM"
## [1] "NYI"
## [1] "BUF"
## [1] "DAL"
## [1] "CBJ"
## [1] "NYI"
## [1] "CHI"
## [1] "EDM"
## [1] "MIN"
## [1] "OTT"
## [1] "VAN"
## [1] "CGY"
## [1] "S.J"
## [1] "WPG"
## [1] "ANA"
## [1] "S.J"
## [1] "VAN"
## [1] "L.A" "MTL"
## [1] "MTL"
## [1] "WSH"
## [1] "NSH"
## [1] "L.A"
## [1] "ARI"
## [1] "DET"
## [1] "WSH"
## [1] "N.J"
## [1] "WSH"
## [1] "T.B"
## [1] "DET"
## [1] "PHI"
## [1] "CHI"
## [1] "N.J" "NSH" "VAN"
## [1] "ARI"
## [1] "ANA"
## [1] "FLA"
## [1] "ANA"
## [1] "CGY"
## [1] "CBJ"
## [1] "CBJ"
## [1] "OTT"
## [1] "N.J"
## [1] "MTL"
## [1] "WPG"
## [1] "VAN"
## [1] "COL"
## [1] "CHI"
## [1] "CHI"
## [1] "FLA"
## [1] "L.A"
## [1] "S.J"
## [1] "CGY"
## [1] "CAR"
## [1] "CAR"
## [1] "ARI"
## [1] "N.J"
## [1] "PHI"
## [1] "S.J"
## [1] "TOR"
## [1] "L.A"
## [1] "BUF"
## [1] "WPG"
## [1] "NYI"
## [1] "MIN"
## [1] "MIN"
## [1] "OTT" "VAN"
## [1] "BUF"
## [1] "CHI"
## [1] "PHI"
## [1] "NSH"
## [1] "PHI"
## [1] "BOS"
## [1] "T.B"
## [1] "CGY"
## [1] "MIN"
## [1] "FLA"
## [1] "L.A"
## [1] "N.J"
## [1] "COL" "S.J"
## [1] "NYI"
## [1] "COL" "MTL"
## [1] "NYI"
## [1] "COL"
## [1] "FLA"
## [1] "CBJ"
## [1] "BOS"
## [1] "PIT"
## [1] "NSH"
## [1] "PIT"
## [1] "PIT"
## [1] "N.J"
## [1] "EDM"
## [1] "STL"
## [1] "BOS"
## [1] "MIN"
## [1] "CGY"
## [1] "MTL"
## [1] "L.A"
## [1] "FLA"
## [1] "BOS"
## [1] "NYR"
## [1] "ARI" "CGY"
## [1] "WSH"
## [1] "S.J"
## [1] "S.J"
## [1] "BUF"
## [1] "N.J"
## [1] "T.B"
## [1] "EDM"
## [1] "CGY"
## [1] "OTT"
## [1] "ANA"
## [1] "PIT"
## [1] "MTL"
## [1] "CAR"
## [1] "CBJ"
## [1] "PHI"
## [1] "T.B"
## [1] "BOS"
## [1] "DAL"
## [1] "TOR"
## [1] "T.B"
## [1] "STL" "WSH"
## [1] "FLA" "TOR"
## [1] "CAR"
## [1] "FLA" "NYR"
## [1] "PHI"
## [1] "NYI"
## [1] "DET"
## [1] "DAL"
## [1] "EDM"
## [1] "BOS"
## [1] "VAN"
## [1] "OTT"
## [1] "DAL"
## [1] "MIN"
## [1] "VAN"
## [1] "OTT"
## [1] "CBJ"
## [1] "WPG"
## [1] "DET"
## [1] "PIT"
## [1] "BOS"
## [1] "S.J"
## [1] "BUF"
## [1] "PHI"
## [1] "L.A"
## [1] "ARI"
## [1] "L.A"
## [1] "CGY"
## [1] "S.J"
## [1] "WPG"
## [1] "NSH"
## [1] "MIN"
## [1] "COL"
## [1] "CAR"
## [1] "ANA"
## [1] "VAN"
## [1] "DET"
## [1] "ANA"
## [1] "MIN"
## [1] "ANA"
## [1] "BOS"
## [1] "DAL"
## [1] "WSH"
## [1] "NSH"
## [1] "COL"
## [1] "OTT"
## [1] "CBJ"
## [1] "DAL"
## [1] "PHI"
## [1] "CBJ"
## [1] "CGY"
## [1] "CGY"
## [1] "VAN"
## [1] "BOS"
## [1] "CHI"
## [1] "WSH"
## [1] "N.J"
## [1] "COL" "NSH"
## [1] "CGY"
## [1] "OTT"
## [1] "NYR"
## [1] "N.J"
## [1] "NYI"
## [1] "FLA"
## [1] "MIN"
## [1] "BOS"
## [1] "CBJ"
## [1] "N.J"
## [1] "ANA"
## [1] "EDM"
## [1] "S.J"
## [1] "CAR"
## [1] "MIN"
## [1] "T.B"
## [1] "BUF"
## [1] "ANA"
## [1] "MTL" "T.B"
## [1] "DET"
## [1] "MIN"
## [1] "CBJ"
## [1] "EDM"
## [1] "DET" "MTL"
## [1] "WSH"
## [1] "BUF"
## [1] "DET"
## [1] "PHI"
## [1] "NSH" "STL"
## [1] "PHI"
## [1] "ANA"
## [1] "DET"
## [1] "STL"
## [1] "WPG"
## [1] "BUF"
## [1] "DET"
## [1] "L.A"
## [1] "DET"
## [1] "S.J"
## [1] "BOS" "WPG"
## [1] "FLA"
## [1] "S.J"
## [1] "MIN"
## [1] "FLA"
## [1] "DET" "TOR"
## [1] "COL"
## [1] "WPG"
## [1] "N.J"
## [1] "S.J"
## [1] "WSH"
## [1] "T.B"
## [1] "N.J"
## [1] "ANA"
## [1] "NYR"
## [1] "DET"
## [1] "BOS"
## [1] "TOR"
## [1] "PIT"
## [1] "DET"
## [1] "CAR" "STL"
## [1] "FLA"
## [1] "L.A"
## [1] "BUF"
## [1] "CHI"
## [1] "NSH"
## [1] "CAR"
## [1] "OTT"
## [1] "MTL"
## [1] "CHI"
## [1] "ARI"
## [1] "CAR"
## [1] "CAR"
## [1] "DET"
## [1] "CBJ" "DAL"
## [1] "PHI" "PIT"
## [1] "S.J"
## [1] "NSH"
## [1] "WPG"
## [1] "ANA"
## [1] "PIT"
## [1] "T.B"
## [1] "EDM"
## [1] "CBJ"
## [1] "NYR"
## [1] "COL"
## [1] "CAR"
## [1] "BUF"
## [1] "EDM" "MTL"
## [1] "NSH"
## [1] "NYR"
## [1] "ARI" "MIN"
## [1] "ARI"
## [1] "T.B"
## [1] "PIT"
## [1] "BOS"
## [1] "OTT"
## [1] "N.J"
## [1] "OTT"
## [1] "NSH"
## [1] "STL"
## [1] "DAL"
## [1] "NYI"
## [1] "N.J"
## [1] "TOR"
## [1] "TOR"
## [1] "VAN"
## [1] "OTT" "S.J"
## [1] "BOS"
## [1] "T.B"
## [1] "EDM"
## [1] "STL"
## [1] "ARI"
## [1] "BUF"
## [1] "NSH"
## [1] "MTL"
## [1] "ARI" "NYR"
## [1] "N.J" "NSH"
## [1] "DAL"
## [1] "ANA"
## [1] "MTL"
## [1] "ANA"
## [1] "DAL"
## [1] "TOR"
## [1] "L.A"
## [1] "TOR"
## [1] "PIT"
## [1] "PHI"
## [1] "BOS"
## [1] "DAL"
## [1] "NSH"
## [1] "S.J"
## [1] "ARI"
## [1] "PIT"
## [1] "NSH"
## [1] "PHI"
## [1] "NSH"
## [1] "PIT"
## [1] "TOR"
## [1] "DAL"
## [1] "CBJ"
## [1] "VAN"
## [1] "MTL"
## [1] "CHI"
## [1] "NYI"
## [1] "ARI"
## [1] "L.A"
## [1] "BUF"
## [1] "OTT"
## [1] "ARI"
## [1] "DAL"
## [1] "L.A"
## [1] "WSH"
## [1] "CAR"
## [1] "CHI"
## [1] "NYR"
## [1] "EDM"
## [1] "MTL"
## [1] "VAN"
## [1] "WSH"
## [1] "BUF"
## [1] "ANA"
## [1] "COL"
## [1] "DAL"
## [1] "FLA"
## [1] "STL"
## [1] "FLA" "NYR"
## [1] "FLA"
## [1] "TOR"
## [1] "N.J"
## [1] "NYI"
## [1] "PIT"
## [1] "L.A"
## [1] "ARI"
## [1] "NSH"
## [1] "PIT"
## [1] "CAR" "PIT"
## [1] "COL" "MTL"
## [1] "ANA"
## [1] "BUF"
## [1] "NSH"
## [1] "OTT"
## [1] "NSH"
## [1] "CHI"
## [1] "STL"
## [1] "MTL"
## [1] "ARI"
## [1] "MIN"
## [1] "PIT"
## [1] "BUF"
## [1] "WSH"
## [1] "CAR"
## [1] "T.B"
## [1] "PHI"
## [1] "BUF"
## [1] "N.J"
## [1] "WPG"
## [1] "CBJ"
## [1] "MTL"
## [1] "ARI" "TOR"
## [1] "DAL" "MTL"
## [1] "NYI"
## [1] "BUF"
## [1] "MIN"
## [1] "VAN"
## [1] "DET" "FLA"
## [1] "PIT"
## [1] "BOS"
## [1] "TOR"
## [1] "CBJ" "N.J"
## [1] "COL" "TOR"
## [1] "N.J"
## [1] "VAN"
## [1] "VAN"
## [1] "PIT"
## [1] "CBJ"
## [1] "PHI"
## [1] "BOS"
## [1] "PIT"
## [1] "ANA"
## [1] "BOS"
## [1] "FLA"
## [1] "STL"
## [1] "STL"
## [1] "OTT"
## [1] "BUF"
## [1] "COL"
## [1] "ANA"
## [1] "COL"
## [1] "EDM"
## [1] "OTT"
## [1] "NSH"
## [1] "CGY"
## [1] "N.J" "NSH"
## [1] "BUF"
## [1] "NYI"
## [1] "CGY"
## [1] "FLA"
## [1] "S.J"
## [1] "ANA" "N.J"
## [1] "WPG"
## [1] "PIT"
## [1] "TOR"
## [1] "N.J"
## [1] "CAR"
## [1] "FLA"
## [1] "NYR"
## [1] "T.B" "TOR"
## [1] "FLA"
## [1] "ANA" "FLA"
## [1] "CGY" "OTT"
## [1] "NSH"
## [1] "CBJ"
## [1] "N.J"
## [1] "MTL"
## [1] "STL"
## [1] "T.B"
## [1] "WSH"
## [1] "FLA"
## [1] "NYR"
## [1] "VAN"
## [1] "CHI"
## [1] "NYI"
## [1] "N.J"
## [1] "TOR"
## [1] "PIT" "TOR"
## [1] "COL"
## [1] "NYR"
## [1] "NYI"
## [1] "MTL"
## [1] "NSH"
## [1] "PHI"
## [1] "DET"
## [1] "N.J"
## [1] "PHI"
## [1] "BOS"
## [1] "COL"
## [1] "CAR"
## [1] "MTL"
## [1] "T.B"
## [1] "WPG"
## [1] "WSH"
## [1] "CBJ"
## [1] "VAN"
## [1] "ARI"
## [1] "CGY"
## [1] "WSH"
## [1] "NYR"
## [1] "EDM"
## [1] "OTT"
## [1] "EDM"
## [1] "T.B"
## [1] "MIN"
## [1] "DET"
## [1] "L.A"
## [1] "COL"
## [1] "BOS"
## [1] "ARI"
## [1] "CBJ"
## [1] "VAN"
## [1] "BUF"
## [1] "CGY"
## [1] "WPG"
## [1] "MTL"
## [1] "CGY"
## [1] "STL"
## [1] "STL"
## [1] "MTL"
## [1] "COL"
## [1] "MIN"
## [1] "WPG"
## [1] "PHI" "T.B"
## [1] "CAR" "OTT"
## [1] "STL"
## [1] "DAL"
## [1] "WSH"
## [1] "BUF"
## [1] "NYI"
## [1] "MIN"
## [1] "S.J"
## [1] "DAL"
## [1] "NYI"
## [1] "EDM"
## [1] "NYR"
## [1] "NSH"
## [1] "MTL"
## [1] "T.B"
## [1] "STL"
## [1] "T.B"
## [1] "T.B"
## [1] "STL"
## [1] "PHI"
## [1] "NYR"
## [1] "VAN"
## [1] "WPG"
## [1] "NYR"
## [1] "WSH"
## [1] "N.J"
## [1] "STL"
## [1] "CHI"
## [1] "T.B"
## [1] "CAR"
## [1] "OTT"
## [1] "PHI"
## [1] "PIT"
## [1] "VAN"
## [1] "VAN"
## [1] "WPG"
## [1] "N.J"
## [1] "FLA"
## [1] "CGY"
## [1] "CAR"
## [1] "STL"
## [1] "NSH"
## [1] "ARI"
## [1] "ANA"
## [1] "NYI"
## [1] "ANA"
## [1] "OTT"
## [1] "PIT"
## [1] "CAR"
## [1] "NYI"
## [1] "NSH"
## [1] "NSH"
## [1] "WPG"
## [1] "CBJ"
## [1] "NSH"
## [1] "DET"
## [1] "NYI"
## [1] "S.J"
## [1] "NYI"
## [1] "VAN"
## [1] "L.A"
## [1] "MIN"
## [1] "S.J"
## [1] "MTL"
## [1] "STL"
## [1] "L.A"
## [1] "WSH"
## [1] "EDM"
## [1] "MTL"
## [1] "NYR"
## [1] "L.A"
## [1] "ARI" "WPG"
## [1] "DAL"
## [1] "OTT"
## [1] "NSH"
## [1] "ARI"
## [1] "N.J"
## [1] "MTL"
## [1] "PHI"
## [1] "L.A"
## [1] "STL" "WSH"
## [1] "CGY"
## [1] "PIT"
## [1] "NYI"
## [1] "N.J"
## [1] "DAL"
## [1] "COL" "MTL"
## [1] "BUF"
## [1] "EDM"
## [1] "FLA"
## [1] "COL"
## [1] "BOS"
## [1] "WSH"
## [1] "NYR"
## [1] "S.J"
## [1] "NYR"
## [1] "DAL"
## [1] "WPG"
## [1] "TOR"
## [1] "VAN"
## [1] "BUF"
## [1] "ANA"
## [1] "NSH"
## [1] "N.J"
## [1] "NSH"
## [1] "DET"
## [1] "BOS"
## [1] "DET"
## [1] "EDM"
## [1] "STL"
## [1] "L.A"
## [1] "DET" "NYR"
## [1] "CHI"
## [1] "WSH"
## [1] "ARI"
## [1] "CHI"
## [1] "STL"
## [1] "CBJ"
## [1] "PIT"
## [1] "ARI"
## [1] "BOS"
## [1] "MTL"
## [1] "S.J"
## [1] "COL" "L.A"
## [1] "CHI"
## [1] "MIN"
## [1] "T.B"
## [1] "CHI"
## [1] "T.B"
## [1] "CBJ"
## [1] "DAL"
## [1] "T.B"
## [1] "PIT"
## [1] "FLA" "T.B"
## [1] "ANA"
## [1] "OTT"
## [1] "PHI"
## [1] "MIN"
## [1] "EDM"
## [1] "WSH"
## [1] "NYI"
## [1] "S.J"
## [1] "DAL"
## [1] "DET"
## [1] "BUF"
## [1] "BUF" "NSH"
## [1] "ARI"
## [1] "EDM"
## [1] "BUF"
## [1] "S.J"
## [1] "DET"
## [1] "OTT"
## [1] "OTT"
## [1] "NYR"
## [1] "MTL"
## [1] "BUF"
## [1] "ANA"
## [1] "STL"
## [1] "DET"
## [1] "PIT"
## [1] "BOS"
## [1] "WPG"
## [1] "COL"
## [1] "NYR"
## [1] "EDM"
## [1] "DAL"
## [1] "NYR"
## [1] "L.A"
## [1] "STL"
## [1] "DAL"
## [1] "ANA"
## [1] "CAR"
## [1] "TOR"
## [1] "WSH"
## [1] "CGY"
## [1] "OTT"
## [1] "BUF"
## [1] "N.J"
## [1] "NYI"
## [1] "COL"
## [1] "TOR"
## [1] "CHI"
## [1] "ARI"
## [1] "WSH"
## [1] "L.A"
## [1] "T.B" "TOR"
## [1] "EDM"
## [1] "PHI"
## [1] "MTL"
## [1] "MTL"
## [1] "COL"
## [1] "ANA"
## [1] "CHI"
## [1] "MIN"
## [1] "OTT"
## [1] "STL"
## [1] "NYR" "OTT"
## [1] "EDM" "MTL"
## [1] "VAN"
## [1] "L.A"
## [1] "CAR"
## [1] "CGY"
## [1] "MTL"
## [1] "ANA"
## [1] "COL"
## [1] "COL"
## [1] "PIT"
## [1] "CGY"
## [1] "VAN"
## [1] "ANA"
## [1] "OTT"
## [1] "T.B"
## [1] "CGY" "OTT"
## [1] "WPG"
## [1] "NSH"
## [1] "TOR"
## [1] "PIT"
## [1] "S.J"
## [1] "COL"
## [1] "ARI"
## [1] "EDM"
## [1] "MIN"
## [1] "NYI"
## [1] "TOR"
## [1] "COL"
## [1] "DAL"
## [1] "CHI" "DET"
## [1] "FLA"
## [1] "EDM"
## [1] "CGY"
## [1] "PIT"
## [1] "ANA"
## [1] "EDM"
## [1] "EDM"
## [1] "N.J"
## [1] "VAN"
## [1] "WPG"
## [1] "S.J"
## [1] "L.A"
## [1] "WPG"
## [1] "NYR"
## [1] "WPG"
## [1] "STL"
## [1] "WPG"
## [1] "NYR"
## [1] "FLA"
## [1] "ARI" "MIN"
## [1] "T.B"
## [1] "BOS"
## [1] "S.J" "VAN"
## [1] "ARI"
## [1] "COL"
## [1] "WPG"
## [1] "DET"
## [1] "BUF"
## [1] "FLA"
## [1] "BUF"
## [1] "FLA"
## [1] "DET"
## [1] "DAL"
## [1] "COL"
## [1] "CBJ" "N.J"
## [1] "DET"
## [1] "CGY"
## [1] "NSH"
## [1] "PIT" "TOR"
## [1] "EDM"
## [1] "MIN"
## [1] "ANA"
## [1] "CHI"
## [1] "T.B"
## [1] "CHI" "DAL"
## [1] "MTL"
## [1] "NYR"
## [1] "DET"
## [1] "STL"
## [1] "L.A"
## [1] "S.J" "VAN"
## [1] "MIN"
## [1] "NYR"
## [1] "OTT"
## [1] "TOR"
## [1] "MIN"
## [1] "ANA" "DAL"
## [1] "BUF"
## [1] "BOS"
## [1] "ARI"
## [1] "N.J"
## [1] "WSH"
## [1] "T.B"
## [1] "BUF"
## [1] "CAR"
## [1] "NYI"
## [1] "PHI"
## [1] "CGY"
## [1] "NYI"
## [1] "NYR"
## [1] "WSH"
## [1] "WPG"
## [1] "NYI"
## [1] "OTT"
## [1] "OTT"
## [1] "CBJ"
## [1] "CAR"
## [1] "MIN"
## [1] "FLA"
## [1] "VAN"
## [1] "VAN"
## [1] "CBJ"
## [1] "S.J"
## [1] "WPG"
## [1] "WPG"
## [1] "CHI"
## [1] "PHI"
## [1] "ANA"
## [1] "WSH"
## [1] "PIT"
## [1] "DET"
## [1] "CAR"
## [1] "ARI"
## [1] "PIT"
## [1] "VAN"
## [1] "CBJ"
## [1] "VAN"
## [1] "CGY"
## [1] "CHI"
## [1] "OTT"
# Show the first six rows of the combined data after the team flags are set.
print(head(all_data, n = 6L))
## Born City Pr.St Cntry Nat Ht Wt DftYr DftRd Ovrl Hand Last.Name First.Name Position Team GP G A A1 A2 PTS X... E... PIM Shifts TOI TOIX TOI.GP TOI.GP.1 TOI. IPP. SH. SV.
## 1 97-01-30 Sainte-Marie QC CAN CAN 74 190 2015 1 18 L Chabot Thomas D OTT 1 0 0 0 0 0 -2 0.0 0 13 429 7.2 7.15 7.16 15.2 0.0 0.0 0.750
## 2 93-12-21 Ottawa ON CAN CAN 74 207 2012 1 15 R Ceci Cody D OTT 79 2 15 6 9 17 -11 -10.4 20 2418 109992 1826.2 23.20 23.17 39.0 30.4 7.4 0.915
## 3 88-04-16 St. Paul MN USA USA 72 218 2006 1 7 R Okposo Kyle RW BUF 65 19 26 13 13 45 -7 -1.4 24 1443 73983 1229.2 18.97 18.95 33.1 63.4 9.7 0.934
## 4 92-01-07 Ottawa ON CAN CAN 77 220 2010 1 3 R Gudbranson Erik D VAN 30 1 5 5 0 6 -14 -5.3 18 765 36603 607.9 20.33 20.31 36.1 37.5 6.2 0.897
## 5 94-03-29 Toronto ON CAN CAN 76 217 2012 1 16 R Wilson Tom RW WSH 82 7 12 4 8 19 9 4.1 133 1453 63592 1059.7 12.93 12.93 23.5 61.3 7.8 0.917
## 6 79-05-23 Strathroy ON CAN CAN 70 192 1997 6 156 L Campbell Brian D CHI 80 5 12 6 6 17 12 0.7 24 1896 88462 1473.7 18.43 18.43 32.4 23.9 9.8 0.936
## PDO F.60 A.60 Pct. Diff Diff.60 iCF iCF.1 iFF iSF iSF.1 iSF.2 ixG iSCF iRB iRS iDS sDist sDist.1 Pass iHF iHF.1 iHA iHDf iMiss iGVA iTKA iBLK iGVA.1 iTKA.1 iBLK.1 BLK. iFOW iFOL iFOW.1 iFOL.1
## 1 750 0.00 16.74 0.0 -2 -16.74 2 2 2 1 1 1 0.0 0 0 0 0 43.0 49.3 0.0 1 1 0 1 1 1 0 0 1 0 0 0.0 0 0 0 0
## 2 989 1.84 2.79 39.7 -29 -0.95 287 287 197 143 143 143 6.1 7 7 9 16 52.4 46.3 138.1 111 111 154 -43 54 74 22 159 74 22 159 8.0 1 0 1 0
## 3 1031 3.47 1.95 64.0 31 1.51 283 283 212 155 156 156 17.4 64 16 20 36 28.4 26.3 196.8 53 53 68 -15 57 36 26 25 36 26 25 2.4 54 45 54 45
## 4 959 1.58 3.45 31.4 -19 -1.88 88 88 55 40 40 40 1.4 2 1 4 5 55.1 51.0 153.0 66 66 66 0 15 23 4 44 23 4 44 7.3 0 0 0 0
## 5 995 1.76 2.32 43.1 -10 -0.57 166 166 118 95 95 95 9.3 35 8 10 18 30.9 26.4 96.3 239 239 134 105 23 21 36 44 21 36 44 4.4 3 7 3 7
## 6 1033 2.89 1.91 60.2 24 0.98 171 171 110 75 74 75 4.5 7 2 3 5 46.1 41.9 95.7 43 43 157 -114 35 59 11 83 59 11 83 6.5 0 0 0 0
## FO. X.FOT dzFOW dzFOL nzFOW nzFOL ozFOW ozFOL FOW.Up FOL.Up FOW.Down FOL.Down FOW.Close FOL.Close OTG X1G GWG ENG PSG PSA G.Bkhd G.Dflct G.Slap G.Snap G.Tip G.Wrap G.Wrst CBar Post Over Wide
## 1 0.0 0.0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1
## 2 100.0 0.1 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 1 2 51
## 3 54.5 7.4 9 6 10 11 35 28 13 10 21 16 37 33 1 5 2 0 0 0 5 2 0 3 0 0 9 0 2 4 51
## 4 0.0 0.0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 15
## 5 30.0 1.0 1 1 0 2 2 4 3 1 0 3 2 6 0 2 0 1 0 0 0 0 0 0 1 0 6 0 2 1 20
## 6 0.0 0.0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 2 0 0 0 2 0 1 6 28
## S.Bkhd S.Dflct S.Slap S.Snap S.Tip S.Wrap S.Wrst iPenT iPenD iPENT iPEND iPenDf NPD Min Maj Match Misc Game CF CA FF FA SF SA xGF xGA SCF SCA GF GA RBF RBA RSF RSA DSF DSA FOW FOL HF
## 1 0 0 1 0 0 0 0 0 0 0 0 0 0.0 0 0 0 0 0 9 12 8 10 5 8 0.5 0.9 2 3 0 2 1 1 0 1 1 2 4 5 1
## 2 2 0 49 12 0 1 79 10 6 10 5 -4 2.2 10 0 0 0 0 1433 1992 1038 1423 757 997 62.0 88.8 197 280 56 85 68 82 79 94 147 176 949 939 749
## 3 19 3 3 20 8 2 101 12 10 11 8 -2 -0.5 12 0 0 0 0 1301 1051 986 826 734 606 70.8 46.4 235 133 71 40 60 34 76 52 136 86 739 600 340
## 4 0 0 18 3 0 0 19 6 7 6 6 1 2.7 4 2 0 0 0 460 605 339 467 259 340 22.0 33.6 80 130 16 35 27 20 29 32 56 52 324 328 198
## 5 7 2 3 10 11 1 61 44 33 40 29 -11 -14.3 33 9 0 1 1 766 992 546 720 398 495 33.5 47.5 124 159 31 41 30 37 43 53 73 90 528 490 512
## 6 2 0 32 9 1 0 30 12 11 12 8 -1 5.4 12 0 0 0 0 1356 1281 971 972 728 730 62.9 59.9 210 197 71 47 30 56 58 85 88 141 570 667 348
## HA GVA TKA PENT PEND OPS DPS PS OTOI Grit DAP Pace GS GS.G ANA FLA N.J VAN ARI CGY MIN NYR TOR BOS WPG BUF CAR OTT PIT STL CBJ DAL CHI COL MTL NSH S.J DET EDM L.A T.B NYI PHI WSH
## 1 2 1 1 1 1 0.0 -0.2 -0.2 40.03 1 0.0 175.7 -0.4 -0.38 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 2 671 284 197 104 98 -0.2 3.4 3.2 2850.59 290 13.3 112.5 14.1 0.18 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 3 351 168 129 56 70 3.7 1.3 5.0 2486.75 102 6.6 114.8 36.8 0.57 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 4 197 86 59 26 22 0.0 0.4 0.5 1074.41 130 17.5 105.1 5.9 0.20 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 5 422 157 126 88 68 -0.1 1.4 1.3 3459.09 425 8.3 99.5 21.8 0.27 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1
## 6 707 223 168 76 60 0.6 3.7 4.3 3069.81 150 4.5 107.4 20.8 0.26 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0
Split the categorical Position column into one 0/1 indicator column per position; a player listed with several positions (for example "C/LW") gets a 1 in each of them.
# Collect the distinct single positions that occur in Position. A player can
# hold several positions at once, written as e.g. "C/LW", so every label is
# split on "/" before de-duplicating.
#
# levels() only returns something while Position is a factor. read.csv()
# yields plain character columns since R 4.0.0, where levels() is NULL and no
# position would be found. In that case fall back to the sorted unique values,
# which is the same ordering factor() uses when it builds its levels.
position_labels <- if (is.factor(all_data$Position)) {
  levels(all_data$Position)
} else {
  sort(unique(as.character(all_data$Position)))
}
pos <- unique(unlist(strsplit(position_labels, "/")))
print(pos)
## [1] "C" "D" "LW" "RW"
# Add one numeric 0/1 indicator column per position in `pos`. A player listed
# with several positions (separated by "/") gets a 1 in each of them.
#
# Every player's Position is split once up front and each indicator column is
# then filled in a single assignment. This replaces the former
# `for (i in 1:length(...))` loop, which wrote into the data frame one cell at
# a time and iterated over c(1, 0) when the data frame had no rows.
position_sets <- strsplit(as.character(all_data$Position), "/")
for (position in pos) {
  all_data[[position]] <- as.numeric(
    vapply(position_sets, function(held) position %in% held, logical(1))
  )
}
print(head(all_data))
## Born City Pr.St Cntry Nat Ht Wt DftYr DftRd Ovrl Hand Last.Name First.Name Position Team GP G A A1 A2 PTS X... E... PIM Shifts TOI TOIX TOI.GP TOI.GP.1 TOI. IPP. SH. SV.
## 1 97-01-30 Sainte-Marie QC CAN CAN 74 190 2015 1 18 L Chabot Thomas D OTT 1 0 0 0 0 0 -2 0.0 0 13 429 7.2 7.15 7.16 15.2 0.0 0.0 0.750
## 2 93-12-21 Ottawa ON CAN CAN 74 207 2012 1 15 R Ceci Cody D OTT 79 2 15 6 9 17 -11 -10.4 20 2418 109992 1826.2 23.20 23.17 39.0 30.4 7.4 0.915
## 3 88-04-16 St. Paul MN USA USA 72 218 2006 1 7 R Okposo Kyle RW BUF 65 19 26 13 13 45 -7 -1.4 24 1443 73983 1229.2 18.97 18.95 33.1 63.4 9.7 0.934
## 4 92-01-07 Ottawa ON CAN CAN 77 220 2010 1 3 R Gudbranson Erik D VAN 30 1 5 5 0 6 -14 -5.3 18 765 36603 607.9 20.33 20.31 36.1 37.5 6.2 0.897
## 5 94-03-29 Toronto ON CAN CAN 76 217 2012 1 16 R Wilson Tom RW WSH 82 7 12 4 8 19 9 4.1 133 1453 63592 1059.7 12.93 12.93 23.5 61.3 7.8 0.917
## 6 79-05-23 Strathroy ON CAN CAN 70 192 1997 6 156 L Campbell Brian D CHI 80 5 12 6 6 17 12 0.7 24 1896 88462 1473.7 18.43 18.43 32.4 23.9 9.8 0.936
## PDO F.60 A.60 Pct. Diff Diff.60 iCF iCF.1 iFF iSF iSF.1 iSF.2 ixG iSCF iRB iRS iDS sDist sDist.1 Pass iHF iHF.1 iHA iHDf iMiss iGVA iTKA iBLK iGVA.1 iTKA.1 iBLK.1 BLK. iFOW iFOL iFOW.1 iFOL.1
## 1 750 0.00 16.74 0.0 -2 -16.74 2 2 2 1 1 1 0.0 0 0 0 0 43.0 49.3 0.0 1 1 0 1 1 1 0 0 1 0 0 0.0 0 0 0 0
## 2 989 1.84 2.79 39.7 -29 -0.95 287 287 197 143 143 143 6.1 7 7 9 16 52.4 46.3 138.1 111 111 154 -43 54 74 22 159 74 22 159 8.0 1 0 1 0
## 3 1031 3.47 1.95 64.0 31 1.51 283 283 212 155 156 156 17.4 64 16 20 36 28.4 26.3 196.8 53 53 68 -15 57 36 26 25 36 26 25 2.4 54 45 54 45
## 4 959 1.58 3.45 31.4 -19 -1.88 88 88 55 40 40 40 1.4 2 1 4 5 55.1 51.0 153.0 66 66 66 0 15 23 4 44 23 4 44 7.3 0 0 0 0
## 5 995 1.76 2.32 43.1 -10 -0.57 166 166 118 95 95 95 9.3 35 8 10 18 30.9 26.4 96.3 239 239 134 105 23 21 36 44 21 36 44 4.4 3 7 3 7
## 6 1033 2.89 1.91 60.2 24 0.98 171 171 110 75 74 75 4.5 7 2 3 5 46.1 41.9 95.7 43 43 157 -114 35 59 11 83 59 11 83 6.5 0 0 0 0
## FO. X.FOT dzFOW dzFOL nzFOW nzFOL ozFOW ozFOL FOW.Up FOL.Up FOW.Down FOL.Down FOW.Close FOL.Close OTG X1G GWG ENG PSG PSA G.Bkhd G.Dflct G.Slap G.Snap G.Tip G.Wrap G.Wrst CBar Post Over Wide
## 1 0.0 0.0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1
## 2 100.0 0.1 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 1 2 51
## 3 54.5 7.4 9 6 10 11 35 28 13 10 21 16 37 33 1 5 2 0 0 0 5 2 0 3 0 0 9 0 2 4 51
## 4 0.0 0.0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 15
## 5 30.0 1.0 1 1 0 2 2 4 3 1 0 3 2 6 0 2 0 1 0 0 0 0 0 0 1 0 6 0 2 1 20
## 6 0.0 0.0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 2 0 0 0 2 0 1 6 28
## S.Bkhd S.Dflct S.Slap S.Snap S.Tip S.Wrap S.Wrst iPenT iPenD iPENT iPEND iPenDf NPD Min Maj Match Misc Game CF CA FF FA SF SA xGF xGA SCF SCA GF GA RBF RBA RSF RSA DSF DSA FOW FOL HF
## 1 0 0 1 0 0 0 0 0 0 0 0 0 0.0 0 0 0 0 0 9 12 8 10 5 8 0.5 0.9 2 3 0 2 1 1 0 1 1 2 4 5 1
## 2 2 0 49 12 0 1 79 10 6 10 5 -4 2.2 10 0 0 0 0 1433 1992 1038 1423 757 997 62.0 88.8 197 280 56 85 68 82 79 94 147 176 949 939 749
## 3 19 3 3 20 8 2 101 12 10 11 8 -2 -0.5 12 0 0 0 0 1301 1051 986 826 734 606 70.8 46.4 235 133 71 40 60 34 76 52 136 86 739 600 340
## 4 0 0 18 3 0 0 19 6 7 6 6 1 2.7 4 2 0 0 0 460 605 339 467 259 340 22.0 33.6 80 130 16 35 27 20 29 32 56 52 324 328 198
## 5 7 2 3 10 11 1 61 44 33 40 29 -11 -14.3 33 9 0 1 1 766 992 546 720 398 495 33.5 47.5 124 159 31 41 30 37 43 53 73 90 528 490 512
## 6 2 0 32 9 1 0 30 12 11 12 8 -1 5.4 12 0 0 0 0 1356 1281 971 972 728 730 62.9 59.9 210 197 71 47 30 56 58 85 88 141 570 667 348
## HA GVA TKA PENT PEND OPS DPS PS OTOI Grit DAP Pace GS GS.G ANA FLA N.J VAN ARI CGY MIN NYR TOR BOS WPG BUF CAR OTT PIT STL CBJ DAL CHI COL MTL NSH S.J DET EDM L.A T.B NYI PHI WSH C D
## 1 2 1 1 1 1 0.0 -0.2 -0.2 40.03 1 0.0 175.7 -0.4 -0.38 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1
## 2 671 284 197 104 98 -0.2 3.4 3.2 2850.59 290 13.3 112.5 14.1 0.18 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1
## 3 351 168 129 56 70 3.7 1.3 5.0 2486.75 102 6.6 114.8 36.8 0.57 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 4 197 86 59 26 22 0.0 0.4 0.5 1074.41 130 17.5 105.1 5.9 0.20 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1
## 5 422 157 126 88 68 -0.1 1.4 1.3 3459.09 425 8.3 99.5 21.8 0.27 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0
## 6 707 223 168 76 60 0.6 3.7 4.3 3069.81 150 4.5 107.4 20.8 0.26 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1
## LW RW
## 1 0 0
## 2 0 0
## 3 0 1
## 4 0 0
## 5 0 1
## 6 0 0
We need to split the date of birth into three separate columns: year of birth, month of birth, and day of birth.
# Split the Born column (formatted "yy-mm-dd") into three numeric columns:
# birth_year, birth_month and birth_day.
library(stringr)
# One row per player; the columns are the two-digit year, the month and the
# day, all still as text.
bday_parts <- str_split_fixed(all_data$Born, "-", 3)
# Expand the two-digit year to four digits: values below 10 are taken to be
# 20xx (players born in the 21st century), everything else 19xx.
# This is done for the whole column at once instead of appending to a vector
# inside a loop, which copied the vector on every iteration and stopped with
# an error as soon as one year could not be parsed as a number.
two_digit_year <- bday_parts[, 1]
century <- ifelse(as.numeric(two_digit_year) < 10, "20", "19")
birth_year <- paste0(century, two_digit_year)
all_data$birth_year <- as.numeric(birth_year) # four-digit YEAR column
all_data$birth_month <- as.numeric(bday_parts[, 2]) # MONTH column
all_data$birth_day <- as.numeric(bday_parts[, 3]) # DAY column
head(all_data)
## Born City Pr.St Cntry Nat Ht Wt DftYr DftRd Ovrl Hand Last.Name First.Name Position Team GP G A A1 A2 PTS X... E... PIM Shifts TOI TOIX TOI.GP TOI.GP.1 TOI. IPP. SH. SV.
## 1 97-01-30 Sainte-Marie QC CAN CAN 74 190 2015 1 18 L Chabot Thomas D OTT 1 0 0 0 0 0 -2 0.0 0 13 429 7.2 7.15 7.16 15.2 0.0 0.0 0.750
## 2 93-12-21 Ottawa ON CAN CAN 74 207 2012 1 15 R Ceci Cody D OTT 79 2 15 6 9 17 -11 -10.4 20 2418 109992 1826.2 23.20 23.17 39.0 30.4 7.4 0.915
## 3 88-04-16 St. Paul MN USA USA 72 218 2006 1 7 R Okposo Kyle RW BUF 65 19 26 13 13 45 -7 -1.4 24 1443 73983 1229.2 18.97 18.95 33.1 63.4 9.7 0.934
## 4 92-01-07 Ottawa ON CAN CAN 77 220 2010 1 3 R Gudbranson Erik D VAN 30 1 5 5 0 6 -14 -5.3 18 765 36603 607.9 20.33 20.31 36.1 37.5 6.2 0.897
## 5 94-03-29 Toronto ON CAN CAN 76 217 2012 1 16 R Wilson Tom RW WSH 82 7 12 4 8 19 9 4.1 133 1453 63592 1059.7 12.93 12.93 23.5 61.3 7.8 0.917
## 6 79-05-23 Strathroy ON CAN CAN 70 192 1997 6 156 L Campbell Brian D CHI 80 5 12 6 6 17 12 0.7 24 1896 88462 1473.7 18.43 18.43 32.4 23.9 9.8 0.936
## PDO F.60 A.60 Pct. Diff Diff.60 iCF iCF.1 iFF iSF iSF.1 iSF.2 ixG iSCF iRB iRS iDS sDist sDist.1 Pass iHF iHF.1 iHA iHDf iMiss iGVA iTKA iBLK iGVA.1 iTKA.1 iBLK.1 BLK. iFOW iFOL iFOW.1 iFOL.1
## 1 750 0.00 16.74 0.0 -2 -16.74 2 2 2 1 1 1 0.0 0 0 0 0 43.0 49.3 0.0 1 1 0 1 1 1 0 0 1 0 0 0.0 0 0 0 0
## 2 989 1.84 2.79 39.7 -29 -0.95 287 287 197 143 143 143 6.1 7 7 9 16 52.4 46.3 138.1 111 111 154 -43 54 74 22 159 74 22 159 8.0 1 0 1 0
## 3 1031 3.47 1.95 64.0 31 1.51 283 283 212 155 156 156 17.4 64 16 20 36 28.4 26.3 196.8 53 53 68 -15 57 36 26 25 36 26 25 2.4 54 45 54 45
## 4 959 1.58 3.45 31.4 -19 -1.88 88 88 55 40 40 40 1.4 2 1 4 5 55.1 51.0 153.0 66 66 66 0 15 23 4 44 23 4 44 7.3 0 0 0 0
## 5 995 1.76 2.32 43.1 -10 -0.57 166 166 118 95 95 95 9.3 35 8 10 18 30.9 26.4 96.3 239 239 134 105 23 21 36 44 21 36 44 4.4 3 7 3 7
## 6 1033 2.89 1.91 60.2 24 0.98 171 171 110 75 74 75 4.5 7 2 3 5 46.1 41.9 95.7 43 43 157 -114 35 59 11 83 59 11 83 6.5 0 0 0 0
## FO. X.FOT dzFOW dzFOL nzFOW nzFOL ozFOW ozFOL FOW.Up FOL.Up FOW.Down FOL.Down FOW.Close FOL.Close OTG X1G GWG ENG PSG PSA G.Bkhd G.Dflct G.Slap G.Snap G.Tip G.Wrap G.Wrst CBar Post Over Wide
## 1 0.0 0.0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1
## 2 100.0 0.1 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 1 2 51
## 3 54.5 7.4 9 6 10 11 35 28 13 10 21 16 37 33 1 5 2 0 0 0 5 2 0 3 0 0 9 0 2 4 51
## 4 0.0 0.0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 15
## 5 30.0 1.0 1 1 0 2 2 4 3 1 0 3 2 6 0 2 0 1 0 0 0 0 0 0 1 0 6 0 2 1 20
## 6 0.0 0.0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 2 0 0 0 2 0 1 6 28
## S.Bkhd S.Dflct S.Slap S.Snap S.Tip S.Wrap S.Wrst iPenT iPenD iPENT iPEND iPenDf NPD Min Maj Match Misc Game CF CA FF FA SF SA xGF xGA SCF SCA GF GA RBF RBA RSF RSA DSF DSA FOW FOL HF
## 1 0 0 1 0 0 0 0 0 0 0 0 0 0.0 0 0 0 0 0 9 12 8 10 5 8 0.5 0.9 2 3 0 2 1 1 0 1 1 2 4 5 1
## 2 2 0 49 12 0 1 79 10 6 10 5 -4 2.2 10 0 0 0 0 1433 1992 1038 1423 757 997 62.0 88.8 197 280 56 85 68 82 79 94 147 176 949 939 749
## 3 19 3 3 20 8 2 101 12 10 11 8 -2 -0.5 12 0 0 0 0 1301 1051 986 826 734 606 70.8 46.4 235 133 71 40 60 34 76 52 136 86 739 600 340
## 4 0 0 18 3 0 0 19 6 7 6 6 1 2.7 4 2 0 0 0 460 605 339 467 259 340 22.0 33.6 80 130 16 35 27 20 29 32 56 52 324 328 198
## 5 7 2 3 10 11 1 61 44 33 40 29 -11 -14.3 33 9 0 1 1 766 992 546 720 398 495 33.5 47.5 124 159 31 41 30 37 43 53 73 90 528 490 512
## 6 2 0 32 9 1 0 30 12 11 12 8 -1 5.4 12 0 0 0 0 1356 1281 971 972 728 730 62.9 59.9 210 197 71 47 30 56 58 85 88 141 570 667 348
## HA GVA TKA PENT PEND OPS DPS PS OTOI Grit DAP Pace GS GS.G ANA FLA N.J VAN ARI CGY MIN NYR TOR BOS WPG BUF CAR OTT PIT STL CBJ DAL CHI COL MTL NSH S.J DET EDM L.A T.B NYI PHI WSH C D
## 1 2 1 1 1 1 0.0 -0.2 -0.2 40.03 1 0.0 175.7 -0.4 -0.38 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1
## 2 671 284 197 104 98 -0.2 3.4 3.2 2850.59 290 13.3 112.5 14.1 0.18 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1
## 3 351 168 129 56 70 3.7 1.3 5.0 2486.75 102 6.6 114.8 36.8 0.57 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 4 197 86 59 26 22 0.0 0.4 0.5 1074.41 130 17.5 105.1 5.9 0.20 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1
## 5 422 157 126 88 68 -0.1 1.4 1.3 3459.09 425 8.3 99.5 21.8 0.27 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0
## 6 707 223 168 76 60 0.6 3.7 4.3 3069.81 150 4.5 107.4 20.8 0.26 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1
## LW RW birth_year birth_month birth_day
## 1 0 0 1997 1 30
## 2 0 0 1993 12 21
## 3 0 1 1988 4 16
## 4 0 0 1992 1 7
## 5 0 1 1994 3 29
## 6 0 0 1979 5 23
# Expand Cntry (country of birth) and Nat (nationality) into one 0/1
# indicator column per category, named born_<code> and nation_<code>.
# The two columns do not share the same set of categories: for example EST
# only occurs as a birth country (the original "Uncle Leo" note).

# Return `df` with one numeric 0/1 column per category of `values`, named
# "<prefix>_<category>".
#   df      data frame to extend
#   values  factor or character vector with one entry per row of `df`
#   prefix  text placed in front of each category to form the column name
# Categories are the factor levels, or the sorted unique values when `values`
# is a plain character vector (the read.csv() default since R 4.0.0).
# Missing entries get 0 in every indicator column.
add_indicator_columns <- function(df, values, prefix) {
  categories <- if (is.factor(values)) levels(values) else sort(unique(values))
  labels <- as.character(values)
  for (category in categories) {
    column_name <- paste(prefix, category, sep = "_")
    df[[column_name]] <- as.numeric(labels %in% category)
  }
  df
}

# Country of birth -> born_* columns.
all_data <- add_indicator_columns(all_data, all_data$Cntry, "born")
# Nationality -> nation_* columns.
all_data <- add_indicator_columns(all_data, all_data$Nat, "nation")
head(all_data)
## Born City Pr.St Cntry Nat Ht Wt DftYr DftRd Ovrl Hand Last.Name First.Name Position Team GP G A A1 A2 PTS X... E... PIM Shifts TOI TOIX TOI.GP TOI.GP.1 TOI. IPP. SH. SV.
## 1 97-01-30 Sainte-Marie QC CAN CAN 74 190 2015 1 18 L Chabot Thomas D OTT 1 0 0 0 0 0 -2 0.0 0 13 429 7.2 7.15 7.16 15.2 0.0 0.0 0.750
## 2 93-12-21 Ottawa ON CAN CAN 74 207 2012 1 15 R Ceci Cody D OTT 79 2 15 6 9 17 -11 -10.4 20 2418 109992 1826.2 23.20 23.17 39.0 30.4 7.4 0.915
## 3 88-04-16 St. Paul MN USA USA 72 218 2006 1 7 R Okposo Kyle RW BUF 65 19 26 13 13 45 -7 -1.4 24 1443 73983 1229.2 18.97 18.95 33.1 63.4 9.7 0.934
## 4 92-01-07 Ottawa ON CAN CAN 77 220 2010 1 3 R Gudbranson Erik D VAN 30 1 5 5 0 6 -14 -5.3 18 765 36603 607.9 20.33 20.31 36.1 37.5 6.2 0.897
## 5 94-03-29 Toronto ON CAN CAN 76 217 2012 1 16 R Wilson Tom RW WSH 82 7 12 4 8 19 9 4.1 133 1453 63592 1059.7 12.93 12.93 23.5 61.3 7.8 0.917
## 6 79-05-23 Strathroy ON CAN CAN 70 192 1997 6 156 L Campbell Brian D CHI 80 5 12 6 6 17 12 0.7 24 1896 88462 1473.7 18.43 18.43 32.4 23.9 9.8 0.936
## PDO F.60 A.60 Pct. Diff Diff.60 iCF iCF.1 iFF iSF iSF.1 iSF.2 ixG iSCF iRB iRS iDS sDist sDist.1 Pass iHF iHF.1 iHA iHDf iMiss iGVA iTKA iBLK iGVA.1 iTKA.1 iBLK.1 BLK. iFOW iFOL iFOW.1 iFOL.1
## 1 750 0.00 16.74 0.0 -2 -16.74 2 2 2 1 1 1 0.0 0 0 0 0 43.0 49.3 0.0 1 1 0 1 1 1 0 0 1 0 0 0.0 0 0 0 0
## 2 989 1.84 2.79 39.7 -29 -0.95 287 287 197 143 143 143 6.1 7 7 9 16 52.4 46.3 138.1 111 111 154 -43 54 74 22 159 74 22 159 8.0 1 0 1 0
## 3 1031 3.47 1.95 64.0 31 1.51 283 283 212 155 156 156 17.4 64 16 20 36 28.4 26.3 196.8 53 53 68 -15 57 36 26 25 36 26 25 2.4 54 45 54 45
## 4 959 1.58 3.45 31.4 -19 -1.88 88 88 55 40 40 40 1.4 2 1 4 5 55.1 51.0 153.0 66 66 66 0 15 23 4 44 23 4 44 7.3 0 0 0 0
## 5 995 1.76 2.32 43.1 -10 -0.57 166 166 118 95 95 95 9.3 35 8 10 18 30.9 26.4 96.3 239 239 134 105 23 21 36 44 21 36 44 4.4 3 7 3 7
## 6 1033 2.89 1.91 60.2 24 0.98 171 171 110 75 74 75 4.5 7 2 3 5 46.1 41.9 95.7 43 43 157 -114 35 59 11 83 59 11 83 6.5 0 0 0 0
## FO. X.FOT dzFOW dzFOL nzFOW nzFOL ozFOW ozFOL FOW.Up FOL.Up FOW.Down FOL.Down FOW.Close FOL.Close OTG X1G GWG ENG PSG PSA G.Bkhd G.Dflct G.Slap G.Snap G.Tip G.Wrap G.Wrst CBar Post Over Wide
## 1 0.0 0.0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1
## 2 100.0 0.1 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 1 2 51
## 3 54.5 7.4 9 6 10 11 35 28 13 10 21 16 37 33 1 5 2 0 0 0 5 2 0 3 0 0 9 0 2 4 51
## 4 0.0 0.0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 15
## 5 30.0 1.0 1 1 0 2 2 4 3 1 0 3 2 6 0 2 0 1 0 0 0 0 0 0 1 0 6 0 2 1 20
## 6 0.0 0.0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 2 0 0 0 2 0 1 6 28
## S.Bkhd S.Dflct S.Slap S.Snap S.Tip S.Wrap S.Wrst iPenT iPenD iPENT iPEND iPenDf NPD Min Maj Match Misc Game CF CA FF FA SF SA xGF xGA SCF SCA GF GA RBF RBA RSF RSA DSF DSA FOW FOL HF
## 1 0 0 1 0 0 0 0 0 0 0 0 0 0.0 0 0 0 0 0 9 12 8 10 5 8 0.5 0.9 2 3 0 2 1 1 0 1 1 2 4 5 1
## 2 2 0 49 12 0 1 79 10 6 10 5 -4 2.2 10 0 0 0 0 1433 1992 1038 1423 757 997 62.0 88.8 197 280 56 85 68 82 79 94 147 176 949 939 749
## 3 19 3 3 20 8 2 101 12 10 11 8 -2 -0.5 12 0 0 0 0 1301 1051 986 826 734 606 70.8 46.4 235 133 71 40 60 34 76 52 136 86 739 600 340
## 4 0 0 18 3 0 0 19 6 7 6 6 1 2.7 4 2 0 0 0 460 605 339 467 259 340 22.0 33.6 80 130 16 35 27 20 29 32 56 52 324 328 198
## 5 7 2 3 10 11 1 61 44 33 40 29 -11 -14.3 33 9 0 1 1 766 992 546 720 398 495 33.5 47.5 124 159 31 41 30 37 43 53 73 90 528 490 512
## 6 2 0 32 9 1 0 30 12 11 12 8 -1 5.4 12 0 0 0 0 1356 1281 971 972 728 730 62.9 59.9 210 197 71 47 30 56 58 85 88 141 570 667 348
## HA GVA TKA PENT PEND OPS DPS PS OTOI Grit DAP Pace GS GS.G ANA FLA N.J VAN ARI CGY MIN NYR TOR BOS WPG BUF CAR OTT PIT STL CBJ DAL CHI COL MTL NSH S.J DET EDM L.A T.B NYI PHI WSH C D
## 1 2 1 1 1 1 0.0 -0.2 -0.2 40.03 1 0.0 175.7 -0.4 -0.38 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1
## 2 671 284 197 104 98 -0.2 3.4 3.2 2850.59 290 13.3 112.5 14.1 0.18 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1
## 3 351 168 129 56 70 3.7 1.3 5.0 2486.75 102 6.6 114.8 36.8 0.57 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 4 197 86 59 26 22 0.0 0.4 0.5 1074.41 130 17.5 105.1 5.9 0.20 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1
## 5 422 157 126 88 68 -0.1 1.4 1.3 3459.09 425 8.3 99.5 21.8 0.27 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0
## 6 707 223 168 76 60 0.6 3.7 4.3 3069.81 150 4.5 107.4 20.8 0.26 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1
## LW RW birth_year birth_month birth_day born_AUT born_CAN born_CHE born_CZE born_DEU born_DNK born_EST born_FIN born_FRA born_GBR born_HRV born_ITA born_LVA born_NOR born_RUS born_SVK born_SWE
## 1 0 0 1997 1 30 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 2 0 0 1993 12 21 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 3 0 1 1988 4 16 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 4 0 0 1992 1 7 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 5 0 1 1994 3 29 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 6 0 0 1979 5 23 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## born_USA born_SVN nation_AUT nation_CAN nation_CHE nation_CZE nation_DEU nation_DNK nation_FIN nation_FRA nation_GBR nation_HRV nation_LVA nation_NOR nation_RUS nation_SVK nation_SWE nation_USA
## 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 2 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 3 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1
## 4 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 5 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 6 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## nation_SVN
## 1 0
## 2 0
## 3 0
## 4 0
## 5 0
## 6 0
# Remember which players have no draft round on record before the missing
# values are filled in further down.
all_data[["undrafted"]] <- is.na(all_data[["DftRd"]])
# Names of all columns that still contain at least one missing value.
has_missing <- vapply(all_data, anyNA, logical(1), USE.NAMES = FALSE)
all_missing_list <- colnames(all_data)[has_missing]
# Prints TRUE only when no column has any missing value left.
length(all_missing_list) == 0
## [1] FALSE
# If the check above printed TRUE there is nothing left to impute.
# Otherwise fill the gaps column by column with that column's median, taken
# over the whole data set.
# Iterating over the names themselves (rather than 1:length(...)) means an
# empty list is skipped cleanly instead of looping over the indices 1 and 0.
# NOTE: median() needs numeric data, so this assumes every column that has
# missing values is numeric.
for (column in all_missing_list) {
  column_values <- all_data[[column]]
  # Median of the observed values only; NAs are ignored.
  median_all <- median(column_values, na.rm = TRUE)
  print(median_all)
  # Replace the missing entries with the column's median.
  column_values[is.na(column_values)] <- median_all
  all_data[[column]] <- column_values
}
## [1] 2010
## [1] 2
## [1] 47
## [1] 866.5
## [1] 26.9
## [1] 54.5
## [1] 8.1
## [1] 0.917
## [1] 999
## [1] 2.3
## [1] 2.43
## [1] 0
## [1] 137
## [1] 105
## [1] 74
## [1] 5.3
## [1] 12
## [1] 4
## [1] 6
## [1] 10
## [1] 29.2
## [1] 109.4
## [1] 49
## [1] 59
## [1] -1
## [1] 19
## [1] 15
## [1] 27
## [1] 4.3
## [1] 2
## [1] 2
## [1] 0.9
## [1] 9
## [1] 6
## [1] 749
## [1] 790
## [1] 552
## [1] 591
## [1] 397
## [1] 425
## [1] 33.4
## [1] 36.1
## [1] 111
## [1] 117
## [1] 30
## [1] 35
## [1] 25
## [1] 27
## [1] 35
## [1] 36
## [1] 387
## [1] 397
## [1] 318
## [1] 319
## [1] 116
## [1] 85
## [1] 51
## [1] 45
## [1] 2436.61
## [1] 109.2
## [1] 15.7
## [1] 0.31
# Re-check for missing values after imputation.
# The per-column NA counts must be turned into a logical mask with `> 0`.
# Indexing colnames() with the raw counts would pick columns by position
# (count 3 -> third column name) instead of the columns that contain NAs.
all_missing_list <- colnames(all_data)[colSums(is.na(all_data)) > 0]
length(all_missing_list) == 0 # Flag to check NA values
## [1] TRUE
How many players from each country?
# Horizontal bar chart of the number of players per nationality, with the
# counts sorted in decreasing order.
nationality_counts <- sort(table(all_data$Nat), decreasing = TRUE)
barplot(
  nationality_counts,
  horiz = TRUE,
  las = 1,
  col = c("red", "blue4", "blue", "red3", "skyblue"),
  main = "Number of NHL players from each country",
  ylab = "Country",
  xlab = "count"
)
# Number of players per birth year.
table(all_data$birth_year)
##
## 1972 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001
## 1 2 2 6 11 12 9 21 22 27 29 43 44 59 63 59 77 71 77 74 51 50 34 15 11 4
# Histogram of the players' birth years.
hist(
  all_data[["birth_year"]],
  breaks = 28,
  col = "skyblue",
  xlab = "Year of birth",
  main = "Distribution of NHL players by birth year (2016/2017 season)\nA.K.A. Jaromir Jagr the ageless one"
)
# Five-number summary and mean of the salaries in the training data.
summary(train.df[["Salary"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 575000 742500 925000 2264509 3500000 13800000
# Histogram of the salaries in the training data.
hist(
  train.df$Salary,
  breaks = 52,
  col = "salmon",
  xlab = "Salary",
  ylab = "Number of players",
  main = "NHL Salary Distribution: 2016/2017"
)

# Simple linear fit of salary on goals scored; fitted once and drawn on both
# scatterplots below.
salary_by_goals <- lm(Salary ~ G, data = train.df)

# Salary against goals with the fitted line in blue.
plot(
  train.df$G, train.df$Salary,
  xlab = "No. Goals",
  pch = 20,
  ylab = "Money Earned"
)
abline(salary_by_goals, col = "blue")

# Same scatterplot with every point labelled by the player's last name, so
# the points far from the red fitted line can be identified.
plot(
  train.df$G, train.df$Salary,
  pch = 20,
  xlab = "goals scored",
  ylab = "money earned",
  main = "Who are the outliers?"
)
abline(salary_by_goals, col = "red")
text(
  train.df$G, train.df$Salary,
  labels = train.df$Last.Name,
  cex = 0.7,
  pos = 3
)
# Split the combined data back into its two parts by row position:
# rows 1-612 form the training sets, rows 613-874 the test set.
train_rows <- 1:612
test_rows <- 613:874
# Training predictors without the target column.
train.df2 <- all_data[train_rows, ]
# Training predictors plus the Salary target taken from the raw training data.
train.final <- train.df2
train.final$Salary <- train.df$Salary
# Test predictors; no Salary column is attached here.
test.final <- all_data[test_rows, ]
Before drawing the 3D plot, let us prepare a new dataset from all_data so that we have a clean training data set.
# Scatterplot matrix of a handful of player attributes from the training rows.
pairs(
  ~ Born + Ht + Wt + DftRd + Position + Team + GP + G,
  data = train.df2,
  main = "Simple Scatterplot Matrix"
)
# Compact overview of every column in the final training set.
str(train.final)
## 'data.frame': 612 obs. of 228 variables:
## $ Born : Factor w/ 806 levels "72-02-15","77-03-18",..: 555 435 169 326 458 8 273 423 357 410 ...
## $ City : Factor w/ 490 levels "Aalborg","Albany",..: 269 225 305 225 324 312 310 324 277 235 ...
## $ Pr.St : Factor w/ 39 levels "INT","AB","AK",..: 32 29 17 29 29 29 1 29 29 29 ...
## $ Cntry : Factor w/ 19 levels "AUT","CAN","CHE",..: 2 2 18 2 2 2 17 2 2 2 ...
## $ Nat : Factor w/ 17 levels "AUT","CAN","CHE",..: 2 2 16 2 2 2 15 2 2 2 ...
## $ Ht : int 74 74 72 77 76 70 71 70 72 68 ...
## $ Wt : int 190 207 218 220 217 192 185 183 214 178 ...
## $ DftYr : int 2015 2012 2006 2010 2012 1997 2009 2010 2010 2011 ...
## $ DftRd : int 1 1 1 1 1 6 2 2 2 7 ...
## $ Ovrl : int 18 15 7 3 16 156 53 47 42 201 ...
## $ Hand : Factor w/ 2 levels "L","R": 1 2 2 2 2 1 1 2 2 1 ...
## $ Last.Name : Factor w/ 801 levels "Abdelkader","Aberg",..: 78 76 373 192 562 67 434 435 483 390 ...
## $ First.Name : Factor w/ 385 levels "A.J.","Aaron",..: 276 55 163 95 278 39 18 96 79 187 ...
## $ Position : Factor w/ 20 levels "C","C/D","C/LW",..: 7 7 14 7 14 7 14 10 16 1 ...
## $ Team : Factor w/ 78 levels "ANA","ANA/FLA",..: 52 52 12 66 68 22 66 12 45 63 ...
## $ GP : int 1 79 65 30 82 80 3 30 53 10 ...
## $ G : int 0 2 19 1 7 5 0 4 4 1 ...
## $ A : int 0 15 26 5 12 12 1 2 5 1 ...
## $ A1 : int 0 6 13 5 4 6 0 2 2 1 ...
## $ A2 : int 0 9 13 0 8 6 1 0 3 0 ...
## $ PTS : int 0 17 45 6 19 17 1 6 9 2 ...
## $ X... : int -2 -11 -7 -14 9 12 1 -7 -19 -3 ...
## $ E... : num 0 -10.4 -1.4 -5.3 4.1 0.7 0.3 -4.1 -7.9 -2.6 ...
## $ PIM : int 0 20 24 18 133 24 0 4 12 2 ...
## $ Shifts : int 13 2418 1443 765 1453 1896 40 506 975 159 ...
## $ TOI : int 429 109992 73983 36603 63592 88462 1604 23265 42950 6867 ...
## $ TOIX : num 7.2 1826.2 1229.2 607.9 1059.7 ...
## $ TOI.GP : num 7.15 23.2 18.97 20.33 12.93 ...
## $ TOI.GP.1 : num 7.16 23.17 18.95 20.31 12.93 ...
## $ TOI. : num 15.2 39 33.1 36.1 23.5 32.4 19.1 23.5 24.2 23.4 ...
## $ IPP. : num 0 30.4 63.4 37.5 61.3 23.9 100 75 81.8 100 ...
## $ SH. : num 0 7.4 9.7 6.2 7.8 9.8 14.3 4.7 4 4.4 ...
## $ SV. : num 0.75 0.915 0.934 0.897 0.917 0.936 1 0.925 0.924 0.907 ...
## $ PDO : num 750 989 1031 959 995 ...
## $ F.60 : num 0 1.84 3.47 1.58 1.76 2.89 2.24 1.24 0.92 1.05 ...
## $ A.60 : num 16.74 2.79 1.95 3.45 2.32 ...
## $ Pct. : num 0 39.7 64 31.4 43.1 60.2 100 32 27.5 28.6 ...
## $ Diff : int -2 -29 31 -19 -10 24 1 -9 -18 -3 ...
## $ Diff.60 : num -16.74 -0.95 1.51 -1.88 -0.57 ...
## $ iCF : int 2 287 283 88 166 171 5 94 108 24 ...
## $ iCF.1 : int 2 287 283 88 166 171 5 94 109 24 ...
## $ iFF : int 2 197 212 55 118 110 3 74 97 17 ...
## $ iSF : int 1 143 155 40 95 75 2 51 74 9 ...
## $ iSF.1 : int 1 143 156 40 95 74 2 51 76 9 ...
## $ iSF.2 : int 1 143 156 40 95 75 2 51 76 9 ...
## $ ixG : num 0 6.1 17.4 1.4 9.3 4.5 0.1 2.8 7.8 0.9 ...
## $ iSCF : int 0 7 64 2 35 7 0 9 31 3 ...
## $ iRB : int 0 7 16 1 8 2 0 1 4 0 ...
## $ iRS : int 0 9 20 4 10 3 0 6 1 1 ...
## $ iDS : num 0 16 36 5 18 5 0 7 5 1 ...
## $ sDist : num 43 52.4 28.4 55.1 30.9 46.1 33.5 37.7 26.6 24.9 ...
## $ sDist.1 : num 49.3 46.3 26.3 51 26.4 41.9 38.2 37.6 25.4 28.4 ...
## $ Pass : num 0 138.1 196.8 153 96.3 ...
## $ iHF : int 1 111 53 66 239 43 0 12 94 9 ...
## $ iHF.1 : int 1 111 53 66 239 43 0 12 94 9 ...
## $ iHA : int 0 154 68 66 134 157 2 20 53 16 ...
## $ iHDf : num 1 -43 -15 0 105 -114 -2 -8 41 -7 ...
## $ iMiss : int 1 54 57 15 23 35 1 23 24 8 ...
## $ iGVA : int 1 74 36 23 21 59 1 9 14 2 ...
## $ iTKA : int 0 22 26 4 36 11 2 12 17 0 ...
## $ iBLK : int 0 159 25 44 44 83 1 12 32 2 ...
## $ iGVA.1 : int 1 74 36 23 21 59 1 9 14 2 ...
## $ iTKA.1 : int 0 22 26 4 36 11 2 12 17 0 ...
## $ iBLK.1 : int 0 159 25 44 44 83 1 12 32 2 ...
## $ BLK. : num 0 8 2.4 7.3 4.4 6.5 4.3 3.2 4.4 2.2 ...
## $ iFOW : int 0 1 54 0 3 0 0 104 29 32 ...
## $ iFOL : int 0 0 45 0 7 0 0 146 42 40 ...
## $ iFOW.1 : int 0 1 54 0 3 0 0 104 29 32 ...
## $ iFOL.1 : int 0 0 45 0 7 0 0 146 42 40 ...
## $ FO. : num 0 100 54.5 0 30 0 0 41.6 40.8 44.4 ...
## $ X.FOT : num 0 0.1 7.4 0 1 0 0 79.4 11.5 74.2 ...
## $ dzFOW : int 0 1 9 0 1 0 0 27 7 8 ...
## $ dzFOL : int 0 0 6 0 1 0 0 37 9 10 ...
## $ nzFOW : int 0 0 10 0 0 0 0 37 9 13 ...
## $ nzFOL : int 0 0 11 0 2 0 0 53 12 17 ...
## $ ozFOW : int 0 0 35 0 2 0 0 40 13 11 ...
## $ ozFOL : int 0 0 28 0 4 0 0 56 21 13 ...
## $ FOW.Up : int 0 0 13 0 3 0 0 36 10 16 ...
## $ FOL.Up : int 0 0 10 0 1 0 0 46 18 14 ...
## $ FOW.Down : int 0 0 21 0 0 0 0 39 10 7 ...
## $ FOL.Down : int 0 0 16 0 3 0 0 56 11 10 ...
## $ FOW.Close : int 0 1 37 0 2 0 0 59 17 18 ...
## $ FOL.Close : int 0 0 33 0 6 0 0 90 26 23 ...
## $ OTG : int 0 0 1 0 0 0 0 0 0 0 ...
## $ X1G : int 0 0 5 0 2 1 0 1 2 0 ...
## $ GWG : int 0 0 2 1 0 1 0 0 0 0 ...
## $ ENG : int 0 0 0 0 1 0 0 0 0 0 ...
## $ PSG : int 0 0 0 0 0 0 0 0 0 0 ...
## $ PSA : int 0 0 0 0 0 0 0 0 0 0 ...
## $ G.Bkhd : int 0 0 5 0 0 0 0 0 2 0 ...
## $ G.Dflct : int 0 0 2 0 0 0 0 0 0 0 ...
## $ G.Slap : int 0 1 0 0 0 2 0 0 0 0 ...
## $ G.Snap : int 0 0 3 0 0 0 0 1 0 0 ...
## $ G.Tip : int 0 0 0 0 1 0 0 0 2 0 ...
## $ G.Wrap : int 0 0 0 0 0 0 0 0 0 0 ...
## $ G.Wrst : int 0 1 9 1 6 2 0 3 0 1 ...
## $ CBar : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Post : int 0 1 2 0 2 1 0 0 0 0 ...
## $ Over : int 0 2 4 0 1 6 0 2 3 0 ...
## [list output truncated]
# Compact overview of every column in the final test set.
str(object = test.final)
## 'data.frame': 262 obs. of 227 variables:
## $ Born : Factor w/ 806 levels "72-02-15","77-03-18",..: 616 578 346 799 736 606 712 144 199 781 ...
## $ City : Factor w/ 490 levels "Aalborg","Albany",..: 421 250 303 460 450 23 428 324 437 457 ...
## $ Pr.St : Factor w/ 39 levels "INT","AB","AK",..: 26 1 18 1 26 17 1 29 2 1 ...
## $ Cntry : Factor w/ 19 levels "AUT","CAN","CHE",..: 18 4 18 8 18 18 15 2 2 17 ...
## $ Nat : Factor w/ 17 levels "AUT","CAN","CHE",..: 16 4 16 7 16 16 13 2 2 15 ...
## $ Ht : int 72 72 75 72 72 74 73 75 72 73 ...
## $ Wt : int 216 195 227 182 196 210 180 210 192 205 ...
## $ DftYr : int 2003 2014 2007 2013 2011 2002 2010 2001 2003 2012 ...
## $ DftRd : int 1 1 6 2 2 4 1 1 3 1 ...
## $ Ovrl : int 13 13 161 55 36 129 8 2 74 11 ...
## $ Hand : Factor w/ 2 levels "L","R": 2 1 1 1 2 2 1 2 1 2 ...
## $ Last.Name : Factor w/ 801 levels "Abdelkader","Aberg",..: 59 792 716 707 605 650 594 777 713 642 ...
## $ First.Name : Factor w/ 385 levels "A.J.","Aaron",..: 331 124 227 314 3 278 8 129 325 334 ...
## $ Position : Factor w/ 20 levels "C","C/D","C/LW",..: 17 9 9 9 7 7 5 1 9 11 ...
## $ Team : Factor w/ 78 levels "ANA","ANA/FLA",..: 40 68 34 43 51 40 70 28 52 48 ...
## $ GP : int 80 21 81 73 31 18 49 68 4 82 ...
## $ G : int 14 3 27 18 2 1 5 15 0 31 ...
## $ A : int 22 3 15 10 9 4 11 35 0 27 ...
## $ A1 : int 9 2 9 3 2 4 8 15 0 17 ...
## $ A2 : int 13 1 6 7 7 0 3 20 0 10 ...
## $ PTS : int 36 6 42 28 11 5 16 50 0 58 ...
## $ X... : int -4 2 13 -1 3 -4 -7 -18 -1 -4 ...
## $ E... : num 8.2 0.4 15 3 3.6 0.8 -10.6 -3.7 0.4 2.7 ...
## $ PIM : int 22 2 95 8 17 6 12 29 0 32 ...
## $ Shifts : int 1729 291 1715 1488 658 393 834 1530 69 2080 ...
## $ TOI : int 76801 13997 81345 60702 29406 16693 39266 65977 3237 91109 ...
## $ TOIX : num 1278 233 1352 1010 490 ...
## $ TOI.GP : num 16 11.1 16.7 13.9 15.8 ...
## $ TOI.GP.1 : num 16 11.1 16.7 13.8 15.8 ...
## $ TOI. : num 27.2 22 30 24.9 28.7 26.7 24.1 27.9 23.5 30.9 ...
## $ IPP. : num 65.5 66.7 54.5 63.6 47.8 83.3 66.7 66.7 0 69.9 ...
## $ SH. : num 8.5 6.7 9.5 8.5 9.3 4.6 9.3 11.6 0 9.6 ...
## $ SV. : num 0.898 0.969 0.919 0.922 0.909 0.917 0.919 0.871 0.95 0.905 ...
## $ PDO : num 982 1037 1014 1007 1002 ...
## $ F.60 : num 2.58 2.32 3.42 2.61 2.82 1.31 2.21 4.1 0 3.3 ...
## $ A.60 : num 2.77 0.77 2.09 2.26 2.08 1.96 2.58 3.55 1.11 2.75 ...
## $ Pct. : num 48.2 75 62.1 53.7 57.5 40 46.2 53.6 0 54.6 ...
## $ Diff : int -4 6 30 6 6 -3 -4 10 -1 14 ...
## $ Diff.60 : num -0.19 1.54 1.33 0.36 0.73 -0.65 -0.37 0.55 -1.11 0.56 ...
## $ iCF : int 326 56 300 279 89 30 89 285 13 467 ...
## $ iCF.1 : int 326 56 300 279 89 30 89 285 13 467 ...
## $ iFF : int 251 49 243 208 58 21 64 220 9 362 ...
## $ iSF : int 175 32 178 158 39 12 45 149 5 235 ...
## $ iSF.1 : int 175 32 178 158 38 12 45 149 5 234 ...
## $ iSF.2 : int 175 32 178 158 38 12 45 149 5 234 ...
## $ ixG : num 19.7 5.2 25.1 17 1.7 0.6 4.9 14.9 0.4 25.4 ...
## $ iSCF : int 73 23 109 58 1 2 17 51 0 76 ...
## $ iRB : int 19 3 23 13 0 0 8 9 0 18 ...
## $ iRS : int 19 2 17 20 2 1 7 12 0 30 ...
## $ iDS : num 38 5 40 33 2 1 15 21 0 48 ...
## $ sDist : num 28.2 21 21.8 28.1 59.1 47.9 35.3 29.7 34.8 31.3 ...
## $ sDist.1 : num 27 23.2 21.7 26.1 41.7 50.7 31.3 29.7 30.1 28 ...
## $ Pass : num 198.5 57.7 154.6 81.4 24.2 ...
## $ iHF : int 190 6 189 72 22 10 60 13 4 97 ...
## $ iHF.1 : int 190 6 189 72 22 10 60 13 4 97 ...
## $ iHA : int 151 11 109 113 66 34 38 43 3 88 ...
## $ iHDf : num 39 -5 80 -41 -44 -24 22 -30 1 9 ...
## $ iMiss : int 76 17 65 50 19 9 19 71 4 127 ...
## $ iGVA : int 27 9 49 20 20 7 14 49 1 62 ...
## $ iTKA : int 25 4 33 25 7 1 12 32 1 76 ...
## $ iBLK : int 31 7 23 25 27 28 19 31 1 26 ...
## $ iGVA.1 : int 27 9 49 20 20 7 14 49 1 62 ...
## $ iTKA.1 : int 25 4 33 25 7 1 12 32 1 76 ...
## $ iBLK.1 : int 31 7 23 25 27 28 19 31 1 26 ...
## $ BLK. : num 2.8 3.9 2.1 2.9 7.6 12.7 3 3.2 2.7 2 ...
## $ iFOW : int 2 0 6 2 0 0 178 354 0 5 ...
## $ iFOL : int 3 3 9 2 0 0 259 324 1 13 ...
## $ iFOW.1 : int 2 0 6 2 0 0 178 354 0 5 ...
## $ iFOL.1 : int 3 3 9 2 0 0 259 324 1 13 ...
## $ FO. : num 40 0 40 50 0 0 40.7 52.2 0 27.8 ...
## $ X.FOT : num 0.4 1.7 1.2 0.4 0 0 78.9 59.1 2.3 1.1 ...
## $ dzFOW : int 2 0 0 0 0 0 52 81 0 1 ...
## $ dzFOL : int 2 0 0 1 0 0 99 105 0 1 ...
## $ nzFOW : int 0 0 0 1 0 0 71 101 0 0 ...
## $ nzFOL : int 0 0 1 0 0 0 96 89 0 2 ...
## $ ozFOW : int 0 0 6 1 0 0 55 172 0 4 ...
## $ ozFOL : int 1 3 8 1 0 0 64 130 1 10 ...
## $ FOW.Up : int 2 0 4 1 0 0 52 106 0 1 ...
## $ FOL.Up : int 2 2 3 1 0 0 75 101 0 3 ...
## $ FOW.Down : int 0 0 1 0 0 0 62 125 0 0 ...
## $ FOL.Down : int 1 0 2 0 0 0 111 105 1 7 ...
## $ FOW.Close : int 0 0 3 1 0 0 100 213 0 5 ...
## $ FOL.Close : int 2 1 6 2 0 0 135 211 0 5 ...
## $ OTG : int 0 0 0 0 0 0 0 0 0 1 ...
## $ X1G : int 2 2 8 2 0 0 1 2 0 3 ...
## $ GWG : int 1 2 5 3 0 0 0 4 0 9 ...
## $ ENG : int 0 0 0 0 0 0 0 0 0 1 ...
## $ PSG : int 0 0 0 0 0 0 0 0 0 0 ...
## $ PSA : int 0 0 0 0 0 0 0 0 0 0 ...
## $ G.Bkhd : int 1 0 2 3 0 0 1 2 0 2 ...
## $ G.Dflct : int 0 0 1 1 0 0 0 0 0 0 ...
## $ G.Slap : int 0 0 0 4 1 0 0 1 0 3 ...
## $ G.Snap : int 4 1 1 2 1 1 0 5 0 8 ...
## $ G.Tip : int 2 2 2 0 0 0 0 0 0 1 ...
## $ G.Wrap : int 0 0 1 1 0 0 0 0 0 0 ...
## $ G.Wrst : int 7 0 20 7 0 0 4 7 0 17 ...
## $ CBar : int 0 0 0 0 0 0 0 2 0 1 ...
## $ Post : int 2 1 5 2 0 1 0 6 0 6 ...
## $ Over : int 3 2 3 3 0 0 4 13 2 23 ...
## [list output truncated]
# List every column of the prepared training frame.
names(train.final)
## [1] "Born" "City" "Pr.St" "Cntry" "Nat" "Ht" "Wt" "DftYr" "DftRd" "Ovrl" "Hand" "Last.Name" "First.Name"
## [14] "Position" "Team" "GP" "G" "A" "A1" "A2" "PTS" "X..." "E..." "PIM" "Shifts" "TOI"
## [27] "TOIX" "TOI.GP" "TOI.GP.1" "TOI." "IPP." "SH." "SV." "PDO" "F.60" "A.60" "Pct." "Diff" "Diff.60"
## [40] "iCF" "iCF.1" "iFF" "iSF" "iSF.1" "iSF.2" "ixG" "iSCF" "iRB" "iRS" "iDS" "sDist" "sDist.1"
## [53] "Pass" "iHF" "iHF.1" "iHA" "iHDf" "iMiss" "iGVA" "iTKA" "iBLK" "iGVA.1" "iTKA.1" "iBLK.1" "BLK."
## [66] "iFOW" "iFOL" "iFOW.1" "iFOL.1" "FO." "X.FOT" "dzFOW" "dzFOL" "nzFOW" "nzFOL" "ozFOW" "ozFOL" "FOW.Up"
## [79] "FOL.Up" "FOW.Down" "FOL.Down" "FOW.Close" "FOL.Close" "OTG" "X1G" "GWG" "ENG" "PSG" "PSA" "G.Bkhd" "G.Dflct"
## [92] "G.Slap" "G.Snap" "G.Tip" "G.Wrap" "G.Wrst" "CBar" "Post" "Over" "Wide" "S.Bkhd" "S.Dflct" "S.Slap" "S.Snap"
## [105] "S.Tip" "S.Wrap" "S.Wrst" "iPenT" "iPenD" "iPENT" "iPEND" "iPenDf" "NPD" "Min" "Maj" "Match" "Misc"
## [118] "Game" "CF" "CA" "FF" "FA" "SF" "SA" "xGF" "xGA" "SCF" "SCA" "GF" "GA"
## [131] "RBF" "RBA" "RSF" "RSA" "DSF" "DSA" "FOW" "FOL" "HF" "HA" "GVA" "TKA" "PENT"
## [144] "PEND" "OPS" "DPS" "PS" "OTOI" "Grit" "DAP" "Pace" "GS" "GS.G" "ANA" "FLA" "N.J"
## [157] "VAN" "ARI" "CGY" "MIN" "NYR" "TOR" "BOS" "WPG" "BUF" "CAR" "OTT" "PIT" "STL"
## [170] "CBJ" "DAL" "CHI" "COL" "MTL" "NSH" "S.J" "DET" "EDM" "L.A" "T.B" "NYI" "PHI"
## [183] "WSH" "C" "D" "LW" "RW" "birth_year" "birth_month" "birth_day" "born_AUT" "born_CAN" "born_CHE" "born_CZE" "born_DEU"
## [196] "born_DNK" "born_EST" "born_FIN" "born_FRA" "born_GBR" "born_HRV" "born_ITA" "born_LVA" "born_NOR" "born_RUS" "born_SVK" "born_SWE" "born_USA"
## [209] "born_SVN" "nation_AUT" "nation_CAN" "nation_CHE" "nation_CZE" "nation_DEU" "nation_DNK" "nation_FIN" "nation_FRA" "nation_GBR" "nation_HRV" "nation_LVA" "nation_NOR"
## [222] "nation_RUS" "nation_SVK" "nation_SWE" "nation_USA" "nation_SVN" "undrafted" "Salary"
# Pearson correlation between salary and birth year.
with(train.final, cor(Salary, birth_year))
## [1] -0.425583
First determine the interquartile range (IQR) of the feature from which you want to remove outliers. Here we treat the outliers in the birth year by replacing each outlying point with a boundary value derived from the IQR.
First draw a boxplot to check for outliers.
# Box plot of birth_year to look for outliers before treatment.
# The `data` argument is dropped: it is only meaningful for the formula
# method of boxplot(); with a plain vector it is not used to look anything up
# and is merely forwarded as a stray named argument.
boxplot(train.final$birth_year)
You can note a couple of points at the lower end. These outliers can be removed or replaced with a suitable value. Here we will replace the outliers with a suitable value computed from the quartiles, where Q1 is the value of the 1st quartile, Q3 is the value of the 3rd quartile, and IQR = Q3 - Q1:
new_value_birth_year = Q1 - 1.5 × IQR
If the outliers were at the top end, we would instead use new_value_birth_year = Q3 + 1.5 × IQR.
Let us find the values of IQR, Q1, Q3, and new_value_birth_year
# Five-number summary (plus mean) of the birth year column.
summary(train.final[["birth_year"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1972 1987 1991 1990 1993 1998
From the summary above (Min. 1972, 1st Qu. 1987, Median 1991, Mean 1990, 3rd Qu. 1993, Max. 1998) we have:
# Lower outlier fence for birth_year: Q1 - 1.5 * IQR.
# The quartiles are computed from the data instead of being copied by hand
# from the (rounded) summary() printout, so they stay correct if the data
# changes. names = FALSE drops the "25%"/"75%" labels so plain numbers print.
Q1 <- quantile(train.final$birth_year, probs = 0.25, names = FALSE,
               na.rm = TRUE)
Q3 <- quantile(train.final$birth_year, probs = 0.75, names = FALSE,
               na.rm = TRUE)
# The variable name IQR is kept for compatibility with the rest of the
# script; calls such as IQR(x) still resolve to the stats function because R
# skips non-function objects when looking up a function.
IQR <- Q3 - Q1
new_value_birth_year <- Q1 - (1.5 * IQR)
new_value_birth_year
## [1] 1978
Now replace the outliers with new_value
# Raise every birth_year below the lower fence up to the fence value.
# Vectorized replacement of the former row-by-row loop: which() keeps only
# the positions where the comparison is TRUE, so missing birth years are left
# untouched instead of aborting an `if`, and an empty data frame is a no-op.
train.final$birth_year[
  which(train.final$birth_year < new_value_birth_year)
] <- new_value_birth_year
Plot the boxplot again to see whether any outliers remain. Ideally the outliers should be gone.
# Box plot of birth_year after the outlier treatment.
# The `data` argument is dropped: it is only meaningful for the formula
# method of boxplot(); with a plain vector it is not used to look anything up
# and is merely forwarded as a stray named argument.
boxplot(train.final$birth_year)
* 3.1 Split the training data into train and test data for validation.
* 3.2 Run the linear regression model.
* 3.3 Predict using the new model.
* 3.4 Check accuracy.
# Drop the non-numeric identifier columns by position: 1 (Born), 2 (City),
# 12 (Last.Name), 13 (First.Name), 14 (Position) and 15 (Team).
train.final <- train.final[, -c(1, 2, 12, 13, 14, 15)]
# Take the hold-out rows BEFORE truncating train.final. Previously
# train.final was cut to 612 rows first, so indexing rows 613:874 afterwards
# returned 262 rows consisting entirely of NA.
# The former column drop on the old test.final is omitted: that object was
# overwritten immediately afterwards, so the drop had no effect on the result.
# NOTE(review): assumes train.final holds at least 874 rows here - confirm.
test.final <- train.final[613:874, ]
train.final <- train.final[1:612, ]
dim(train.final)
## [1] 612 222
dim(test.final)
## [1] 262 222
# Normalize Height (Ht) to a z-score: (value - mean) / standard deviation.
mean_ht <- mean(train.final$Ht) # Mean of the Ht column
std_ht <- sd(train.final$Ht) # Standard deviation of the Ht column
mean_ht
## [1] 72.98366
std_ht
## [1] 2.08016
# Vectorized over the whole column, reusing the statistics stored above.
# The former loop recomputed mean() and sd() of the full column for every
# row, which gave the same values at quadratic cost.
train.final$Ht_n <- (train.final$Ht - mean_ht) / std_ht
head(train.final$Ht_n)
## [1] 0.4885874 0.4885874 -0.4728772 1.9307843 1.4500520 -1.4343418
# mean(train.final$Ht_n)
# Normalize Weight (Wt) to a z-score: (value - mean) / standard deviation.
mean_wt <- mean(train.final$Wt) # Mean of the Wt column
std_wt <- sd(train.final$Wt) # Standard deviation of the Wt column
mean_wt
## [1] 200.7451
std_wt
## [1] 14.95242
# Vectorized over the whole column, reusing the statistics stored above.
# The former loop recomputed mean() and sd() of the full column for every
# row, which gave the same values at quadratic cost.
train.final$Wt_n <- (train.final$Wt - mean_wt) / std_wt
head(train.final$Wt_n)
## [1] -0.7186193 0.4183204 1.1539873 1.2877449 1.0871085 -0.5848617
# mean(train.final$Ht_n)
# Normalize birth year (birth_year) to a z-score:
# (value - mean) / standard deviation.
mean_birth_year <- mean(train.final$birth_year) # Mean of the birth_year column
std_birth_year <- sd(train.final$birth_year) # Standard deviation of the birth_year column
mean_birth_year
## [1] 1990.052
std_birth_year
## [1] 4.430649
# Vectorized over the whole column, reusing the statistics stored above.
# The former loop recomputed mean() and sd() of the full column for every
# row, which gave the same values at quadratic cost.
train.final$birth_year_n <-
  (train.final$birth_year - mean_birth_year) / std_birth_year
head(train.final$birth_year_n)
## [1] 1.5681027 0.6653004 -0.4632025 0.4395998 0.8910010 -2.4945077
# mean(train.final$Ht_n)
Based on the initial EDA, we have identified five independent variables for this linear regression model, namely Height (Ht), Weight (Wt), Ovrl, Goals (G), and Birth Year. Height, Weight, and Birth Year are passed to the linear model as normalized values; Ovrl and G are passed as they are.
# Fit the salary regression on all rows of train.final, then show the fitted
# coefficients followed by the full model summary.
lm_model <- lm(
  formula = Salary ~ Ht_n + Wt_n + Ovrl + G + birth_year_n,
  data = train.final
)
print(lm_model)
##
## Call:
## lm(formula = Salary ~ Ht_n + Wt_n + Ovrl + G + birth_year_n,
## data = train.final)
##
## Coefficients:
## (Intercept) Ht_n Wt_n Ovrl G birth_year_n
## 1715122 33198 225698 -6197 121757 -878801
print(summary(lm_model))
##
## Call:
## lm(formula = Salary ~ Ht_n + Wt_n + Ovrl + G + birth_year_n,
## data = train.final)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4538603 -980546 -155020 810226 9119438
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1715122 125844 13.629 < 2e-16 ***
## Ht_n 33198 98116 0.338 0.7352
## Wt_n 225698 98702 2.287 0.0226 *
## Ovrl -6197 1205 -5.141 3.69e-07 ***
## G 121757 7881 15.450 < 2e-16 ***
## birth_year_n -878801 70309 -12.499 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1636000 on 606 degrees of freedom
## Multiple R-squared: 0.4689, Adjusted R-squared: 0.4646
## F-statistic: 107 on 5 and 606 DF, p-value: < 2.2e-16
# Predict salaries for the same rows the model was fitted on and compare
# them with the actual salaries (in-sample check).
predicted_salary <- predict(lm_model, newdata = train.final)
# predicted_salary
# Two-column frame: observed salary next to the model's prediction.
actual_preds <- as.data.frame(
  cbind(actuals = train.final[["Salary"]], predicteds = predicted_salary)
)
# Correlation matrix between actual and predicted salaries.
corrleation_accuracy <- cor(actual_preds)
print(corrleation_accuracy)
## actuals predicteds
## actuals 1.0000000 0.6847967
## predicteds 0.6847967 1.0000000
print(head(actual_preds))
## actuals predicteds
## 1 925000 79559.3
## 2 2250000 1391652.5
## 3 8000000 4636946.7
## 4 3500000 1786708.4
## 5 1750000 1978758.3
## 6 1500000 3369764.9
# train.final <- train.final[,-c(1,2,12,13,14,15)] test.final
# <- test.final[,-c(1,2,12,13,14,15)]
# Random 80/20 split of train.final into `train` and `test` for validation.
# A fixed seed makes the split (and every number derived from it below)
# reproducible between runs; the outputs shown in this report came from an
# unseeded run and will therefore differ.
set.seed(123)
data <- train.final # Store the train data before splitting
# floor() makes the 20% sample size an explicit whole number, and
# sample.int() draws row positions directly.
split_indexes <- sample.int(nrow(data), size = floor(0.2 * nrow(data)))
test <- data[split_indexes, ]
# setdiff() rather than negative indexing: data[-integer(0), ] would return
# zero rows if the sample were empty.
train <- data[setdiff(seq_len(nrow(data)), split_indexes), ]
dim(train)
## [1] 490 225
dim(test)
## [1] 122 225
# Refit the salary regression on the `train` subset only, then show the
# fitted coefficients followed by the full model summary.
lm_model <- lm(
  formula = Salary ~ Ht_n + Wt_n + Ovrl + G + birth_year_n,
  data = train
)
print(lm_model)
##
## Call:
## lm(formula = Salary ~ Ht_n + Wt_n + Ovrl + G + birth_year_n,
## data = train)
##
## Coefficients:
## (Intercept) Ht_n Wt_n Ovrl G birth_year_n
## 1755235 35542 226425 -6381 119534 -886782
print(summary(lm_model))
##
## Call:
## lm(formula = Salary ~ Ht_n + Wt_n + Ovrl + G + birth_year_n,
## data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4480124 -942693 -147147 848789 7989836
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1755235 138631 12.661 < 2e-16 ***
## Ht_n 35542 109377 0.325 0.745
## Wt_n 226425 108845 2.080 0.038 *
## Ovrl -6381 1272 -5.015 7.46e-07 ***
## G 119535 8660 13.802 < 2e-16 ***
## birth_year_n -886782 75427 -11.757 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1609000 on 484 degrees of freedom
## Multiple R-squared: 0.4819, Adjusted R-squared: 0.4765
## F-statistic: 90.02 on 5 and 484 DF, p-value: < 2.2e-16
# Predict salaries for the held-out `test` rows and compare them with the
# actual salaries.
predicted_salary <- predict(lm_model, newdata = test)
# Two-column frame: observed salary next to the model's prediction.
actual_preds <- as.data.frame(
  cbind(actuals = test[["Salary"]], predicteds = predicted_salary)
)
# Correlation matrix between actual and predicted salaries.
corrleation_accuracy <- cor(actual_preds)
print(corrleation_accuracy)
## actuals predicteds
## actuals 1.0000000 0.6482224
## predicteds 0.6482224 1.0000000
print(head(actual_preds))
## actuals predicteds
## 518 700000 2732206.3
## 93 825000 1392633.5
## 194 700000 659478.5
## 158 4000000 1858637.2
## 490 875000 2436304.0
## 8 842500 1023810.6
Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.