Hockey Bootstrap and Cross Validation

This is a continuation of this project <https://rpubs.com/nurfnick/720194

Bootstrapping

set.seed(43)
samp_mean <- function(x, i) {
  mean(x[i])
}


GP <- na.omit(data$GP) 
results <- boot(GP, samp_mean, 100)
plot(results)

boot.ci(results, type="perc")

## BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
## Based on 100 bootstrap replicates
## 
## CALL : 
## boot.ci(boot.out = results, type = "perc")
## 
## Intervals : 
## Level     Percentile     
## 95%   ( 3.052,  3.500 )  
## Calculations and Intervals on Original Scale
## Some percentile intervals may be unstable

t.test(GP, mu = 4)

## 
##  One Sample t-test
## 
## data:  GP
## t = -6.8861, df = 29, p-value = 1.448e-07
## alternative hypothesis: true mean is not equal to 4
## 95 percent confidence interval:
##  3.048862 3.484472
## sample estimates:
## mean of x 
##  3.266667

START HERE>>>> \[ t = \frac{\mu-\overline x}{SE} \]

results$t0

## [1] 3.266667

SE = sd(results$t)
c(results$t0-2*SE,results$t0+2*SE)

## [1] 3.059878 3.473455

t = (3.5-results$t0)/SE

1-pt(t,99)

## [1] 0.01311133

mean(3.5<=results$t)

## [1] 0.03

data

##    Rk                     X AvAge GP  W  L OL PTS  PTS. GF GA SOW SOL   SRS
## 1   1   Philadelphia Flyers  27.2  4  3  1  0   6 0.750 15 11   0   0  0.38
## 2   2   Toronto Maple Leafs  29.0  4  3  1  0   6 0.750 14 12   0   0  1.85
## 3   3  Vegas Golden Knights  28.8  3  3  0  0   6 1.000 11  5   0   0  1.63
## 4   4   Washington Capitals  30.2  4  2  0  2   6 0.750 15 13   0   1 -0.38
## 5   5     New Jersey Devils  26.1  3  2  0  1   5 0.833  8  6   0   1  0.13
## 6   6    Montreal Canadiens  28.6  3  2  0  1   5 0.833 12  7   0   0  0.83
## 7   7        Calgary Flames  28.1  3  2  0  1   5 0.833 11  6   0   0  0.29
## 8   8    Colorado Avalanche  27.1  3  2  1  0   4 0.667 12  6   0   0  0.63
## 9   9       St. Louis Blues  28.3  3  2  1  0   4 0.667  9 13   0   0 -0.63
## 10 10         Winnipeg Jets  28.0  3  2  1  0   4 0.667  9  9   0   0  1.27
## 11 11     Detroit Red Wings  29.1  4  2  2  0   4 0.500  9 10   0   0 -0.39
## 12 12    New York Islanders  28.8  3  2  1  0   4 0.667  5  5   0   0 -0.13
## 13 13   Carolina Hurricanes  27.0  3  2  1  0   4 0.667  9  6   0   0  1.20
## 14 14      Florida Panthers  27.8  2  2  0  0   4 1.000 10  6   0   0 -0.75
## 15 15   Nashville Predators  28.7  3  2  1  0   4 0.667 10  7   0   0  0.51
## 16 16   Tampa Bay Lightning  27.6  2  2  0  0   4 1.000 10  3   0   0  0.75
## 17 17        Minnesota Wild  29.6  3  2  1  0   4 0.667  8  7   0   0 -0.38
## 18 18   Pittsburgh Penguins  28.7  4  2  2  0   4 0.500 13 18   1   0 -0.38
## 19 19         Anaheim Ducks  28.1  3  1  1  1   3 0.500  4  7   0   0  0.88
## 20 20 Columbus Blue Jackets  26.6  4  1  2  1   3 0.375  8 13   0   0 -0.90
## 21 21         Boston Bruins  28.8  3  1  1  1   3 0.500  3  5   1   0 -0.13
## 22 22       Ottawa Senators  26.9  3  1  1  1   3 0.500 10 10   0   0  1.66
## 23 23       Arizona Coyotes  28.5  3  1  1  1   3 0.500 10 10   0   1 -0.13
## 24 24        Buffalo Sabres  27.3  4  1  3  0   2 0.250 11 12   0   0  0.38
## 25 25       Edmonton Oilers  27.7  4  1  3  0   2 0.250 10 15   0   0 -2.19
## 26 26      New York Rangers  25.6  3  1  2  0   2 0.333  8  8   0   0  0.13
## 27 27     Vancouver Canucks  27.4  4  1  3  0   2 0.250  9 16   0   0 -2.69
## 28 28     Los Angeles Kings  28.2  3  0  1  2   2 0.333  8 11   0   0 -0.13
## 29 29       San Jose Sharks  29.3  3  1  2  0   2 0.333 10 13   1   0 -1.88
## 30 30    Chicago Blackhawks  26.9  4  0  3  1   1 0.125  9 20   0   0  0.00
## 31 31          Dallas Stars    NA NA NA NA NA  NA    NA NA NA  NA  NA    NA
##      SOS TG.G EVGF EVGA PP PPO   PP. PPA PPOA    PK. SH SHA PIM.G oPIM.G   S
## 1  -0.63 6.50   12    7  3  12 25.00   4   15  73.33  0   0   7.5    6.0  99
## 2   1.35 6.50    8    9  6  14 42.86   3   17  82.35  0   0  13.3    9.3 135
## 3  -0.38 5.33   10    3  0   7  0.00   1    8  87.50  1   1   5.3    4.7  90
## 4  -0.63 7.00   13    9  2   8 25.00   3   16  81.25  0   1   9.3    5.3  98
## 5  -0.21 4.67    7    1  1  10 10.00   4   13  69.23  0   1  11.0    9.0  81
## 6  -0.84 6.33    6    4  4  10 40.00   2   14  85.71  2   1  11.0    8.3 102
## 7  -1.38 5.67    5    4  6  16 37.50   1   12  91.67  0   1   8.7   11.3  93
## 8  -1.38 6.00    4    5  8  17 47.06   1    9  88.89  0   0   6.7   12.0  89
## 9   0.71 7.33    9    5  0   9  0.00   8   14  42.86  0   0  10.0    6.7  86
## 10  1.27 6.00    7    5  2  13 15.38   4    9  55.56  0   0   7.3   10.0  90
## 11 -0.14 4.75    8    8  1   7 14.29   2   11  81.82  0   0  11.8    9.8  99
## 12 -0.13 3.33    3    4  2  17 11.76   1   14  92.86  0   0  14.7   12.7  73
## 13  0.20 5.00    6    5  3  12 25.00   1    9  88.89  0   0   7.7    9.7  98
## 14 -2.75 8.00    7    3  3   8 37.50   3    5  40.00  0   0   5.0    8.0  64
## 15 -0.49 5.67    9    6  1  11  9.09   1    8  87.50  0   0   5.3    7.3 101
## 16 -2.75 6.50    8    1  2   7 28.57   2    7  71.43  0   0   7.0    7.0  70
## 17 -0.71 5.00    8    6  0  16  0.00   1   11  90.91  0   0   8.7   12.0 105
## 18  0.63 7.75    8   14  4  16 25.00   4   15  73.33  1   0   7.5    8.0 124
## 19  1.88 3.67    4    7  0   5  0.00   0    7 100.00  0   0   5.3    4.0  71
## 20  0.35 5.25    8   12  0   7  0.00   1    9  88.89  0   0   9.5    8.5 127
## 21  0.21 2.67    0    5  2  10 20.00   0   13 100.00  1   0  11.7    9.7  92
## 22  1.66 6.67    7    6  3  15 20.00   4   14  71.43  0   0  10.0   14.0  84
## 23  0.21 6.67    6    6  3  12 25.00   3   13  76.92  1   1  10.0    9.3  95
## 24  0.63 5.75    8   11  3  15 20.00   1    5  80.00  0   0   3.8    8.8 134
## 25 -0.94 6.25    7   11  2  18 11.11   2   14  85.71  1   2   9.5   11.5 138
## 26  0.13 5.33    5    5  3  17 17.65   3   14  78.57  0   0  10.0   16.0 106
## 27 -0.94 6.25    8    9  0  15  0.00   7   21  66.67  1   0  13.0   10.0 134
## 28  0.88 6.33    6    9  2  12 16.67   2   17  88.24  0   0  12.0    8.7  92
## 29 -1.21 7.67    5   11  5  11 45.45   2   10  80.00  0   0   8.0    8.7  95
## 30  2.75 7.25    4   15  5  12 41.67   5   15  66.67  0   0   7.5    6.0 124
## 31    NA   NA   NA   NA NA  NA    NA  NA   NA     NA NA  NA    NA     NA  NA
##      S.  SA   SV. SO
## 1  15.2 144 0.924  1
## 2  10.4 103 0.883  0
## 3  12.2  76 0.934  0
## 4  15.3 114 0.886  0
## 5   9.9 115 0.948  0
## 6  11.8  95 0.926  0
## 7  11.8  93 0.935  1
## 8  13.5  78 0.923  1
## 9  10.5  91 0.857  0
## 10 10.0 105 0.914  0
## 11  9.1 127 0.921  0
## 12  6.8  83 0.940  2
## 13  9.2  68 0.912  1
## 14 15.6  64 0.906  0
## 15  9.9  98 0.929  0
## 16 14.3  60 0.950  0
## 17  7.6  94 0.926  0
## 18 10.5  96 0.813  0
## 19  5.6  96 0.927  1
## 20  6.3 132 0.902  0
## 21  3.3  70 0.929  0
## 22 11.9  91 0.890  0
## 23 10.5  97 0.897  0
## 24  8.2 101 0.881  0
## 25  7.2 145 0.897  0
## 26  7.5  84 0.905  1
## 27  6.7 144 0.889  0
## 28  8.7  95 0.884  0
## 29 10.5  96 0.865  0
## 30  7.3 134 0.851  0
## 31   NA  NA    NA NA

data$GP[is.na(data$GP)] <- 0

data$GP

##  [1] 4 4 3 4 3 3 3 3 3 3 4 3 3 2 3 2 3 4 3 4 3 3 3 4 4 3 4 3 3 4 0

samp_min <- function(x, i) {
  min(x[i])
}

results2 <- boot(data$GP,samp_min, R = 10000)
plot(results2)

xbar = mean(results2$t)
s = sd(results2$t)
c(xbar - 2*s,xbar + 2*s)

## [1] -1.30744  2.83684

boot.ci(results2, type = "perc")

## BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
## Based on 10000 bootstrap replicates
## 
## CALL : 
## boot.ci(boot.out = results2, type = "perc")
## 
## Intervals : 
## Level     Percentile     
## 95%   ( 0,  3 )  
## Calculations and Intervals on Original Scale

mean(0 == results2$t)

## [1] 0.6391

%%%%Start Here

Cross Validation

library(caret)

## Loading required package: lattice

## 
## Attaching package: 'lattice'

## The following object is masked from 'package:boot':
## 
##     melanoma

## Loading required package: ggplot2

data$W[is.na(data$W)] <- 0
data$PTS[is.na(data$PTS)] <- 0

trainingSamples <- createDataPartition(data$Rk ,p=.66,list = FALSE)
trainingSamples

##       Resample1
##  [1,]         1
##  [2,]         3
##  [3,]         5
##  [4,]         6
##  [5,]         7
##  [6,]         8
##  [7,]         9
##  [8,]        10
##  [9,]        13
## [10,]        14
## [11,]        15
## [12,]        16
## [13,]        17
## [14,]        19
## [15,]        20
## [16,]        22
## [17,]        23
## [18,]        24
## [19,]        26
## [20,]        27
## [21,]        28
## [22,]        29
## [23,]        30

trainData <- data[trainingSamples,]
testData <- data[-trainingSamples,]
testData

##    Rk                   X AvAge GP W  L OL PTS  PTS. GF GA SOW SOL   SRS   SOS
## 2   2 Toronto Maple Leafs  29.0  4 3  1  0   6 0.750 14 12   0   0  1.85  1.35
## 4   4 Washington Capitals  30.2  4 2  0  2   6 0.750 15 13   0   1 -0.38 -0.63
## 11 11   Detroit Red Wings  29.1  4 2  2  0   4 0.500  9 10   0   0 -0.39 -0.14
## 12 12  New York Islanders  28.8  3 2  1  0   4 0.667  5  5   0   0 -0.13 -0.13
## 18 18 Pittsburgh Penguins  28.7  4 2  2  0   4 0.500 13 18   1   0 -0.38  0.63
## 21 21       Boston Bruins  28.8  3 1  1  1   3 0.500  3  5   1   0 -0.13  0.21
## 25 25     Edmonton Oilers  27.7  4 1  3  0   2 0.250 10 15   0   0 -2.19 -0.94
## 31 31        Dallas Stars    NA  0 0 NA NA   0    NA NA NA  NA  NA    NA    NA
##    TG.G EVGF EVGA PP PPO   PP. PPA PPOA    PK. SH SHA PIM.G oPIM.G   S   S.  SA
## 2  6.50    8    9  6  14 42.86   3   17  82.35  0   0  13.3    9.3 135 10.4 103
## 4  7.00   13    9  2   8 25.00   3   16  81.25  0   1   9.3    5.3  98 15.3 114
## 11 4.75    8    8  1   7 14.29   2   11  81.82  0   0  11.8    9.8  99  9.1 127
## 12 3.33    3    4  2  17 11.76   1   14  92.86  0   0  14.7   12.7  73  6.8  83
## 18 7.75    8   14  4  16 25.00   4   15  73.33  1   0   7.5    8.0 124 10.5  96
## 21 2.67    0    5  2  10 20.00   0   13 100.00  1   0  11.7    9.7  92  3.3  70
## 25 6.25    7   11  2  18 11.11   2   14  85.71  1   2   9.5   11.5 138  7.2 145
## 31   NA   NA   NA NA  NA    NA  NA   NA     NA NA  NA    NA     NA  NA   NA  NA
##      SV. SO
## 2  0.883  0
## 4  0.886  0
## 11 0.921  0
## 12 0.940  2
## 18 0.813  0
## 21 0.929  0
## 25 0.897  0
## 31    NA NA

model <- lm(W ~ PTS , data = trainData)
summary(model)

## 
## Call:
## lm(formula = W ~ PTS, data = trainData)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.7105 -0.2566  0.1974  0.1974  0.2895 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.38158    0.17806  -2.143    0.044 *  
## PTS          0.54605    0.04687  11.651 1.25e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2951 on 21 degrees of freedom
## Multiple R-squared:  0.866,  Adjusted R-squared:  0.8597 
## F-statistic: 135.8 on 1 and 21 DF,  p-value: 1.252e-10

data[trainingSamples,'Test_Train'] <- "Train"
data[-trainingSamples,'Test_Train'] <- "Test"
data$Test_Train

##  [1] "Train" "Test"  "Train" "Test"  "Train" "Train" "Train" "Train" "Train"
## [10] "Train" "Test"  "Test"  "Train" "Train" "Train" "Train" "Train" "Test" 
## [19] "Train" "Train" "Test"  "Train" "Train" "Train" "Test"  "Train" "Train"
## [28] "Train" "Train" "Train" "Test"

col.rainbow <- rainbow(2)
palette(col.rainbow)
plot(data$PTS,data$W,pch =19,col = as.factor(data$Test_Train))
abline(model)

library(ggplot2)
ggplot(data = data, mapping = aes(PTS,W,color = Test_Train))+
  geom_jitter() +
  geom_smooth(method = lm)

## `geom_smooth()` using formula 'y ~ x'

pre <- predict(model,data[-trainingSamples,])
pre

##          2          4         11         12         18         21         25 
##  2.8947368  2.8947368  1.8026316  1.8026316  1.8026316  1.2565789  0.7105263 
##         31 
## -0.3815789

R2(pre,data[-trainingSamples,"W"]  )

## [1] 0.861082

%%%Start Here

train.control <- trainControl(method = "cv", number = 10)
model <- train(W ~ PTS, data = data, 
               method = "lm",
               trControl = train.control)

## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo, :
## There were missing values in resampled performance measures.

# Summarize the results
print(model)

## Linear Regression 
## 
## 31 samples
##  1 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 28, 29, 28, 28, 27, 27, ... 
## Resampling results:
## 
##   RMSE       Rsquared   MAE      
##   0.3002804  0.9405655  0.2767682
## 
## Tuning parameter 'intercept' was held constant at a value of TRUE

summary(model)

## 
## Call:
## lm(formula = .outcome ~ ., data = dat)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.7956 -0.2889  0.2044  0.2111  0.2178 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.19771    0.14867   -1.33    0.194    
## PTS          0.49666    0.03839   12.94 1.42e-13 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3155 on 29 degrees of freedom
## Multiple R-squared:  0.8523, Adjusted R-squared:  0.8473 
## F-statistic: 167.4 on 1 and 29 DF,  p-value: 1.423e-13

Hockey Bootstrap and Cross Validation

Nicholas Jacob

2/4/2021

Bootstrapping

Cross Validation