Author: Diego Veliz-Otani

Class Update: Causal Inference (PREV 794)

1. Load the clean data:

age_at_1997 ab_1994 ab_1995 ab_1996 ab_1997 ab_50 hits_1994 hits_1995 hits_1996 hits_1997 hits_50 BA_1994 BA_1995 BA_1996 BA_1997 ba_50 age_at_1996
29 392 517 588 412 50 120 155 193 137 13 0.3061224 0.2998066 0.3282313 0.3325243 0.26 28
33 453 554 579 590 50 119 145 172 170 19 0.2626932 0.2617329 0.2970639 0.2881356 0.38 32
26 250 178 248 397 50 74 43 56 105 12 0.2960000 0.2415730 0.2258065 0.2644836 0.24 25
32 332 352 407 320 50 82 88 105 83 16 0.2469880 0.2500000 0.2579853 0.2593750 0.32 31
36 444 550 640 615 50 140 144 178 166 20 0.3153153 0.2618182 0.2781250 0.2699187 0.40 35
32 134 415 537 528 50 35 133 157 150 13 0.2611940 0.3204819 0.2923650 0.2840909 0.26 31
26 132 438 84 166 50 36 109 20 34 11 0.2727273 0.2488584 0.2380952 0.2048193 0.22 25
35 120 NA 415 158 50 22 NA 119 48 15 0.1833333 NA 0.2867470 0.3037975 0.30 34
31 436 554 626 614 50 139 172 181 156 13 0.3188073 0.3104693 0.2891374 0.2540717 0.26 30
33 189 216 234 228 50 49 63 52 54 11 0.2592593 0.2916667 0.2222222 0.2368421 0.22 32
32 143 150 174 259 50 39 40 40 66 9 0.2727273 0.2666667 0.2298851 0.2548263 0.18 31
31 391 428 525 509 50 99 113 126 120 8 0.2531969 0.2640187 0.2400000 0.2357564 0.16 30

2. Approach 1: Extreme gradient boosting:

We assume that the batting average of Year “i” can be predicted by the batting average from years “i-1” and “i-2”. Thus, we first train a model to predict the BA from 1996 using data from 1994 and 1995. Then we use this model to predict the 1997 BA using as predictors the data from 1995 and 1996.

##################### Training data for 1996:
target_1996 <- dat %>% dplyr::rename(ab_2="ab_1994", 
                                     ab_1="ab_1995", 
                                     ab="ab_1996") %>%
  select(!dplyr::contains("1997")) %>%
  select(-ab_50, -ba_50, -hits_50) %>%
  rename(hits_2="hits_1994", 
         hits_1="hits_1995", 
         hits="hits_1996",
         ba_2="BA_1994",
         ba_1="BA_1995",
         ba="BA_1996") %>%
  dplyr::select(-hits, -hits_2, -hits_1,-ab, -ab_1, -ab_2) 

ba1996 <- target_1996$ba
target_1996 <- target_1996 %>%
                select(-ba) %>%
                rename(age="age_at_1996") %>%
                as.matrix() 
                
##################### Training data for 1997:

target_1997 <- dat %>% dplyr::rename(ab_2="ab_1995", 
                                     ab_1="ab_1996", 
                                     ab="ab_1997") %>%
  select(!dplyr::contains("1994")) %>%
  select(-ab_50, -ba_50, -hits_50) %>%
  rename(hits_2="hits_1995", 
         hits_1="hits_1996", 
         hits="hits_1997",
         ba_2="BA_1995",
         ba_1="BA_1996",
         ba="BA_1997") %>%
  dplyr::select(-hits, -hits_2, -hits_1,-ab, -ab_1, -ab_2, -age_at_1996) 

ba1997 <- target_1997$ba
target_1997 <- target_1997 %>%
  select(-ba) %>%
  rename(age="age_at_1997") %>%
  select("ba_2", "ba_1", "age") %>%
  as.matrix() 
  
dtrain <- xgb.DMatrix(data = target_1996,
                      label=ba1996)

mod1 <- xgboost(data = dtrain, 
                      objective = "reg:squarederror",
                      max.depth = 2,
                      eta=0.3,
                      nthread = 10,
                      nrounds = 10)
## [1]  train-rmse:0.172746 
## [2]  train-rmse:0.126960 
## [3]  train-rmse:0.094575 
## [4]  train-rmse:0.071373 
## [5]  train-rmse:0.054567 
## [6]  train-rmse:0.042605 
## [7]  train-rmse:0.034214 
## [8]  train-rmse:0.028359 
## [9]  train-rmse:0.024263 
## [10] train-rmse:0.021105
pred <- predict(mod1, target_1997)
predictions$pred2 <- pred ## XGBoost

3. Approach 2: Bayesian approach (update prior beliefs with new available data):

This approach consists of three groups of parameters:

The first step is to estimate the parameters of a Beta distribution that results in the same mean and variance as the data from 1996. We assume the variance from the 1996 BA can be estimated from a Binomial distribution. For that purpose, we write the function GetBetaParameters below:

GetBetaParameters <- function(Mean, Var){
  alpha <- ((1 - Mean) / Var - 1 / Mean) * Mean ^ 2
  beta <- alpha * (1 / Mean - 1)
  params <- list(alpha=alpha, beta=beta)
  return(params)
}

We then use the first 50 At Bats from 1997 to update our initial belief. We write the function UpdateBeta that takes as input the parameters of the initial beta distribution (the prior) and updates it with the new data (likelihood from a Binomial distribution).

UpdateBeta <- function(priorAlpha, priorBeta, N, K){
  ## N: Number of Bernoulli trials
  ## K: Number of events among the Bernoulli trials
  NewAlpha <- priorAlpha + N
  NewBeta <- priorBeta + N - K
  newP <- NewAlpha/(NewAlpha + NewBeta)
  newparams <-list(alpha=NewAlpha, beta=NewBeta, posteriorP=newP)
  return(newparams)
}

We now call the functions we created and get predictions for 1997.

Wrapper <- function(priorN, priorK, newN, newK){
  priorMean <- priorK/priorN
  priorVar <- priorMean*(1-priorMean)/priorN
  priorPars <- GetBetaParameters(Mean = priorMean, Var = priorVar)
  newPars <- UpdateBeta(priorAlpha = priorPars$alpha, 
                        priorBeta =priorPars$beta,
                        N = newN,
                        K = newK)
  return(newPars)
}

predictions$pred3 <-NA
for(i in 1:nrow(predictions)){
  predictions$pred3[i] <- Wrapper(priorN = predictions$ab_1996[i],
          priorK = predictions$hits_1996[i],
          newN = 50,
          newK = predictions$hits_50[i])$posteriorP
}

#4. Comparison of the thre approaches

Finally, we compare the predictions we got from using the mean of 1996 (mse1), XGBoost (mse2), and the Bayesian approach (mse3). We see that simply using the mean from the previous year yields the estimates with the lowest MSE.

mse1 <- mean((predictions$BA_1997-predictions$pred1)^2)
mse2 <- mean((predictions$BA_1997-predictions$pred2)^2)
mse3 <- mean((predictions$BA_1997-predictions$pred3)^2)

mse <-data.frame(approach=c("1996 BA", "XGBoost", "Bayesian"),
                 mse=c(mse1, mse2, mse3))

predictions$d1_2 <- (predictions$BA_1997-predictions$pred1)^2
predictions$d2_2 <- (predictions$BA_1997-predictions$pred2)^2
predictions$d3_2 <- (predictions$BA_1997-predictions$pred3)^2

kable(predictions)
age_at_1997 ab_1994 ab_1995 ab_1996 ab_1997 ab_50 hits_1994 hits_1995 hits_1996 hits_1997 hits_50 BA_1994 BA_1995 BA_1996 BA_1997 ba_50 age_at_1996 pred1 pred2 pred3 d1_2 d2_2 d3_2
29 392 517 588 412 50 120 155 193 137 13 0.3061224 0.2998066 0.3282313 0.3325243 0.26 28 0.3282313 0.2932082 0.3600471 0.0000184 0.0015458 0.0007575
33 453 554 579 590 50 119 145 172 170 19 0.2626932 0.2617329 0.2970639 0.2881356 0.38 32 0.2970639 0.2981814 0.3364233 0.0000797 0.0001009 0.0023317
26 250 178 248 397 50 74 43 56 105 12 0.2960000 0.2415730 0.2258065 0.2644836 0.24 25 0.2258065 0.2622708 0.3157439 0.0014959 0.0000049 0.0026276
32 332 352 407 320 50 82 88 105 83 16 0.2469880 0.2500000 0.2579853 0.2593750 0.32 31 0.2579853 0.2673427 0.3158000 0.0000019 0.0000635 0.0031838
36 444 550 640 615 50 140 144 178 166 20 0.3153153 0.2618182 0.2781250 0.2699187 0.40 35 0.2781250 0.2636214 0.3167203 0.0000673 0.0000397 0.0021904
32 134 415 537 528 50 35 133 157 150 13 0.2611940 0.3204819 0.2923650 0.2840909 0.26 31 0.2923650 0.2791395 0.3317940 0.0000685 0.0000245 0.0022756
26 132 438 84 166 50 36 109 20 34 11 0.2727273 0.2488584 0.2380952 0.2048193 0.22 25 0.2380952 0.2622708 0.4055925 0.0011073 0.0033007 0.0403099
35 120 NA 415 158 50 22 NA 119 48 15 0.1833333 NA 0.2867470 0.3037975 0.30 34 0.2867470 0.2749053 0.3381027 0.0002907 0.0008348 0.0011768
31 436 554 626 614 50 139 172 181 156 13 0.3188073 0.3104693 0.2891374 0.2540717 0.26 30 0.2891374 0.2912042 0.3240321 0.0012296 0.0013788 0.0048945
33 189 216 234 228 50 49 63 52 54 11 0.2592593 0.2916667 0.2222222 0.2368421 0.22 32 0.2222222 0.2628336 0.3160801 0.0002137 0.0006756 0.0062787
32 143 150 174 259 50 39 40 40 66 9 0.2727273 0.2666667 0.2298851 0.2548263 0.18 31 0.2298851 0.2673427 0.3400383 0.0006221 0.0001567 0.0072611
31 391 428 525 509 50 99 113 126 120 8 0.2531969 0.2640187 0.2400000 0.2357564 0.16 30 0.2400000 0.2622708 0.2853247 0.0000180 0.0007030 0.0024570
kable(mse, caption="Mean Square error")
Mean Square error
approach mse
1996 BA 0.0004344
XGBoost 0.0007357
Bayesian 0.0063120