1. Load the clean data:

age_at_1997	ab_1994	ab_1995	ab_1996	ab_1997	ab_50	hits_1994	hits_1995	hits_1996	hits_1997	hits_50	BA_1994	BA_1995	BA_1996	BA_1997	ba_50	age_at_1996
29	392	517	588	412	50	120	155	193	137	13	0.3061224	0.2998066	0.3282313	0.3325243	0.26	28
33	453	554	579	590	50	119	145	172	170	19	0.2626932	0.2617329	0.2970639	0.2881356	0.38	32
26	250	178	248	397	50	74	43	56	105	12	0.2960000	0.2415730	0.2258065	0.2644836	0.24	25
32	332	352	407	320	50	82	88	105	83	16	0.2469880	0.2500000	0.2579853	0.2593750	0.32	31
36	444	550	640	615	50	140	144	178	166	20	0.3153153	0.2618182	0.2781250	0.2699187	0.40	35
32	134	415	537	528	50	35	133	157	150	13	0.2611940	0.3204819	0.2923650	0.2840909	0.26	31
26	132	438	84	166	50	36	109	20	34	11	0.2727273	0.2488584	0.2380952	0.2048193	0.22	25
35	120	NA	415	158	50	22	NA	119	48	15	0.1833333	NA	0.2867470	0.3037975	0.30	34
31	436	554	626	614	50	139	172	181	156	13	0.3188073	0.3104693	0.2891374	0.2540717	0.26	30
33	189	216	234	228	50	49	63	52	54	11	0.2592593	0.2916667	0.2222222	0.2368421	0.22	32
32	143	150	174	259	50	39	40	40	66	9	0.2727273	0.2666667	0.2298851	0.2548263	0.18	31
31	391	428	525	509	50	99	113	126	120	8	0.2531969	0.2640187	0.2400000	0.2357564	0.16	30

2. Approach 1: Extreme gradient boosting:

We assume that the batting average of Year “i” can be predicted by the batting average from years “i-1” and “i-2”. Thus, we first train a model to predict the BA from 1996 using data from 1994 and 1995. Then we use this model to predict the 1997 BA using as predictors the data from 1995 and 1996.

##################### Training data for 1996:
target_1996 <- dat %>% dplyr::rename(ab_2="ab_1994", 
                                     ab_1="ab_1995", 
                                     ab="ab_1996") %>%
  select(!dplyr::contains("1997")) %>%
  select(-ab_50, -ba_50, -hits_50) %>%
  rename(hits_2="hits_1994", 
         hits_1="hits_1995", 
         hits="hits_1996",
         ba_2="BA_1994",
         ba_1="BA_1995",
         ba="BA_1996") %>%
  dplyr::select(-hits, -hits_2, -hits_1,-ab, -ab_1, -ab_2) 

ba1996 <- target_1996$ba
target_1996 <- target_1996 %>%
                select(-ba) %>%
                rename(age="age_at_1996") %>%
                as.matrix() 
                
##################### Training data for 1997:

target_1997 <- dat %>% dplyr::rename(ab_2="ab_1995", 
                                     ab_1="ab_1996", 
                                     ab="ab_1997") %>%
  select(!dplyr::contains("1994")) %>%
  select(-ab_50, -ba_50, -hits_50) %>%
  rename(hits_2="hits_1995", 
         hits_1="hits_1996", 
         hits="hits_1997",
         ba_2="BA_1995",
         ba_1="BA_1996",
         ba="BA_1997") %>%
  dplyr::select(-hits, -hits_2, -hits_1,-ab, -ab_1, -ab_2, -age_at_1996) 

ba1997 <- target_1997$ba
target_1997 <- target_1997 %>%
  select(-ba) %>%
  rename(age="age_at_1997") %>%
  select("ba_2", "ba_1", "age") %>%
  as.matrix() 
  
dtrain <- xgb.DMatrix(data = target_1996,
                      label=ba1996)

mod1 <- xgboost(data = dtrain, 
                      objective = "reg:squarederror",
                      max.depth = 2,
                      eta=0.3,
                      nthread = 10,
                      nrounds = 10)

## [1]  train-rmse:0.172746 
## [2]  train-rmse:0.126960 
## [3]  train-rmse:0.094575 
## [4]  train-rmse:0.071373 
## [5]  train-rmse:0.054567 
## [6]  train-rmse:0.042605 
## [7]  train-rmse:0.034214 
## [8]  train-rmse:0.028359 
## [9]  train-rmse:0.024263 
## [10] train-rmse:0.021105

pred <- predict(mod1, target_1997)
predictions$pred2 <- pred ## XGBoost

3. Approach 2: Bayesian approach (update prior beliefs with new available data):

This approach consists of three groups of parameters:

Prior belief: Estimate from previous years (in this case, 1996). Modeled as a Beta distribution (Conjugate prior of a binomial distribution).
Likelihood (new data): Data from the first 50 At Bats from 1997. This data will be used to update our prior belief. The likelihood takes the form of a Binomial distribution.
Posterior distribution: Result of updating our prior belief of the distribution of BA once new information is available. Takes the form of a Beta distribution.

The first step is to estimate the parameters of a Beta distribution that results in the same mean and variance as the data from 1996. We assume the variance from the 1996 BA can be estimated from a Binomial distribution. For that purpose, we write the function GetBetaParameters below:

GetBetaParameters <- function(Mean, Var){
  alpha <- ((1 - Mean) / Var - 1 / Mean) * Mean ^ 2
  beta <- alpha * (1 / Mean - 1)
  params <- list(alpha=alpha, beta=beta)
  return(params)
}

We then use the first 50 At Bats from 1997 to update our initial belief. We write the function UpdateBeta that takes as input the parameters of the initial beta distribution (the prior) and updates it with the new data (likelihood from a Binomial distribution).

UpdateBeta <- function(priorAlpha, priorBeta, N, K){
  ## N: Number of Bernoulli trials
  ## K: Number of events among the Bernoulli trials
  NewAlpha <- priorAlpha + N
  NewBeta <- priorBeta + N - K
  newP <- NewAlpha/(NewAlpha + NewBeta)
  newparams <-list(alpha=NewAlpha, beta=NewBeta, posteriorP=newP)
  return(newparams)
}

We now call the functions we created and get predictions for 1997.

Wrapper <- function(priorN, priorK, newN, newK){
  priorMean <- priorK/priorN
  priorVar <- priorMean*(1-priorMean)/priorN
  priorPars <- GetBetaParameters(Mean = priorMean, Var = priorVar)
  newPars <- UpdateBeta(priorAlpha = priorPars$alpha, 
                        priorBeta =priorPars$beta,
                        N = newN,
                        K = newK)
  return(newPars)
}

predictions$pred3 <-NA
for(i in 1:nrow(predictions)){
  predictions$pred3[i] <- Wrapper(priorN = predictions$ab_1996[i],
          priorK = predictions$hits_1996[i],
          newN = 50,
          newK = predictions$hits_50[i])$posteriorP
}

#4. Comparison of the thre approaches

Finally, we compare the predictions we got from using the mean of 1996 (mse1), XGBoost (mse2), and the Bayesian approach (mse3). We see that simply using the mean from the previous year yields the estimates with the lowest MSE.

mse1 <- mean((predictions$BA_1997-predictions$pred1)^2)
mse2 <- mean((predictions$BA_1997-predictions$pred2)^2)
mse3 <- mean((predictions$BA_1997-predictions$pred3)^2)

mse <-data.frame(approach=c("1996 BA", "XGBoost", "Bayesian"),
                 mse=c(mse1, mse2, mse3))

predictions$d1_2 <- (predictions$BA_1997-predictions$pred1)^2
predictions$d2_2 <- (predictions$BA_1997-predictions$pred2)^2
predictions$d3_2 <- (predictions$BA_1997-predictions$pred3)^2

kable(predictions)

age_at_1997	ab_1994	ab_1995	ab_1996	ab_1997	ab_50	hits_1994	hits_1995	hits_1996	hits_1997	hits_50	BA_1994	BA_1995	BA_1996	BA_1997	ba_50	age_at_1996	pred1	pred2	pred3	d1_2	d2_2	d3_2
29	392	517	588	412	50	120	155	193	137	13	0.3061224	0.2998066	0.3282313	0.3325243	0.26	28	0.3282313	0.2932082	0.3600471	0.0000184	0.0015458	0.0007575
33	453	554	579	590	50	119	145	172	170	19	0.2626932	0.2617329	0.2970639	0.2881356	0.38	32	0.2970639	0.2981814	0.3364233	0.0000797	0.0001009	0.0023317
26	250	178	248	397	50	74	43	56	105	12	0.2960000	0.2415730	0.2258065	0.2644836	0.24	25	0.2258065	0.2622708	0.3157439	0.0014959	0.0000049	0.0026276
32	332	352	407	320	50	82	88	105	83	16	0.2469880	0.2500000	0.2579853	0.2593750	0.32	31	0.2579853	0.2673427	0.3158000	0.0000019	0.0000635	0.0031838
36	444	550	640	615	50	140	144	178	166	20	0.3153153	0.2618182	0.2781250	0.2699187	0.40	35	0.2781250	0.2636214	0.3167203	0.0000673	0.0000397	0.0021904
32	134	415	537	528	50	35	133	157	150	13	0.2611940	0.3204819	0.2923650	0.2840909	0.26	31	0.2923650	0.2791395	0.3317940	0.0000685	0.0000245	0.0022756
26	132	438	84	166	50	36	109	20	34	11	0.2727273	0.2488584	0.2380952	0.2048193	0.22	25	0.2380952	0.2622708	0.4055925	0.0011073	0.0033007	0.0403099
35	120	NA	415	158	50	22	NA	119	48	15	0.1833333	NA	0.2867470	0.3037975	0.30	34	0.2867470	0.2749053	0.3381027	0.0002907	0.0008348	0.0011768
31	436	554	626	614	50	139	172	181	156	13	0.3188073	0.3104693	0.2891374	0.2540717	0.26	30	0.2891374	0.2912042	0.3240321	0.0012296	0.0013788	0.0048945
33	189	216	234	228	50	49	63	52	54	11	0.2592593	0.2916667	0.2222222	0.2368421	0.22	32	0.2222222	0.2628336	0.3160801	0.0002137	0.0006756	0.0062787
32	143	150	174	259	50	39	40	40	66	9	0.2727273	0.2666667	0.2298851	0.2548263	0.18	31	0.2298851	0.2673427	0.3400383	0.0006221	0.0001567	0.0072611
31	391	428	525	509	50	99	113	126	120	8	0.2531969	0.2640187	0.2400000	0.2357564	0.16	30	0.2400000	0.2622708	0.2853247	0.0000180	0.0007030	0.0024570

kable(mse, caption="Mean Square error")

Mean Square error
approach	mse
1996 BA	0.0004344
XGBoost	0.0007357
Bayesian	0.0063120

Batting average

Author: Diego Veliz-Otani

Class Update: Causal Inference (PREV 794)

1. Load the clean data:

2. Approach 1: Extreme gradient boosting:

3. Approach 2: Bayesian approach (update prior beliefs with new available data):