Batting average

# install.packages("xgboost")

library(xgboost)
library(dplyr)
library(ggplot2)
library(knitr)
setwd("/mnt/d/Dropbox/umsom/III_2021_fall_semester/causal_inference/batting_average/bat_data")

raw_dat <- read.csv("batting_data_ready.csv")
raw_dat$id <- 1:nrow(raw_dat)
  
id_mapping <- raw_dat[,c("Name", "short_name", "id")]
dat<- raw_dat%>%select(-"Name", -"short_name", -"id")
dat$age_at_1996 <- dat$age_at_1997-1

predictions <- dat
predictions$pred1 <-predictions$BA_1996 ## Prediction based on the previous year

kable(dat)

age_at_1997	ab_1994	ab_1995	ab_1996	ab_1997	ab_50	hits_1994	hits_1995	hits_1996	hits_1997	hits_50	BA_1994	BA_1995	BA_1996	BA_1997	ba_50	age_at_1996
29	392	517	588	412	50	120	155	193	137	13	0.3061224	0.2998066	0.3282313	0.3325243	0.26	28
33	453	554	579	590	50	119	145	172	170	19	0.2626932	0.2617329	0.2970639	0.2881356	0.38	32
26	250	178	248	397	50	74	43	56	105	12	0.2960000	0.2415730	0.2258065	0.2644836	0.24	25
32	332	352	407	320	50	82	88	105	83	16	0.2469880	0.2500000	0.2579853	0.2593750	0.32	31
36	444	550	640	615	50	140	144	178	166	20	0.3153153	0.2618182	0.2781250	0.2699187	0.40	35
32	134	415	537	528	50	35	133	157	150	13	0.2611940	0.3204819	0.2923650	0.2840909	0.26	31
26	132	438	84	166	50	36	109	20	34	11	0.2727273	0.2488584	0.2380952	0.2048193	0.22	25
35	120	NA	415	158	50	22	NA	119	48	15	0.1833333	NA	0.2867470	0.3037975	0.30	34
31	436	554	626	614	50	139	172	181	156	13	0.3188073	0.3104693	0.2891374	0.2540717	0.26	30
33	189	216	234	228	50	49	63	52	54	11	0.2592593	0.2916667	0.2222222	0.2368421	0.22	32
32	143	150	174	259	50	39	40	40	66	9	0.2727273	0.2666667	0.2298851	0.2548263	0.18	31
31	391	428	525	509	50	99	113	126	120	8	0.2531969	0.2640187	0.2400000	0.2357564	0.16	30

##################### Training data for 1996:

target_1996 <- dat %>% dplyr::rename(ab_2="ab_1994", 
                                     ab_1="ab_1995", 
                                     ab="ab_1996") %>%
  select(!dplyr::contains("1997")) %>%
  select(-ab_50, -ba_50, -hits_50) %>%
  rename(hits_2="hits_1994", 
         hits_1="hits_1995", 
         hits="hits_1996",
         ba_2="BA_1994",
         ba_1="BA_1995",
         ba="BA_1996") %>%
  dplyr::select(-hits, -hits_2, -hits_1,-ab, -ab_1, -ab_2) 

ba1996 <- target_1996$ba
target_1996 <- target_1996 %>%
                select(-ba) %>%
                rename(age="age_at_1996") %>%
                as.matrix() 
                
######################################
##################### Training data for 1997:

target_1997 <- dat %>% dplyr::rename(ab_2="ab_1995", 
                                     ab_1="ab_1996", 
                                     ab="ab_1997") %>%
  select(!dplyr::contains("1994")) %>%
  select(-ab_50, -ba_50, -hits_50) %>%
  rename(hits_2="hits_1995", 
         hits_1="hits_1996", 
         hits="hits_1997",
         ba_2="BA_1995",
         ba_1="BA_1996",
         ba="BA_1997") %>%
  dplyr::select(-hits, -hits_2, -hits_1,-ab, -ab_1, -ab_2, -age_at_1996) 

ba1997 <- target_1997$ba
target_1997 <- target_1997 %>%
  select(-ba) %>%
  rename(age="age_at_1997") %>%
  select("ba_2", "ba_1", "age") %>%
  as.matrix() 
  
dtrain <- xgb.DMatrix(data = target_1996,
                      label=ba1996)

mod1 <- xgboost(data = dtrain, 
                      objective = "reg:squarederror",
                      max.depth = 2,
                      eta=0.3,
                      nthread = 10,
                      nrounds = 10)

## [1]  train-rmse:0.172746 
## [2]  train-rmse:0.126960 
## [3]  train-rmse:0.094575 
## [4]  train-rmse:0.071373 
## [5]  train-rmse:0.054567 
## [6]  train-rmse:0.042605 
## [7]  train-rmse:0.034214 
## [8]  train-rmse:0.028359 
## [9]  train-rmse:0.024263 
## [10] train-rmse:0.021105

pred <- predict(mod1, target_1997)
predictions$pred2 <- pred ## XGBoost

######################################################
#### BAYESIAN APPROACH
######################################################

GetBetaParameters <- function(Mean, Var){
  alpha <- ((1 - Mean) / Var - 1 / Mean) * Mean ^ 2
  beta <- alpha * (1 / Mean - 1)
  params <- list(alpha=alpha, beta=beta)
  return(params)
}

UpdateBeta <- function(priorAlpha, priorBeta, N, K){
  ## N: Number of Bernoulli trials
  ## K: Number of events among the Bernoulli trials
  NewAlpha <- priorAlpha + N
  NewBeta <- priorBeta + N - K
  newP <- NewAlpha/(NewAlpha + NewBeta)
  newparams <-list(alpha=NewAlpha, beta=NewBeta, posteriorP=newP)
  return(newparams)
}


Wrapper <- function(priorN, priorK, newN, newK){
  priorMean <- priorK/priorN
  priorVar <- priorMean*(1-priorMean)/priorN
  priorPars <- GetBetaParameters(Mean = priorMean, Var = priorVar)
  newPars <- UpdateBeta(priorAlpha = priorPars$alpha, 
                        priorBeta =priorPars$beta,
                        N = newN,
                        K = newK)
  return(newPars)
}

predictions$pred3 <-NA
for(i in 1:nrow(predictions)){
  predictions$pred3[i] <- Wrapper(priorN = predictions$ab_1996[i],
          priorK = predictions$hits_1996[i],
          newN = 50,
          newK = predictions$hits_50[i])$posteriorP
}

mse1 <- mean((predictions$BA_1997-predictions$pred1)^2)
mse2 <- mean((predictions$BA_1997-predictions$pred2)^2)
mse3 <- mean((predictions$BA_1997-predictions$pred3)^2)

mse <-data.frame(approach=c("1996 BA", "XGBoost", "Bayesian"),
                 mse=c(mse1, mse2, mse3))

predictions$d1_2 <- (predictions$BA_1997-predictions$pred1)^2
predictions$d2_2 <- (predictions$BA_1997-predictions$pred2)^2
predictions$d3_2 <- (predictions$BA_1997-predictions$pred3)^2

kable(predictions)

age_at_1997	ab_1994	ab_1995	ab_1996	ab_1997	ab_50	hits_1994	hits_1995	hits_1996	hits_1997	hits_50	BA_1994	BA_1995	BA_1996	BA_1997	ba_50	age_at_1996	pred1	pred2	pred3	d1_2	d2_2	d3_2
29	392	517	588	412	50	120	155	193	137	13	0.3061224	0.2998066	0.3282313	0.3325243	0.26	28	0.3282313	0.2932082	0.3600471	0.0000184	0.0015458	0.0007575
33	453	554	579	590	50	119	145	172	170	19	0.2626932	0.2617329	0.2970639	0.2881356	0.38	32	0.2970639	0.2981814	0.3364233	0.0000797	0.0001009	0.0023317
26	250	178	248	397	50	74	43	56	105	12	0.2960000	0.2415730	0.2258065	0.2644836	0.24	25	0.2258065	0.2622708	0.3157439	0.0014959	0.0000049	0.0026276
32	332	352	407	320	50	82	88	105	83	16	0.2469880	0.2500000	0.2579853	0.2593750	0.32	31	0.2579853	0.2673427	0.3158000	0.0000019	0.0000635	0.0031838
36	444	550	640	615	50	140	144	178	166	20	0.3153153	0.2618182	0.2781250	0.2699187	0.40	35	0.2781250	0.2636214	0.3167203	0.0000673	0.0000397	0.0021904
32	134	415	537	528	50	35	133	157	150	13	0.2611940	0.3204819	0.2923650	0.2840909	0.26	31	0.2923650	0.2791395	0.3317940	0.0000685	0.0000245	0.0022756
26	132	438	84	166	50	36	109	20	34	11	0.2727273	0.2488584	0.2380952	0.2048193	0.22	25	0.2380952	0.2622708	0.4055925	0.0011073	0.0033007	0.0403099
35	120	NA	415	158	50	22	NA	119	48	15	0.1833333	NA	0.2867470	0.3037975	0.30	34	0.2867470	0.2749053	0.3381027	0.0002907	0.0008348	0.0011768
31	436	554	626	614	50	139	172	181	156	13	0.3188073	0.3104693	0.2891374	0.2540717	0.26	30	0.2891374	0.2912042	0.3240321	0.0012296	0.0013788	0.0048945
33	189	216	234	228	50	49	63	52	54	11	0.2592593	0.2916667	0.2222222	0.2368421	0.22	32	0.2222222	0.2628336	0.3160801	0.0002137	0.0006756	0.0062787
32	143	150	174	259	50	39	40	40	66	9	0.2727273	0.2666667	0.2298851	0.2548263	0.18	31	0.2298851	0.2673427	0.3400383	0.0006221	0.0001567	0.0072611
31	391	428	525	509	50	99	113	126	120	8	0.2531969	0.2640187	0.2400000	0.2357564	0.16	30	0.2400000	0.2622708	0.2853247	0.0000180	0.0007030	0.0024570

kable(mse, caption="Mean Square error")

Mean Square error
approach	mse
1996 BA	0.0004344
XGBoost	0.0007357
Bayesian	0.0063120