# install.packages("xgboost")
library(xgboost)
library(dplyr)
library(ggplot2)
library(knitr)
setwd("/mnt/d/Dropbox/umsom/III_2021_fall_semester/causal_inference/batting_average/bat_data")
raw_dat <- read.csv("batting_data_ready.csv")
raw_dat$id <- 1:nrow(raw_dat)
id_mapping <- raw_dat[,c("Name", "short_name", "id")]
dat<- raw_dat%>%select(-"Name", -"short_name", -"id")
dat$age_at_1996 <- dat$age_at_1997-1
predictions <- dat
predictions$pred1 <-predictions$BA_1996 ## Prediction based on the previous year
kable(dat)
| 29 |
392 |
517 |
588 |
412 |
50 |
120 |
155 |
193 |
137 |
13 |
0.3061224 |
0.2998066 |
0.3282313 |
0.3325243 |
0.26 |
28 |
| 33 |
453 |
554 |
579 |
590 |
50 |
119 |
145 |
172 |
170 |
19 |
0.2626932 |
0.2617329 |
0.2970639 |
0.2881356 |
0.38 |
32 |
| 26 |
250 |
178 |
248 |
397 |
50 |
74 |
43 |
56 |
105 |
12 |
0.2960000 |
0.2415730 |
0.2258065 |
0.2644836 |
0.24 |
25 |
| 32 |
332 |
352 |
407 |
320 |
50 |
82 |
88 |
105 |
83 |
16 |
0.2469880 |
0.2500000 |
0.2579853 |
0.2593750 |
0.32 |
31 |
| 36 |
444 |
550 |
640 |
615 |
50 |
140 |
144 |
178 |
166 |
20 |
0.3153153 |
0.2618182 |
0.2781250 |
0.2699187 |
0.40 |
35 |
| 32 |
134 |
415 |
537 |
528 |
50 |
35 |
133 |
157 |
150 |
13 |
0.2611940 |
0.3204819 |
0.2923650 |
0.2840909 |
0.26 |
31 |
| 26 |
132 |
438 |
84 |
166 |
50 |
36 |
109 |
20 |
34 |
11 |
0.2727273 |
0.2488584 |
0.2380952 |
0.2048193 |
0.22 |
25 |
| 35 |
120 |
NA |
415 |
158 |
50 |
22 |
NA |
119 |
48 |
15 |
0.1833333 |
NA |
0.2867470 |
0.3037975 |
0.30 |
34 |
| 31 |
436 |
554 |
626 |
614 |
50 |
139 |
172 |
181 |
156 |
13 |
0.3188073 |
0.3104693 |
0.2891374 |
0.2540717 |
0.26 |
30 |
| 33 |
189 |
216 |
234 |
228 |
50 |
49 |
63 |
52 |
54 |
11 |
0.2592593 |
0.2916667 |
0.2222222 |
0.2368421 |
0.22 |
32 |
| 32 |
143 |
150 |
174 |
259 |
50 |
39 |
40 |
40 |
66 |
9 |
0.2727273 |
0.2666667 |
0.2298851 |
0.2548263 |
0.18 |
31 |
| 31 |
391 |
428 |
525 |
509 |
50 |
99 |
113 |
126 |
120 |
8 |
0.2531969 |
0.2640187 |
0.2400000 |
0.2357564 |
0.16 |
30 |
##################### Training data for 1996:
target_1996 <- dat %>% dplyr::rename(ab_2="ab_1994",
ab_1="ab_1995",
ab="ab_1996") %>%
select(!dplyr::contains("1997")) %>%
select(-ab_50, -ba_50, -hits_50) %>%
rename(hits_2="hits_1994",
hits_1="hits_1995",
hits="hits_1996",
ba_2="BA_1994",
ba_1="BA_1995",
ba="BA_1996") %>%
dplyr::select(-hits, -hits_2, -hits_1,-ab, -ab_1, -ab_2)
ba1996 <- target_1996$ba
target_1996 <- target_1996 %>%
select(-ba) %>%
rename(age="age_at_1996") %>%
as.matrix()
######################################
##################### Training data for 1997:
target_1997 <- dat %>% dplyr::rename(ab_2="ab_1995",
ab_1="ab_1996",
ab="ab_1997") %>%
select(!dplyr::contains("1994")) %>%
select(-ab_50, -ba_50, -hits_50) %>%
rename(hits_2="hits_1995",
hits_1="hits_1996",
hits="hits_1997",
ba_2="BA_1995",
ba_1="BA_1996",
ba="BA_1997") %>%
dplyr::select(-hits, -hits_2, -hits_1,-ab, -ab_1, -ab_2, -age_at_1996)
ba1997 <- target_1997$ba
target_1997 <- target_1997 %>%
select(-ba) %>%
rename(age="age_at_1997") %>%
select("ba_2", "ba_1", "age") %>%
as.matrix()
dtrain <- xgb.DMatrix(data = target_1996,
label=ba1996)
mod1 <- xgboost(data = dtrain,
objective = "reg:squarederror",
max.depth = 2,
eta=0.3,
nthread = 10,
nrounds = 10)
## [1] train-rmse:0.172746
## [2] train-rmse:0.126960
## [3] train-rmse:0.094575
## [4] train-rmse:0.071373
## [5] train-rmse:0.054567
## [6] train-rmse:0.042605
## [7] train-rmse:0.034214
## [8] train-rmse:0.028359
## [9] train-rmse:0.024263
## [10] train-rmse:0.021105
pred <- predict(mod1, target_1997)
predictions$pred2 <- pred ## XGBoost
######################################################
#### BAYESIAN APPROACH
######################################################
GetBetaParameters <- function(Mean, Var){
alpha <- ((1 - Mean) / Var - 1 / Mean) * Mean ^ 2
beta <- alpha * (1 / Mean - 1)
params <- list(alpha=alpha, beta=beta)
return(params)
}
UpdateBeta <- function(priorAlpha, priorBeta, N, K){
## N: Number of Bernoulli trials
## K: Number of events among the Bernoulli trials
NewAlpha <- priorAlpha + N
NewBeta <- priorBeta + N - K
newP <- NewAlpha/(NewAlpha + NewBeta)
newparams <-list(alpha=NewAlpha, beta=NewBeta, posteriorP=newP)
return(newparams)
}
Wrapper <- function(priorN, priorK, newN, newK){
priorMean <- priorK/priorN
priorVar <- priorMean*(1-priorMean)/priorN
priorPars <- GetBetaParameters(Mean = priorMean, Var = priorVar)
newPars <- UpdateBeta(priorAlpha = priorPars$alpha,
priorBeta =priorPars$beta,
N = newN,
K = newK)
return(newPars)
}
predictions$pred3 <-NA
for(i in 1:nrow(predictions)){
predictions$pred3[i] <- Wrapper(priorN = predictions$ab_1996[i],
priorK = predictions$hits_1996[i],
newN = 50,
newK = predictions$hits_50[i])$posteriorP
}
mse1 <- mean((predictions$BA_1997-predictions$pred1)^2)
mse2 <- mean((predictions$BA_1997-predictions$pred2)^2)
mse3 <- mean((predictions$BA_1997-predictions$pred3)^2)
mse <-data.frame(approach=c("1996 BA", "XGBoost", "Bayesian"),
mse=c(mse1, mse2, mse3))
predictions$d1_2 <- (predictions$BA_1997-predictions$pred1)^2
predictions$d2_2 <- (predictions$BA_1997-predictions$pred2)^2
predictions$d3_2 <- (predictions$BA_1997-predictions$pred3)^2
kable(predictions)
| 29 |
392 |
517 |
588 |
412 |
50 |
120 |
155 |
193 |
137 |
13 |
0.3061224 |
0.2998066 |
0.3282313 |
0.3325243 |
0.26 |
28 |
0.3282313 |
0.2932082 |
0.3600471 |
0.0000184 |
0.0015458 |
0.0007575 |
| 33 |
453 |
554 |
579 |
590 |
50 |
119 |
145 |
172 |
170 |
19 |
0.2626932 |
0.2617329 |
0.2970639 |
0.2881356 |
0.38 |
32 |
0.2970639 |
0.2981814 |
0.3364233 |
0.0000797 |
0.0001009 |
0.0023317 |
| 26 |
250 |
178 |
248 |
397 |
50 |
74 |
43 |
56 |
105 |
12 |
0.2960000 |
0.2415730 |
0.2258065 |
0.2644836 |
0.24 |
25 |
0.2258065 |
0.2622708 |
0.3157439 |
0.0014959 |
0.0000049 |
0.0026276 |
| 32 |
332 |
352 |
407 |
320 |
50 |
82 |
88 |
105 |
83 |
16 |
0.2469880 |
0.2500000 |
0.2579853 |
0.2593750 |
0.32 |
31 |
0.2579853 |
0.2673427 |
0.3158000 |
0.0000019 |
0.0000635 |
0.0031838 |
| 36 |
444 |
550 |
640 |
615 |
50 |
140 |
144 |
178 |
166 |
20 |
0.3153153 |
0.2618182 |
0.2781250 |
0.2699187 |
0.40 |
35 |
0.2781250 |
0.2636214 |
0.3167203 |
0.0000673 |
0.0000397 |
0.0021904 |
| 32 |
134 |
415 |
537 |
528 |
50 |
35 |
133 |
157 |
150 |
13 |
0.2611940 |
0.3204819 |
0.2923650 |
0.2840909 |
0.26 |
31 |
0.2923650 |
0.2791395 |
0.3317940 |
0.0000685 |
0.0000245 |
0.0022756 |
| 26 |
132 |
438 |
84 |
166 |
50 |
36 |
109 |
20 |
34 |
11 |
0.2727273 |
0.2488584 |
0.2380952 |
0.2048193 |
0.22 |
25 |
0.2380952 |
0.2622708 |
0.4055925 |
0.0011073 |
0.0033007 |
0.0403099 |
| 35 |
120 |
NA |
415 |
158 |
50 |
22 |
NA |
119 |
48 |
15 |
0.1833333 |
NA |
0.2867470 |
0.3037975 |
0.30 |
34 |
0.2867470 |
0.2749053 |
0.3381027 |
0.0002907 |
0.0008348 |
0.0011768 |
| 31 |
436 |
554 |
626 |
614 |
50 |
139 |
172 |
181 |
156 |
13 |
0.3188073 |
0.3104693 |
0.2891374 |
0.2540717 |
0.26 |
30 |
0.2891374 |
0.2912042 |
0.3240321 |
0.0012296 |
0.0013788 |
0.0048945 |
| 33 |
189 |
216 |
234 |
228 |
50 |
49 |
63 |
52 |
54 |
11 |
0.2592593 |
0.2916667 |
0.2222222 |
0.2368421 |
0.22 |
32 |
0.2222222 |
0.2628336 |
0.3160801 |
0.0002137 |
0.0006756 |
0.0062787 |
| 32 |
143 |
150 |
174 |
259 |
50 |
39 |
40 |
40 |
66 |
9 |
0.2727273 |
0.2666667 |
0.2298851 |
0.2548263 |
0.18 |
31 |
0.2298851 |
0.2673427 |
0.3400383 |
0.0006221 |
0.0001567 |
0.0072611 |
| 31 |
391 |
428 |
525 |
509 |
50 |
99 |
113 |
126 |
120 |
8 |
0.2531969 |
0.2640187 |
0.2400000 |
0.2357564 |
0.16 |
30 |
0.2400000 |
0.2622708 |
0.2853247 |
0.0000180 |
0.0007030 |
0.0024570 |
kable(mse, caption="Mean Square error")
Mean Square error
| 1996 BA |
0.0004344 |
| XGBoost |
0.0007357 |
| Bayesian |
0.0063120 |