# import the libraries we'll need
library(tidyverse)
library(caret)
library(pscl)

# lead()/lag() compare each row with its neighbour, so sort the contacts by
# id_master (treated here as chronological order) before looking at the next row
contact_order <- order(master$id_master)
master <- master[contact_order, ]

# create output situation column (only filled in for Attack rows)
## next row's add_cover is "Block"     -> take two_touch_future (what happens directly after the Block)
## next row's add_cover is not "Block" -> take one_touch_future (what happens directly after the Attack)
is_attack <- master$skill == "Attack"
next_contact <- lead(master$add_cover, 1)

master$output_situation <- NA
master$output_situation <- ifelse(is_attack & next_contact == "Block",
                                  master$two_touch_future,
                                  master$output_situation)
master$output_situation <- ifelse(is_attack & next_contact != "Block",
                                  master$one_touch_future,
                                  master$output_situation)

# terminal attack codes override whatever was picked up from the following touches
terminal_labels <- c("Attack #" = "Kill",
                     "Attack =" = "Attack Error",
                     "Attack /" = "Attack Blocked")
for (skq_code in names(terminal_labels)) {
  master$output_situation <- ifelse(is_attack & master$skq == skq_code,
                                    terminal_labels[[skq_code]],
                                    master$output_situation)
}

# anything outside this list of logical outcomes is discarded (set to NA)
valid_outputs <- c("Kill", "Attack Error", "Attack Blocked",
                   "Dig +", "Dig #", "Dig -",
                   "Cover #", "Cover +", "Cover -")
master$output_situation <- ifelse(master$output_situation %in% valid_outputs,
                                  master$output_situation,
                                  NA)

# find the expected value of each unique output situation by using rally_eff (does the team win or lose the rally at any point after this contact)
## if won rally, 1. if lost rally, -1. then find the mean to determine efficiency.

# drop eV_output columns left behind by an earlier run of this section.
# without this, re-running on an already-merged `master` makes merge() suffix the
# clashing columns and warn that 'eV_output.x' / 'eV_output.y' are duplicated,
# leaving no single eV_output column to work with. on a fresh `master` this is a no-op.
stale_ev_cols <- grepl("^eV_output(\\.[xy])?$", names(master))
master <- master[, !stale_ev_cols, drop = FALSE]

a <- aggregate(rally_eff ~ output_situation, subset(master, skill == "Attack"), mean)
colnames(a)[2] <- "eV_output"

# merge this dataframe of outputs and expected values with the main dataframe of all contacts
## all.x = TRUE keeps contacts without an output situation (their eV_output is NA)
master <- merge(master, a, by = "output_situation", all.x = TRUE)
# merge() does not preserve row order, so restore the id_master ordering
master <- master[order(master$id_master), ]

# volleymetrics sometimes has illogical touches that create non-exact efficiencies. fix that so terminal is always 1 or -1
## rows with an NA output_situation stay NA here (ifelse() propagates the NA test)
## NOTE(review): "Attack Blocked" is labelled as terminal above but is not pinned to -1 here - confirm whether that is intended
master$eV_output <- ifelse(master$output_situation == "Attack Error", -1, master$eV_output)
master$eV_output <- ifelse(master$output_situation == "Kill", 1, master$eV_output)


# overall Attack Expected Value for every individual team (rows flagged enoughdata == "yes" only)
# the same table is kept twice: `a` describes team1, `b` describes the opponent (team2)
a <- aggregate(eV_output ~ team, subset(master, skill == "Attack" & enoughdata == "yes"), mean)
b <- setNames(a, c("team2", "eV_attack_t2"))
a <- setNames(a, c("team1", "eV_attack_t1"))
# a and b share no column names, so this is a cross join: every team1 paired with every team2
c <- merge(a, b, by = NULL)

# one row per team / opponent / match / set / set result
## the summed count is only a by-product of aggregate(), so drop it
d <- aggregate(count ~ team * opponent * match_id * set_id * teamwonset,
               subset(master, enoughdata == "yes"),
               sum)
d <- d[, names(d) != "count", drop = FALSE]
colnames(d) <- c("team1", "team2", "match_id", "set_id", "won_the_set")

# attach both teams' expected values to every set row (joined on team1 + team2)
## then create the Attack Expected Value difference between the two teams competing in the match
e <- merge(c, d, by = c("team1", "team2"))
e$eV_attack_diff <- e$eV_attack_t1 - e$eV_attack_t2
cor(e$eV_attack_diff, e$won_the_set)
## [1] 0.4332513
# set seed to be able to replicate this run
## create training and testing sets using a 80/20 split
### train the model on the 80% of the data
#### show the model the 20% it hasn't seen and ask it to predict who won the set
##### if model predicts 50%+ chance of winning, classify it as a predicted win
set.seed(123)
intrain <- createDataPartition(e$won_the_set, p = 0.8, list = FALSE)
training_set <- e[intrain,]
testing_set <- e[-intrain,]
glm.fit <- glm(won_the_set ~ eV_attack_diff, family = binomial, data = training_set)
# type = "response" returns the predicted probability that won_the_set is 1
glm.probs <- predict(glm.fit, newdata = testing_set, type = "response")
# a probability of 0.5 or more is a predicted win (1), anything lower a predicted loss (0)
## the earlier version used `glm.probs < 0.5` for a win, which inverted every prediction
glm.pred <- ifelse(glm.probs >= 0.5, 1, 0)

# overall summary of the model
summary(glm.fit)
## 
## Call:
## glm(formula = won_the_set ~ eV_attack_diff, family = binomial, 
##     data = training_set)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.7893  -0.9999  -0.2096   1.0007   2.7907  
## 
## Coefficients:
##                 Estimate Std. Error z value Pr(>|z|)    
## (Intercept)    -0.002038   0.011320   -0.18    0.857    
## eV_attack_diff 18.042503   0.233326   77.33   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 53683  on 38723  degrees of freedom
## Residual deviance: 45455  on 38722  degrees of freedom
## AIC: 45459
## 
## Number of Fisher Scoring iterations: 4
# pseudo-R2 value to measure "variance explained by model"
pR2(glm.fit)
## fitting null model for pseudo-r2
##           llh       llhNull            G2      McFadden          r2ML          r2CU 
## -2.272747e+04 -2.684132e+04  8.227697e+03  1.532655e-01  1.914156e-01  2.552213e-01
# confusion matrix to test accuracy of model at prediction
## rows are the predicted class, columns are the actual set result
## NOTE: the matrix and accuracy shown below were obtained by swapping the rows of the
## output recorded with the inverted threshold (the corrected rule is its exact
## complement); re-knit to confirm
cm <- table(glm.pred, testing_set$won_the_set)
cm
##         
## glm.pred    0    1
##        0 3252 1576
##        1 1544 3309
# how often is the model correct at predicting the future?
## correct predictions sit on the diagonal of the confusion matrix
accuracy <- (cm[1,1] + cm[2,2])/sum(cm)
accuracy
## [1] 0.6777192
# popbio provides logi.hist.plot: a logistic curve of set result against
# eV_attack_diff, drawn with histograms of how the data is distributed for W vs. L
library(popbio)
logi.hist.plot(
  e$eV_attack_diff,
  e$won_the_set,
  type = "hist",
  boxp = FALSE,
  col = "gray"
)

# all attack contacts from rows flagged enoughdata == "yes" (subset once, reused below)
attacks <- subset(master, skill == "Attack" & enoughdata == "yes")

# find the number of swings per player and only keep them if they have > 100 attacks
## a player is identified by team + player_name, which is how the swings are counted
z <- aggregate(count ~ team * player_name, attacks, sum)
y <- subset(z, count > 100)

# keep only attacks by qualifying team + player_name pairs
## filtering on player_name alone would also let through same-named players on other
## teams, or a qualifying player's attacks for a team where they have <= 100 swings
qualified_keys <- paste(y$team, y$player_name, sep = "\t")
attack_keys <- paste(attacks$team, attacks$player_name, sep = "\t")
qualified_attacks <- attacks[attack_keys %in% qualified_keys, , drop = FALSE]

# find the historical eV, historical hitting eff, and actual (per match) hitting eff for each player
a <- aggregate(eV_output ~ team * player_name, qualified_attacks, mean)
b <- aggregate(poss_eff ~ team * player_name, qualified_attacks, mean)
c <- aggregate(poss_eff ~ team * match_id * player_name, qualified_attacks, mean)
colnames(a)[3] <- "historical_eV"
colnames(b)[3] <- "historical_atkeff"
colnames(c)[4] <- "actual_atkeff"
# both merges join on team + player_name; e ends up with one row per player per match
d <- merge(a, b, by = c("team", "player_name"))
e <- merge(d, c, by = c("team", "player_name"))

# check the correlation between each of the variables
## NOTE: the values below were recorded when players were filtered by player_name only;
## re-run to refresh them for the team + player_name filter
cor(e$historical_eV, e$actual_atkeff)
## [1] 0.3842989
cor(e$historical_atkeff, e$actual_atkeff)
## [1] 0.3868309
cor(e$historical_eV, e$historical_atkeff)
## [1] 0.9938573