# step3_attackefficiency.R
# Overall Attack Efficiency per team.
## Per the original notes, poss_eff scores each attack as 1, 0 or -1, so the
## mean of poss_eff over a team's attacks is (sum of outcomes) / (attempts).
### Only rows with skill == "Attack" and enoughdata == "yes" are used.
#### `b` is the same table relabelled for the opponent (team2). `a` and `b`
#### share no column names, so merging them yields every team1 x team2
#### pairing (a Cartesian product), each row carrying both efficiencies.
a <- setNames(
  aggregate(
    poss_eff ~ team,
    data = subset(master, skill == "Attack" & enoughdata == "yes"),
    FUN = mean
  ),
  c("team1", "AtkEff_t1")
)
b <- setNames(a, c("team2", "AtkEff_t2"))
c <- merge(a, b, by = NULL)

# One row per team / opponent / match / set combination, together with the
# teamwonset flag for that combination.
## `count` is summed only so aggregate() has something to aggregate; the total
## itself is not needed afterwards, so the column is dropped.
d <- aggregate(
  count ~ team*opponent*match_id*set_id*teamwonset,
  data = subset(master, enoughdata == "yes"),
  FUN = sum
)
names(d) <- c("team1", "team2", "match_id", "set_id", "won_the_set", "count")
d <- d[, names(d) != "count", drop = FALSE]

# Attach both teams' overall Attack Efficiency to every set row.
## The two tables share only the team1 and team2 columns, so those are the
## join keys (match_id and set_id come along from `d`).
### AtkEff_diff is team1's efficiency minus team2's; its correlation with the
### set outcome is printed as a first sanity check.
e <- merge(c, d, by = c("team1", "team2"))
e[["AtkEff_diff"]] <- e[["AtkEff_t1"]] - e[["AtkEff_t2"]]
with(e, cor(AtkEff_diff, won_the_set))
## [1] 0.4303234
# load libraries - pscl for the pR2 function to get pseudo-R2 values
## caret for the createDataPartition function we use to split train/test sets
library(pscl)
library(caret)

# set seed to be able to replicate this run
## create training and testing sets using an 80/20 split
### train the logistic model on the 80% of the data
#### show the model the 20% it hasn't seen and ask it to predict who won the set
##### if model predicts 50%+ chance of winning, classify it as a predicted win
set.seed(123)
intrain <- createDataPartition(e$won_the_set, p = 0.8, list = FALSE)
training_set <- e[intrain, ]
testing_set <- e[-intrain, ]
glm.fit <- glm(won_the_set ~ AtkEff_diff, family = binomial, data = training_set)
# type = "response" returns predicted probabilities of won_the_set == 1
glm.probs <- predict(glm.fit, newdata = testing_set, type = "response")
# Predicted win (1) when the win probability is at least 50%, otherwise loss (0).
# The comparison used to be `glm.probs < 0.5`, which labelled likely losses as
# wins and produced a below-chance accuracy.
glm.pred <- ifelse(glm.probs >= 0.5, 1, 0)

# overall summary of the model
summary(glm.fit)
## 
## Call:
## glm(formula = won_the_set ~ AtkEff_diff, family = binomial, data = training_set)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.8217  -1.0020  -0.2194   1.0034   2.8813  
## 
## Coefficients:
##              Estimate Std. Error z value            Pr(>|z|)    
## (Intercept) -0.005691   0.011276  -0.505               0.614    
## AtkEff_diff 19.500205   0.252967  77.086 <0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 53949  on 38915  degrees of freedom
## Residual deviance: 45782  on 38914  degrees of freedom
## AIC: 45786
## 
## Number of Fisher Scoring iterations: 4
# pseudo-R2 value to measure "variance explained by model"
pR2(glm.fit)
## fitting null model for pseudo-r2
##            llh        llhNull             G2       McFadden           r2ML           r2CU 
## -22891.0889843 -26974.2918115   8166.4056543      0.1513739      0.1892917      0.2523899
# confusion matrix to test accuracy of model at prediction
## rows are the predicted class, columns the actual set outcome
cm <- table(glm.pred, testing_set$won_the_set)
cm
##         
## glm.pred    0    1
##        0 3257 1586
##        1 1542 3344
# how often is the model correct at predicting the future?
## correct predictions sit on the diagonal of the confusion matrix
accuracy <- sum(diag(cm)) / sum(cm)
accuracy
## [1] 0.678487
# load library and draw a logistic plot w/ histograms of how data is distributed for W vs. L
library(popbio)
# Plot set outcome against the attack-efficiency difference for all rows of
# `e`, using the histogram display type with box plots switched off.
with(
  e,
  logi.hist.plot(
    AtkEff_diff,
    won_the_set,
    type = "hist",
    boxp = FALSE,
    col = "gray"
  )
)