Part 1: Use only blue team data

Data Preparation

# packages used throughout (the knitted output below assumes they are loaded)
library(MASS)               # stepAIC
library(dplyr)              # %>% and mutate
library(tidyr)              # gather
library(ggplot2)            # faceted scatterplots
library(corrplot)           # correlation plot
library(ROCR)               # prediction, performance
library(ResourceSelection)  # hoslem.test
library(lmtest)             # lrtest

setwd('C:\\Zillionaire\\Study_Material\\HU\\HU Courses\\ANLY 510\\final pj')
lol <- read.csv('bluedata.csv')  # read.csv already returns a data.frame
# clean data: check for NAs
sapply(lol,function(x) sum(is.na(x)))
##                     blueWins              blueWardsPlaced 
##                            0                            0 
##           blueWardsDestroyed               blueFirstBlood 
##                            0                            0 
##                    blueKills                   blueDeaths 
##                            0                            0 
##                  blueAssists                blueTotalGold 
##                            0                            0 
##                 blueAvgLevel          blueTotalExperience 
##                            0                            0 
##       blueTotalMinionsKilled blueTotalJungleMinionsKilled 
##                            0                            0
# no NAs found, so no imputation is needed
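For reference, an equivalent one-liner for the same NA count:

colSums(is.na(lol))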

# split into training and test sets
lol_train <- lol[1:9000,]
lol_test <- lol[9001:9879,]

# check for class imbalance between wins and losses
table(lol_train$blueWins)
## 
##    0    1 
## 4492 4508
table(lol_test$blueWins)
## 
##   0   1 
## 457 422
# the counts of 0s and 1s are nearly equal in both sets, so the classes are balanced
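Because the split above simply takes the first 9000 rows, any ordering in the CSV would leak into the split. A minimal randomized alternative, as a hedged sketch (the lol_train_r/lol_test_r names are illustrative, not part of the original analysis):

set.seed(510)               # arbitrary seed for reproducibility
idx <- sample(nrow(lol), 9000)
lol_train_r <- lol[idx, ]   # randomized training set
lol_test_r  <- lol[-idx, ]  # remaining rows as test set
table(lol_train_r$blueWins) # re-check balance after shuffling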

Check Assumptions

#multicollinearity
lol_m1 <- cor(lol)
corrplot(lol_m1, method = "circle")

# gold, kills, experience, and level show noticeably higher pairwise correlations and need special attention
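To pin down which pairs drive that observation, the lol_m1 matrix computed above can be filtered numerically; a small sketch (the 0.7 threshold is an arbitrary choice, not from the original analysis):

high_cor <- which(abs(lol_m1) > 0.7 & upper.tri(lol_m1), arr.ind = TRUE)
data.frame(var1 = rownames(lol_m1)[high_cor[, 1]],  # first variable of each pair
           var2 = colnames(lol_m1)[high_cor[, 2]],  # second variable
           r    = lol_m1[high_cor])                 # their correlation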

Fit the model

#stepwise regression
lrm1 <- glm(blueWins ~ .,data = lol_train, family = 'binomial') %>% stepAIC(trace = F)
summary(lrm1)
## 
## Call:
## glm(formula = blueWins ~ blueFirstBlood + blueKills + blueDeaths + 
##     blueAssists + blueTotalGold + blueTotalJungleMinionsKilled, 
##     family = "binomial", data = lol_train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.6466  -0.9089   0.1409   0.9091   2.6879  
## 
## Coefficients:
##                                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                  -8.444e+00  5.796e-01 -14.568  < 2e-16 ***
## blueFirstBlood                8.472e-02  5.147e-02   1.646  0.09978 .  
## blueKills                     9.560e-02  2.297e-02   4.163 3.15e-05 ***
## blueDeaths                   -2.594e-01  1.021e-02 -25.398  < 2e-16 ***
## blueAssists                  -2.856e-02  1.070e-02  -2.668  0.00763 ** 
## blueTotalGold                 5.471e-04  4.331e-05  12.632  < 2e-16 ***
## blueTotalJungleMinionsKilled  1.122e-02  2.819e-03   3.980 6.89e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 12476.6  on 8999  degrees of freedom
## Residual deviance:  9806.4  on 8993  degrees of freedom
## AIC: 9820.4
## 
## Number of Fisher Scoring iterations: 4
# remove blueFirstBlood and blueAssists
lrm1_rev <- glm(blueWins ~ blueKills + blueDeaths + blueTotalGold 
            + blueTotalJungleMinionsKilled,data = lol_train, family = 'binomial')
summary(lrm1_rev)
## 
## Call:
## glm(formula = blueWins ~ blueKills + blueDeaths + blueTotalGold + 
##     blueTotalJungleMinionsKilled, family = "binomial", data = lol_train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.6580  -0.9095   0.1405   0.9133   2.6879  
## 
## Coefficients:
##                                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                  -8.326e+00  5.756e-01 -14.467  < 2e-16 ***
## blueKills                     7.227e-02  2.097e-02   3.446  0.00057 ***
## blueDeaths                   -2.616e-01  1.000e-02 -26.156  < 2e-16 ***
## blueTotalGold                 5.379e-04  4.269e-05  12.600  < 2e-16 ***
## blueTotalJungleMinionsKilled  1.209e-02  2.789e-03   4.333 1.47e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 12476.6  on 8999  degrees of freedom
## Residual deviance:  9816.2  on 8995  degrees of freedom
## AIC: 9826.2
## 
## Number of Fisher Scoring iterations: 4
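Since lrm1_rev is nested in lrm1 (the same model minus blueFirstBlood and blueAssists), the removal can also be checked with a formal deviance test rather than p-values alone; a one-line sketch:

anova(lrm1_rev, lrm1, test = "Chisq")  # likelihood ratio test of the two dropped terms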
# since blueKills and blueTotalGold seem highly correlated, remove blueKills as well
lrm1_rev_1 <- glm(blueWins ~ blueDeaths + blueTotalGold 
            + blueTotalJungleMinionsKilled,data = lol_train, family = 'binomial')
summary(lrm1_rev_1)
## 
## Call:
## glm(formula = blueWins ~ blueDeaths + blueTotalGold + blueTotalJungleMinionsKilled, 
##     family = "binomial", data = lol_train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.6929  -0.9068   0.1321   0.9083   2.6397  
## 
## Coefficients:
##                                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                  -9.922e+00  3.483e-01 -28.483  < 2e-16 ***
## blueDeaths                   -2.529e-01  9.636e-03 -26.244  < 2e-16 ***
## blueTotalGold                 6.698e-04  1.958e-05  34.205  < 2e-16 ***
## blueTotalJungleMinionsKilled  8.375e-03  2.570e-03   3.259  0.00112 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 12476.6  on 8999  degrees of freedom
## Residual deviance:  9828.1  on 8996  degrees of freedom
## AIC: 9836.1
## 
## Number of Fisher Scoring iterations: 4
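To interpret the final blue-only model on the odds scale, the coefficients can be exponentiated together with profile confidence intervals; this reuses only objects defined above:

exp(cbind(OR = coef(lrm1_rev_1), confint(lrm1_rev_1)))  # odds ratios with 95% CIs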
# scatterplots to check that each model predictor is linear in the logit
lrm1_predictors <- c("blueDeaths", "blueTotalGold", "blueTotalJungleMinionsKilled")
lrm1_probabilities <- predict(lrm1_rev_1, type = "response")
lrm1_plot <- lol_train[lrm1_predictors] %>%
  mutate(logit = log(lrm1_probabilities/(1 - lrm1_probabilities))) %>%
  gather(key = "predictors", value = "predictor.value", -logit)

ggplot(lrm1_plot, aes(logit, predictor.value))+
  geom_point(size = 0.5, alpha = 0.5) +
  geom_smooth(method = "loess") + 
  theme_bw() + 
  facet_wrap(~predictors, scales = "free_y")
## `geom_smooth()` using formula 'y ~ x'

Use the test set to predict and run post hoc tests

# prediction accuracy on the test set
lrm1_fit <- predict(lrm1_rev_1, newdata = lol_test, type = 'response')
lrm1_pred <- ifelse(lrm1_fit > 0.5,1,0)

lrm1_prob <- 1 - mean(lrm1_pred != lol_test$blueWins)
lrm1_prob
## [1] 0.698521
# accuracy is 0.699, which is reasonably good
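Accuracy alone hides the direction of the errors; a confusion matrix from the same predictions makes them visible:

table(predicted = lrm1_pred, actual = lol_test$blueWins)  # rows: predicted, cols: actual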

#ROC curve & AUC value
lrm1_pred_per <- prediction(lrm1_fit, lol_test$blueWins)
lrm1_per <- performance(lrm1_pred_per, measure = "tpr", x.measure = "fpr")
plot(lrm1_per)

# the ROC curve bows well above the diagonal, indicating good discrimination

auc <- performance(lrm1_pred_per, measure = "auc")
auc <- auc@y.values[[1]]
auc
## [1] 0.7826128
# AUC is 0.78, meaning the model discriminates well on the test set
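The 0.5 cutoff used above is a default rather than a necessity; the ROCR performance object already stores every candidate cutoff, so the one maximizing Youden's J (TPR - FPR) can be read off directly, as a sketch:

cutoffs <- lrm1_per@alpha.values[[1]]                 # candidate cutoffs
j <- lrm1_per@y.values[[1]] - lrm1_per@x.values[[1]]  # TPR - FPR at each cutoff
cutoffs[which.max(j)]                                 # cutoff with the best trade-off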

#Hosmer-Lemeshow goodness of fit
hoslem.test(lol_train$blueWins, fitted(lrm1_rev_1))
## 
##  Hosmer and Lemeshow goodness of fit (GOF) test
## 
## data:  lol_train$blueWins, fitted(lrm1_rev_1)
## X-squared = 4.9403, df = 8, p-value = 0.7639
# p-value is 0.7639 (> 0.05): no evidence of lack of fit
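As one more fit summary, a McFadden pseudo-R-squared falls straight out of the deviances reported above (1 - 9828.1/12476.6 is roughly 0.21):

1 - lrm1_rev_1$deviance / lrm1_rev_1$null.deviance  # McFadden pseudo R-squared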

Part 2: Model with both blue & red team variables

Data Preparation

lol_2 <- read.csv('blueandreddata.csv')
lol_2_train <- lol_2[1:9000,]
lol_2_test <- lol_2[9001:9879,]

Check Assumptions

#multicollinearity
lol_m2 <- cor(lol_2)
corrplot(lol_m2, method = "circle")

# same issue as the blue-only data; also note blueDeaths equals redKills, and blueFirstBlood is the complement of redFirstBlood (and vice versa)
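Since those red-side columns mirror blue-side ones exactly, one option is to drop them before fitting so the stepwise search never sees perfectly collinear pairs; a sketch, assuming the mirrored columns are named redKills, redDeaths, and redFirstBlood:

lol_2_trim <- subset(lol_2, select = -c(redKills, redDeaths, redFirstBlood))  # drop mirrors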

Fit the model

#stepwise regression
lrm2 <- glm(blueWins ~ .,data = lol_2_train, family = 'binomial') %>% stepAIC(trace = F)
summary(lrm2)
## 
## Call:
## glm(formula = blueWins ~ blueWardsPlaced + blueFirstBlood + blueKills + 
##     blueTotalGold + blueTotalExperience + blueTotalJungleMinionsKilled + 
##     redWardsPlaced + redTotalGold + redTotalExperience + redTotalMinionsKilled, 
##     family = "binomial", data = lol_2_train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.5708  -0.8867   0.1198   0.8810   2.6806  
## 
## Coefficients:
##                                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                   3.136e-01  8.227e-01   0.381   0.7031    
## blueWardsPlaced              -1.938e-03  1.369e-03  -1.416   0.1569    
## blueFirstBlood                1.373e-01  5.404e-02   2.540   0.0111 *  
## blueKills                     3.435e-02  2.195e-02   1.565   0.1176    
## blueTotalGold                 3.827e-04  5.017e-05   7.627 2.40e-14 ***
## blueTotalExperience           2.056e-04  3.970e-05   5.178 2.25e-07 ***
## blueTotalJungleMinionsKilled  9.055e-03  3.071e-03   2.948   0.0032 ** 
## redWardsPlaced               -2.006e-03  1.348e-03  -1.488   0.1368    
## redTotalGold                 -4.113e-04  3.101e-05 -13.265  < 2e-16 ***
## redTotalExperience           -2.780e-04  3.843e-05  -7.234 4.70e-13 ***
## redTotalMinionsKilled         3.754e-03  1.471e-03   2.551   0.0107 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 12476.6  on 8999  degrees of freedom
## Residual deviance:  9582.1  on 8989  degrees of freedom
## AIC: 9604.1
## 
## Number of Fisher Scoring iterations: 4
# drop the weaker predictors, keeping only the gold and experience totals
lrm2_rev <- glm(blueWins ~ blueTotalGold + blueTotalExperience  
                + redTotalGold + redTotalExperience, data = lol_2_train, family = 'binomial')
summary(lrm2_rev)
## 
## Call:
## glm(formula = blueWins ~ blueTotalGold + blueTotalExperience + 
##     redTotalGold + redTotalExperience, family = "binomial", data = lol_2_train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.5768  -0.8906   0.1113   0.8852   2.7099  
## 
## Coefficients:
##                       Estimate Std. Error z value Pr(>|z|)    
## (Intercept)         -6.527e-02  7.020e-01  -0.093    0.926    
## blueTotalGold        4.297e-04  2.794e-05  15.379  < 2e-16 ***
## blueTotalExperience  2.399e-04  3.469e-05   6.914 4.71e-12 ***
## redTotalGold        -4.220e-04  2.836e-05 -14.883  < 2e-16 ***
## redTotalExperience  -2.429e-04  3.483e-05  -6.975 3.07e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 12476.6  on 8999  degrees of freedom
## Residual deviance:  9610.3  on 8995  degrees of freedom
## AIC: 9620.3
## 
## Number of Fisher Scoring iterations: 4
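Note that the blue and red gold coefficients are nearly equal in magnitude with opposite signs (4.30e-04 vs -4.22e-04), and likewise for experience, which hints that difference features may capture the same signal more compactly. A hedged sketch (lrm2_diff is illustrative, not part of the original analysis):

lol_2_diff <- transform(lol_2_train,
                        goldDiff = blueTotalGold - redTotalGold,
                        expDiff  = blueTotalExperience - redTotalExperience)
lrm2_diff <- glm(blueWins ~ goldDiff + expDiff, data = lol_2_diff, family = 'binomial')
summary(lrm2_diff)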
# scatterplots to check that each model predictor is linear in the logit
lrm2_predictors <- c("blueTotalGold", "blueTotalExperience", "redTotalGold", "redTotalExperience")
lrm2_probabilities <- predict(lrm2_rev, type = "response")
lrm2_plot <- lol_2_train[lrm2_predictors] %>%
  mutate(logit = log(lrm2_probabilities/(1 - lrm2_probabilities))) %>%
  gather(key = "predictors", value = "predictor.value", -logit)

ggplot(lrm2_plot, aes(logit, predictor.value))+
  geom_point(size = 0.5, alpha = 0.5) +
  geom_smooth(method = "loess") + 
  theme_bw() + 
  facet_wrap(~predictors, scales = "free_y")
## `geom_smooth()` using formula 'y ~ x'

Use the test set to predict and run post hoc tests

# prediction accuracy on the test set
lrm2_fit <- predict(lrm2_rev, newdata = lol_2_test, type = 'response')
lrm2_pred <- ifelse(lrm2_fit > 0.5,1,0)

lrm2_prob <- 1 - mean(lrm2_pred != lol_2_test$blueWins)
lrm2_prob
## [1] 0.7133106
# accuracy is 0.713, slightly better than the blue-only model

#ROC curve & AUC value
lrm2_pred_per <- prediction(lrm2_fit, lol_2_test$blueWins)
lrm2_per <- performance(lrm2_pred_per, measure = "tpr", x.measure = "fpr")
plot(lrm2_per)

# the ROC curve bows well above the diagonal, indicating good discrimination

auc_2 <- performance(lrm2_pred_per, measure = "auc")
auc_2 <- auc_2@y.values[[1]]
auc_2
## [1] 0.7881662
# AUC is 0.788, meaning the model discriminates well on the test set

#Hosmer-Lemeshow goodness of fit
hoslem.test(lol_2_train$blueWins, fitted(lrm2_rev))
## 
##  Hosmer and Lemeshow goodness of fit (GOF) test
## 
## data:  lol_2_train$blueWins, fitted(lrm2_rev)
## X-squared = 6.9992, df = 8, p-value = 0.5367
# p-value is 0.5367 (> 0.05): no evidence of lack of fit

Compare models

#test multicollinearity
car::vif(lrm1_rev_1)
##                   blueDeaths                blueTotalGold 
##                     1.047434                     1.013134 
## blueTotalJungleMinionsKilled 
##                     1.034349
car::vif(lrm2_rev)
##       blueTotalGold blueTotalExperience        redTotalGold  redTotalExperience 
##            2.014777            2.008464            1.987081            1.969319
# no problematic multicollinearity, since no VIF exceeds 5

# likelihood ratio test to compare the blue-only and blue+red models
lrtest(lrm1_rev_1, lrm2_rev)
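Note that lrtest strictly applies to nested models, and these two predictor sets are not nested, so the AIC comparison already visible in the summaries (9836.1 vs 9620.3) is the safer companion check:

AIC(lrm1_rev_1, lrm2_rev)  # lower AIC favors the blue+red model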

Conclusion:

AIC, AUC, and the likelihood ratio test all indicate that the model with both teams' variables is better than the blue-only model, so model 2 (lrm2_rev) is the final choice. The summary write-up is in the PowerPoint.