setwd('C:\\Zillionaire\\Study_Material\\HU\\HU Courses\\ANLY 510\\final pj')
lol <- read.csv('bluedata.csv')
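#the steps below use stepAIC, the %>% pipe, gather, ggplot, corrplot, prediction/performance,
#hoslem.test, and lrtest; assuming the usual packages are installed, load them up front
#(car is only called via car::vif later, so it just needs to be installed):
library(MASS)               # stepAIC
library(dplyr)              # %>%, mutate, select
library(tidyr)              # gather
library(ggplot2)            # ggplot, geom_smooth, facet_wrap
library(corrplot)           # corrplot
library(ROCR)               # prediction, performance
library(ResourceSelection)  # hoslem.test
library(lmtest)             # lrtest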
#clean the data: check each column for NAs
sapply(lol,function(x) sum(is.na(x)))
## blueWins blueWardsPlaced
## 0 0
## blueWardsDestroyed blueFirstBlood
## 0 0
## blueKills blueDeaths
## 0 0
## blueAssists blueTotalGold
## 0 0
## blueAvgLevel blueTotalExperience
## 0 0
## blueTotalMinionsKilled blueTotalJungleMinionsKilled
## 0 0
#no NAs found
#split into training and test sets
lol_train <- lol[1:9000,]
lol_test <- lol[9001:9879,]
#check for class imbalance
table(lol_train$blueWins)
##
## 0 1
## 4492 4508
table(lol_test$blueWins)
##
## 0 1
## 457 422
#the counts of 0s and 1s are nearly equal, so the classes are balanced
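#class proportions make the balance explicit (a quick sketch, not part of the original output):
prop.table(table(lol_train$blueWins))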
#multicollinearity
lol_m1 <- cor(lol)
corrplot(lol_m1, method = "circle")
#gold, kills, experience, and level are noticeably correlated with one another and need special attention
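#to quantify that, the strongly correlated pairs can be listed directly
#(a sketch; the 0.7 cutoff is an arbitrary choice):
lol_high_cor <- which(abs(lol_m1) > 0.7 & upper.tri(lol_m1), arr.ind = TRUE)
data.frame(var1 = rownames(lol_m1)[lol_high_cor[, 1]],
           var2 = colnames(lol_m1)[lol_high_cor[, 2]],
           r = lol_m1[lol_high_cor])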
#stepwise regression
lrm1 <- glm(blueWins ~ .,data = lol_train, family = 'binomial') %>% stepAIC(trace = F)
summary(lrm1)
##
## Call:
## glm(formula = blueWins ~ blueFirstBlood + blueKills + blueDeaths +
## blueAssists + blueTotalGold + blueTotalJungleMinionsKilled,
## family = "binomial", data = lol_train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.6466 -0.9089 0.1409 0.9091 2.6879
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -8.444e+00 5.796e-01 -14.568 < 2e-16 ***
## blueFirstBlood 8.472e-02 5.147e-02 1.646 0.09978 .
## blueKills 9.560e-02 2.297e-02 4.163 3.15e-05 ***
## blueDeaths -2.594e-01 1.021e-02 -25.398 < 2e-16 ***
## blueAssists -2.856e-02 1.070e-02 -2.668 0.00763 **
## blueTotalGold 5.471e-04 4.331e-05 12.632 < 2e-16 ***
## blueTotalJungleMinionsKilled 1.122e-02 2.819e-03 3.980 6.89e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 12476.6 on 8999 degrees of freedom
## Residual deviance: 9806.4 on 8993 degrees of freedom
## AIC: 9820.4
##
## Number of Fisher Scoring iterations: 4
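#the coefficients are on the log-odds scale; exponentiating gives odds ratios,
#which are easier to read (a sketch, not part of the original analysis):
exp(cbind(OR = coef(lrm1), confint(lrm1)))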
#remove blueFirstBlood (not significant at the 0.05 level) and blueAssists (its negative coefficient is counterintuitive)
lrm1_rev <- glm(blueWins ~ blueKills + blueDeaths + blueTotalGold
+ blueTotalJungleMinionsKilled,data = lol_train, family = 'binomial')
summary(lrm1_rev)
##
## Call:
## glm(formula = blueWins ~ blueKills + blueDeaths + blueTotalGold +
## blueTotalJungleMinionsKilled, family = "binomial", data = lol_train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.6580 -0.9095 0.1405 0.9133 2.6879
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -8.326e+00 5.756e-01 -14.467 < 2e-16 ***
## blueKills 7.227e-02 2.097e-02 3.446 0.00057 ***
## blueDeaths -2.616e-01 1.000e-02 -26.156 < 2e-16 ***
## blueTotalGold 5.379e-04 4.269e-05 12.600 < 2e-16 ***
## blueTotalJungleMinionsKilled 1.209e-02 2.789e-03 4.333 1.47e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 12476.6 on 8999 degrees of freedom
## Residual deviance: 9816.2 on 8995 degrees of freedom
## AIC: 9826.2
##
## Number of Fisher Scoring iterations: 4
#since blueKills and blueTotalGold are highly correlated, also remove blueKills
lrm1_rev_1 <- glm(blueWins ~ blueDeaths + blueTotalGold
+ blueTotalJungleMinionsKilled,data = lol_train, family = 'binomial')
summary(lrm1_rev_1)
##
## Call:
## glm(formula = blueWins ~ blueDeaths + blueTotalGold + blueTotalJungleMinionsKilled,
## family = "binomial", data = lol_train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.6929 -0.9068 0.1321 0.9083 2.6397
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -9.922e+00 3.483e-01 -28.483 < 2e-16 ***
## blueDeaths -2.529e-01 9.636e-03 -26.244 < 2e-16 ***
## blueTotalGold 6.698e-04 1.958e-05 34.205 < 2e-16 ***
## blueTotalJungleMinionsKilled 8.375e-03 2.570e-03 3.259 0.00112 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 12476.6 on 8999 degrees of freedom
## Residual deviance: 9828.1 on 8996 degrees of freedom
## AIC: 9836.1
##
## Number of Fisher Scoring iterations: 4
#scatterplots of each model predictor against the logit to test linearity
lrm1_predictors <- c("blueDeaths", "blueTotalGold", "blueTotalJungleMinionsKilled")
lrm1_probabilities <- predict(lrm1_rev_1, type = "response")
lrm1_plot <- lol_train %>%
  dplyr::select(all_of(lrm1_predictors)) %>%
  mutate(logit = log(lrm1_probabilities/(1 - lrm1_probabilities))) %>%
  gather(key = "predictors", value = "predictor.value", -logit)
ggplot(lrm1_plot, aes(logit, predictor.value)) +
  geom_point(size = 0.5, alpha = 0.5) +
  geom_smooth(method = "loess") +
  theme_bw() +
  facet_wrap(~predictors, scales = "free_y")
## `geom_smooth()` using formula 'y ~ x'
#prediction accuracy on the test set
lrm1_fit <- predict(lrm1_rev_1, newdata = lol_test, type = 'response')
lrm1_pred <- ifelse(lrm1_fit > 0.5, 1, 0)
lrm1_acc <- 1 - mean(lrm1_pred != lol_test$blueWins)
lrm1_acc
## [1] 0.698521
#test-set accuracy is 0.699, which is good
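#a confusion matrix breaks the accuracy down by class (sketch):
table(predicted = lrm1_pred, actual = lol_test$blueWins)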
#ROC curve & AUC value
lrm1_pred_per <- prediction(lrm1_fit, lol_test$blueWins)
lrm1_per <- performance(lrm1_pred_per, measure = "tpr", x.measure = "fpr")
plot(lrm1_per)
#the ROC curve bows well above the chance diagonal
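#overlaying the chance diagonal makes that visual claim concrete (sketch):
abline(0, 1, lty = 2)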
auc <- performance(lrm1_pred_per, measure = "auc")
auc <- auc@y.values[[1]]
auc
## [1] 0.7826128
#AUC is 0.78, indicating the model discriminates well on the test set
#Hosmer-Lemeshow goodness of fit
hoslem.test(lol_train$blueWins, fitted(lrm1_rev_1))
##
## Hosmer and Lemeshow goodness of fit (GOF) test
##
## data: lol_train$blueWins, fitted(lrm1_rev_1)
## X-squared = 4.9403, df = 8, p-value = 0.7639
#p-value is 0.7639 (> 0.05), so the test shows no evidence of poor fit
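#the default test uses g = 10 bins (hence df = 8); the conclusion can be
#sanity-checked with a different bin count (sketch):
hoslem.test(lol_train$blueWins, fitted(lrm1_rev_1), g = 12)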
lol_2 <- read.csv('blueandreddata.csv')
lol_2_train <- lol_2[1:9000,]
lol_2_test <- lol_2[9001:9879,]
#multicollinearity
lol_m2 <- cor(lol_2)
corrplot(lol_m2, method = "circle")
#same issues as the blue-only data; additionally, blueDeaths equals redKills, and blueFirstBlood is the complement of redFirstBlood (and vice versa)
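#those identities can be verified directly (a sketch; it assumes the file
#contains redKills and redFirstBlood columns):
all(lol_2$blueDeaths == lol_2$redKills)
all(lol_2$blueFirstBlood == 1 - lol_2$redFirstBlood)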
#stepwise regression
lrm2 <- glm(blueWins ~ .,data = lol_2_train, family = 'binomial') %>% stepAIC(trace = F)
summary(lrm2)
##
## Call:
## glm(formula = blueWins ~ blueWardsPlaced + blueFirstBlood + blueKills +
## blueTotalGold + blueTotalExperience + blueTotalJungleMinionsKilled +
## redWardsPlaced + redTotalGold + redTotalExperience + redTotalMinionsKilled,
## family = "binomial", data = lol_2_train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.5708 -0.8867 0.1198 0.8810 2.6806
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 3.136e-01 8.227e-01 0.381 0.7031
## blueWardsPlaced -1.938e-03 1.369e-03 -1.416 0.1569
## blueFirstBlood 1.373e-01 5.404e-02 2.540 0.0111 *
## blueKills 3.435e-02 2.195e-02 1.565 0.1176
## blueTotalGold 3.827e-04 5.017e-05 7.627 2.40e-14 ***
## blueTotalExperience 2.056e-04 3.970e-05 5.178 2.25e-07 ***
## blueTotalJungleMinionsKilled 9.055e-03 3.071e-03 2.948 0.0032 **
## redWardsPlaced -2.006e-03 1.348e-03 -1.488 0.1368
## redTotalGold -4.113e-04 3.101e-05 -13.265 < 2e-16 ***
## redTotalExperience -2.780e-04 3.843e-05 -7.234 4.70e-13 ***
## redTotalMinionsKilled 3.754e-03 1.471e-03 2.551 0.0107 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 12476.6 on 8999 degrees of freedom
## Residual deviance: 9582.1 on 8989 degrees of freedom
## AIC: 9604.1
##
## Number of Fisher Scoring iterations: 4
#simplify the model: keep the four gold and experience terms and drop the weaker predictors
lrm2_rev <- glm(blueWins ~ blueTotalGold + blueTotalExperience
+ redTotalGold + redTotalExperience, data = lol_2_train, family = 'binomial')
summary(lrm2_rev)
##
## Call:
## glm(formula = blueWins ~ blueTotalGold + blueTotalExperience +
## redTotalGold + redTotalExperience, family = "binomial", data = lol_2_train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.5768 -0.8906 0.1113 0.8852 2.7099
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -6.527e-02 7.020e-01 -0.093 0.926
## blueTotalGold 4.297e-04 2.794e-05 15.379 < 2e-16 ***
## blueTotalExperience 2.399e-04 3.469e-05 6.914 4.71e-12 ***
## redTotalGold -4.220e-04 2.836e-05 -14.883 < 2e-16 ***
## redTotalExperience -2.429e-04 3.483e-05 -6.975 3.07e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 12476.6 on 8999 degrees of freedom
## Residual deviance: 9610.3 on 8995 degrees of freedom
## AIC: 9620.3
##
## Number of Fisher Scoring iterations: 4
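#to interpret the gold coefficient: a 1,000-gold advantage multiplies the odds of a
#blue win by about exp(4.297e-04 * 1000) = 1.54, holding the other terms fixed (sketch):
exp(1000 * coef(lrm2_rev)["blueTotalGold"])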
#scatterplots of each model predictor against the logit to test linearity
lrm2_predictors <- c("blueTotalGold", "blueTotalExperience", "redTotalGold", "redTotalExperience")
lrm2_probabilities <- predict(lrm2_rev, type = "response")
lrm2_plot <- lol_2_train %>%
  dplyr::select(all_of(lrm2_predictors)) %>%
  mutate(logit = log(lrm2_probabilities/(1 - lrm2_probabilities))) %>%
  gather(key = "predictors", value = "predictor.value", -logit)
ggplot(lrm2_plot, aes(logit, predictor.value)) +
  geom_point(size = 0.5, alpha = 0.5) +
  geom_smooth(method = "loess") +
  theme_bw() +
  facet_wrap(~predictors, scales = "free_y")
## `geom_smooth()` using formula 'y ~ x'
#prediction accuracy on the test set
lrm2_fit <- predict(lrm2_rev, newdata = lol_2_test, type = 'response')
lrm2_pred <- ifelse(lrm2_fit > 0.5, 1, 0)
lrm2_acc <- 1 - mean(lrm2_pred != lol_2_test$blueWins)
lrm2_acc
## [1] 0.7133106
#test-set accuracy is 0.713, slightly better than the blue-only model
#ROC curve & AUC value
lrm2_pred_per <- prediction(lrm2_fit, lol_2_test$blueWins)
lrm2_per <- performance(lrm2_pred_per, measure = "tpr", x.measure = "fpr")
plot(lrm2_per)
#the ROC curve again bows well above the chance diagonal
auc_2 <- performance(lrm2_pred_per, measure = "auc")
auc_2 <- auc_2@y.values[[1]]
auc_2
## [1] 0.7881662
#AUC is 0.788, indicating the model discriminates well on the test set
#Hosmer-Lemeshow goodness of fit
hoslem.test(lol_2_train$blueWins, fitted(lrm2_rev))
##
## Hosmer and Lemeshow goodness of fit (GOF) test
##
## data: lol_2_train$blueWins, fitted(lrm2_rev)
## X-squared = 6.9992, df = 8, p-value = 0.5367
#p-value is 0.5367 (> 0.05), so again no evidence of poor fit
#test multicollinearity
car::vif(lrm1_rev_1)
## blueDeaths blueTotalGold
## 1.047434 1.013134
## blueTotalJungleMinionsKilled
## 1.034349
car::vif(lrm2_rev)
## blueTotalGold blueTotalExperience redTotalGold redTotalExperience
## 2.014777 2.008464 1.987081 1.969319
#no problematic multicollinearity, since all VIFs are below 5
#likelihood ratio test to compare the blue-only and blue+red models
lrtest(lrm1_rev_1, lrm2_rev)
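#note: a likelihood ratio test formally requires nested models, and these two are not
#nested (and are fit to different data frames), so treat it as a rough comparison;
#the AICs from the summaries above (9836.1 vs 9620.3) can also be compared directly (sketch):
AIC(lrm1_rev_1, lrm2_rev)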
AIC, AUC, and the likelihood ratio test all indicate that the model using both teams' variables is better than the one using only the blue team's, so model 2 (lrm2_rev) is the final choice. The summary write-up is in the PowerPoint.