Logit Analysis

Reading Data into R Environment

# Restore the object(s) stored in the .rda file into the workspace,
# then report the dimensions (rows, columns) of IPLData.
load(file = "IPLData.rda")
dim(IPLData)

## [1] 612  14

Building Logistic Regression Model

# Fit a binomial (logit) GLM of the match outcome `Won` on the match-level
# predictors, with Year and Team entered as ordinary fixed-effect terms.
Model1 <- glm(
  formula = Won ~ TossWon + BatFrist + HomeMatch + PPRuns + PPWickets +
    FourCount + SixCount + WicketsLost + TotelRuns + Year + Team,
  family = binomial(),
  data = IPLData
)
# Coefficient table, null/residual deviance and AIC of the fitted model.
# NOTE(review): the printed summary shows very large standard errors for two
# Team levels (SH, SPS), which may indicate (quasi-)separation - confirm.
summary(Model1)

## 
## Call:
## glm(formula = Won ~ TossWon + BatFrist + HomeMatch + PPRuns + 
##     PPWickets + FourCount + SixCount + WicketsLost + TotelRuns + 
##     Year + Team, family = binomial(), data = IPLData)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.3241  -0.8824  -0.2143   0.8511   2.2801  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   0.765059   0.872891   0.876   0.3808    
## TossWon1      0.175730   0.311800   0.564   0.5730    
## BatFrist1    -0.352356   0.315292  -1.118   0.2638    
## HomeMatch1    0.802882   0.205678   3.904 9.48e-05 ***
## PPRuns        0.022711   0.010665   2.129   0.0332 *  
## PPWickets    -0.025834   0.113906  -0.227   0.8206    
## FourCount     0.072760   0.037190   1.956   0.0504 .  
## SixCount      0.042510   0.047148   0.902   0.3673    
## WicketsLost  -0.423578   0.052777  -8.026 1.01e-15 ***
## TotelRuns    -0.003345   0.006855  -0.488   0.6256    
## YearB2018     0.218858   0.316903   0.691   0.4898    
## YearC2017     0.262339   0.350292   0.749   0.4539    
## YearD2016     0.193095   0.351213   0.550   0.5825    
## YearE2015     0.267262   0.372516   0.717   0.4731    
## YearF2014     0.449776   0.324002   1.388   0.1651    
## TeamDC       -0.999901   0.434966  -2.299   0.0215 *  
## TeamGL       -0.859330   0.618370  -1.390   0.1646    
## TeamKKR      -0.544720   0.442733  -1.230   0.2186    
## TeamKXIP     -0.700140   0.436910  -1.602   0.1090    
## TeamMI       -0.329963   0.436539  -0.756   0.4497    
## TeamRCB      -0.882062   0.439100  -2.009   0.0446 *  
## TeamRPS      -0.701445   0.608046  -1.154   0.2487    
## TeamRR       -0.553484   0.473415  -1.169   0.2424    
## TeamSH       12.541240 882.743514   0.014   0.9887    
## TeamSPS      11.577445 882.743529   0.013   0.9895    
## TeamSRH      -0.329923   0.442357  -0.746   0.4558    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 848.41  on 611  degrees of freedom
## Residual deviance: 654.29  on 586  degrees of freedom
## AIC: 706.29
## 
## Number of Fisher Scoring iterations: 13

Mixed Effect Logistic Regression Model

library(lme4)

## Loading required package: Matrix

# Mixed-effects logistic regression: the same predictors as Model1 except
# Team, which enters as a random intercept (1 | Team). Fitted with the
# "bobyqa" optimizer and nAGQ = 1 (reported as the Laplace approximation
# in the model summary).
Model2 <- glmer(
  formula = Won ~ TossWon + BatFrist + HomeMatch + PPRuns + PPWickets +
    FourCount + SixCount + WicketsLost + TotelRuns + Year + (1 | Team),
  data = IPLData,
  family = binomial,
  control = glmerControl(optimizer = "bobyqa"),
  nAGQ = 1
)

## boundary (singular) fit: see ?isSingular

# print the model results together with the correlations among fixed effects
# (corr = TRUE asks the print method to include the correlation matrix)
print(summary(Model2), corr = TRUE)

## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: 
## Won ~ TossWon + BatFrist + HomeMatch + PPRuns + PPWickets + FourCount +  
##     SixCount + WicketsLost + TotelRuns + Year + (1 | Team)
##    Data: IPLData
## Control: glmerControl(optimizer = "bobyqa")
## 
##      AIC      BIC   logLik deviance df.resid 
##    695.6    766.2   -331.8    663.6      596 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -3.9607 -0.7157 -0.1722  0.6838  3.2551 
## 
## Random effects:
##  Groups Name        Variance Std.Dev.
##  Team   (Intercept) 0        0       
## Number of obs: 612, groups:  Team, 12
## 
## Fixed effects:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  0.267304   0.819656   0.326 0.744336    
## TossWon1     0.153016   0.306873   0.499 0.618041    
## BatFrist1   -0.315705   0.310701  -1.016 0.309579    
## HomeMatch1   0.747774   0.200822   3.724 0.000196 ***
## PPRuns       0.022951   0.010443   2.198 0.027966 *  
## PPWickets   -0.029972   0.113054  -0.265 0.790924    
## FourCount    0.060284   0.036544   1.650 0.099019 .  
## SixCount     0.025291   0.045808   0.552 0.580878    
## WicketsLost -0.434106   0.052018  -8.345  < 2e-16 ***
## TotelRuns   -0.001389   0.006775  -0.205 0.837603    
## YearB2018    0.219835   0.314966   0.698 0.485199    
## YearC2017    0.139841   0.324555   0.431 0.666562    
## YearD2016    0.036444   0.325688   0.112 0.910904    
## YearE2015    0.255970   0.369062   0.694 0.487952    
## YearF2014    0.393555   0.321133   1.226 0.220380    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

## 
## Correlation of fixed effects could have been required in summary()

## 
## Correlation of Fixed Effects:
##             (Intr) TssWn1 BtFrs1 HmMtc1 PPRuns PPWckt ForCnt SixCnt WcktsL
## TossWon1    -0.353                                                        
## BatFrist1   -0.275  0.763                                                 
## HomeMatch1  -0.019 -0.033 -0.140                                          
## PPRuns      -0.291 -0.031  0.090  0.003                                   
## PPWickets   -0.276 -0.052  0.025 -0.062  0.359                            
## FourCount    0.206 -0.068 -0.038  0.018 -0.164 -0.051                     
## SixCount     0.286 -0.100 -0.060 -0.022 -0.003 -0.063  0.532              
## WicketsLost -0.219 -0.036 -0.089 -0.053 -0.204 -0.362  0.096  0.155       
## TotelRuns   -0.549  0.109 -0.018 -0.031 -0.161  0.063 -0.699 -0.743 -0.024
## YearB2018   -0.106  0.011  0.010  0.052 -0.041 -0.061  0.066 -0.019 -0.048
## YearC2017   -0.071  0.001 -0.004  0.036 -0.085 -0.097  0.080  0.101 -0.025
## YearD2016   -0.177 -0.012 -0.008  0.016  0.031 -0.007  0.090  0.158  0.052
## YearE2015   -0.172 -0.008 -0.005  0.020  0.048  0.000  0.028  0.098 -0.017
## YearF2014   -0.209 -0.020 -0.031  0.150  0.081 -0.009  0.156  0.169  0.019
##             TtlRns YB2018 YC2017 YD2016 YE2015
## TossWon1                                      
## BatFrist1                                     
## HomeMatch1                                    
## PPRuns                                        
## PPWickets                                     
## FourCount                                     
## SixCount                                      
## WicketsLost                                   
## TotelRuns                                     
## YearB2018   -0.056                            
## YearC2017   -0.094  0.492                     
## YearD2016   -0.115  0.469  0.472              
## YearE2015   -0.050  0.414  0.415  0.423       
## YearF2014   -0.149  0.487  0.484  0.502  0.437
## convergence code: 0
## boundary (singular) fit: see ?isSingular

AIC, BIC and Log-likelihood test for the Model1 & Model2

# Akaike information criterion for both fits, one row per model (lower is better)
AIC(Model1, Model2)

##        df      AIC
## Model1 26 706.2949
## Model2 16 695.5801

# Bayesian information criterion for both fits, one row per model (lower is better)
BIC(Model1, Model2)

##        df      BIC
## Model1 26 821.1299
## Model2 16 766.2479

# -2 * log-likelihood of Model1 (this is a fit statistic, not a hypothesis
# test); the value equals the residual deviance reported by summary(Model1)
-2*logLik(Model1)

## 'log Lik.' 654.2949 (df=26)

# -2 * log-likelihood of Model2 (this is a fit statistic, not a hypothesis
# test); the value equals the deviance reported by summary(Model2)
-2*logLik(Model2)

## 'log Lik.' 663.5801 (df=16)

Logistic regression Assumptions

The logistic regression method assumes that:

The outcome is a binary or dichotomous variable like yes vs no, positive vs negative, 1 vs 0.
There is a linear relationship between the logit of the outcome and each predictor variable. Recall that the logit function is logit(p) = log(p/(1-p)), where p is the probability of the outcome.
There are no influential values (extreme values or outliers) in the continuous predictors.
There are no high intercorrelations (i.e. multicollinearity) among the predictors.

Linearity Assumption

library(tidyverse)
library(broom)
# Linearity check: plot every numeric column of IPLData against the fitted
# logit from Model2 and overlay a loess smoother, one facet per predictor.
# Fitted probabilities for the observations used to fit Model2
probabilities <- predict(Model2, type = "response")
# Keep only the numeric columns, add the fitted logit log(p / (1 - p)) once,
# and reshape to long format (predictor name / predictor value) for faceting.
mydata <- IPLData %>%
  dplyr::select_if(is.numeric) %>%
  mutate(logit = log(probabilities / (1 - probabilities))) %>%
  gather(key = "predictors", value = "predictor.value", -logit)
# Create the scatter plots:
ggplot(mydata, aes(logit, predictor.value)) +
  geom_point(size = 0.5, alpha = 0.5) +
  geom_smooth(method = "loess") +
  theme_bw() +
  facet_wrap(~predictors, scales = "free_y")

Multicollinearity

# (Generalized) variance inflation factors for the terms of Model1
car::vif(Model1)

## Registered S3 methods overwritten by 'car':
##   method                          from
##   influence.merMod                lme4
##   cooks.distance.influence.merMod lme4
##   dfbeta.influence.merMod         lme4
##   dfbetas.influence.merMod        lme4

##                 GVIF Df GVIF^(1/(2*Df))
## TossWon     2.658700  1        1.630552
## BatFrist    2.719590  1        1.649118
## HomeMatch   1.121307  1        1.058918
## PPRuns      1.617735  1        1.271902
## PPWickets   1.397512  1        1.182164
## FourCount   2.534929  1        1.592146
## SixCount    2.746477  1        1.657250
## WicketsLost 1.277906  1        1.130445
## TotelRuns   4.349602  1        2.085570
## Year        1.678021  5        1.053125
## Team        1.750530 11        1.025777

# (Generalized) variance inflation factors for the fixed-effect terms of Model2
car::vif(Model2)

##                 GVIF Df GVIF^(1/(2*Df))
## TossWon     2.615618  1        1.617287
## BatFrist    2.683231  1        1.638057
## HomeMatch   1.087224  1        1.042700
## PPRuns      1.582959  1        1.258157
## PPWickets   1.388087  1        1.178171
## FourCount   2.468067  1        1.571008
## SixCount    2.630357  1        1.621837
## WicketsLost 1.251017  1        1.118488
## TotelRuns   4.353196  1        2.086431
## Year        1.220991  5        1.020167

INFERENCE

Probability of winning for HomeMatch

# attach() is kept so that any code referring to IPLData columns by bare name
# keeps working; this chunk itself resolves the columns through with().
attach(IPLData)
# One-row scenario for a home match: TossWon = "1", BatFrist = "1",
# HomeMatch = "1", Year = "A2019", Team = "KKR", and every numeric
# predictor held at its sample mean.
Fdata <- with(IPLData, data.frame(
  TossWon = "1",
  BatFrist = "1",
  HomeMatch = "1",
  PPRuns = mean(PPRuns),
  PPWickets = mean(PPWickets),
  FourCount = mean(FourCount),
  SixCount = mean(SixCount),
  WicketsLost = mean(WicketsLost),
  TotelRuns = mean(TotelRuns),
  Year = "A2019",
  Team = "KKR"
))

# predicted probability of winning for the home-match scenario
ProbF1 <- predict(Model2, Fdata, type = "response")
ProbF1

##         1 
## 0.5420654

Probability of winning for non-HomeMatch

# One-row scenario for a non-home match: identical to the home-match
# scenario except HomeMatch = "0"; numeric predictors stay at their sample
# means. with() looks the columns up in IPLData explicitly, so this chunk
# does not depend on IPLData being attached.
Fdata <- with(IPLData, data.frame(
  TossWon = "1",
  BatFrist = "1",
  HomeMatch = "0",
  PPRuns = mean(PPRuns),
  PPWickets = mean(PPWickets),
  FourCount = mean(FourCount),
  SixCount = mean(SixCount),
  WicketsLost = mean(WicketsLost),
  TotelRuns = mean(TotelRuns),
  Year = "A2019",
  Team = "KKR"
))

# predicted probability of winning for the non-home-match scenario
ProbF2 <- predict(Model2, Fdata, type = "response")
ProbF2

##         1 
## 0.3591366

Summary

# Side-by-side table of the non-home ("0") and home ("1") win probabilities,
# rounded to two decimals.
homeMatch <- c("0", "1")
Probability <- round(c(ProbF2, ProbF1), 2)
cbind(homeMatch, Probability)

##   homeMatch Probability
## 1 "0"       "0.36"     
## 1 "1"       "0.54"

Probability of Winning with (mean - sd) PPRuns

# One-row scenario with PPRuns one standard deviation below its mean; all
# other numeric predictors stay at their sample means and the categorical
# settings match the home-match scenario. with() looks the columns up in
# IPLData explicitly, so this chunk does not depend on IPLData being attached.
Fdata <- with(IPLData, data.frame(
  TossWon = "1",
  BatFrist = "1",
  HomeMatch = "1",
  PPRuns = mean(PPRuns) - sd(PPRuns),
  PPWickets = mean(PPWickets),
  FourCount = mean(FourCount),
  SixCount = mean(SixCount),
  WicketsLost = mean(WicketsLost),
  TotelRuns = mean(TotelRuns),
  Year = "A2019",
  Team = "KKR"
))

# predicted probability of winning at (mean - sd) PPRuns
ProbG1 <- predict(Model2, Fdata, type = "response")
ProbG1

##         1 
## 0.4732919

Probability of Winning with (mean + sd) PPRuns

# One-row scenario with PPRuns one standard deviation above its mean; all
# other numeric predictors stay at their sample means and the categorical
# settings match the home-match scenario. with() looks the columns up in
# IPLData explicitly, so this chunk does not depend on IPLData being attached.
Fdata <- with(IPLData, data.frame(
  TossWon = "1",
  BatFrist = "1",
  HomeMatch = "1",
  PPRuns = mean(PPRuns) + sd(PPRuns),
  PPWickets = mean(PPWickets),
  FourCount = mean(FourCount),
  SixCount = mean(SixCount),
  WicketsLost = mean(WicketsLost),
  TotelRuns = mean(TotelRuns),
  Year = "A2019",
  Team = "KKR"
))

# predicted probability of winning at (mean + sd) PPRuns
ProbG2 <- predict(Model2, Fdata, type = "response")
ProbG2

##         1 
## 0.6092723

Summary

# Win probabilities at mean - sd, mean and mean + sd of PPRuns,
# rounded to two decimals and shown next to their labels.
PowerPlayRuns <- c("mean-sd", "mean", "mean + sd")
probability <- round(c(ProbG1, ProbF1, ProbG2), 2)
cbind(PowerPlayRuns, probability)

##   PowerPlayRuns probability
## 1 "mean-sd"     "0.47"     
## 1 "mean"        "0.54"     
## 1 "mean + sd"   "0.61"

Probability of Winning with (mean - sd) FourCount

# One-row scenario with FourCount one standard deviation below its mean; all
# other numeric predictors stay at their sample means and the categorical
# settings match the home-match scenario. with() looks the columns up in
# IPLData explicitly, so this chunk does not depend on IPLData being attached.
Fdata <- with(IPLData, data.frame(
  TossWon = "1",
  BatFrist = "1",
  HomeMatch = "1",
  PPRuns = mean(PPRuns),
  PPWickets = mean(PPWickets),
  FourCount = mean(FourCount) - sd(FourCount),
  SixCount = mean(SixCount),
  WicketsLost = mean(WicketsLost),
  TotelRuns = mean(TotelRuns),
  Year = "A2019",
  Team = "KKR"
))

# predicted probability of winning at (mean - sd) FourCount
ProbH1 <- predict(Model2, Fdata, type = "response")
ProbH1

##         1 
## 0.4783956

Probability of Winning with (mean + sd) FourCount

# One-row scenario with FourCount one standard deviation above its mean; all
# other numeric predictors stay at their sample means and the categorical
# settings match the home-match scenario. with() looks the columns up in
# IPLData explicitly, so this chunk does not depend on IPLData being attached.
Fdata <- with(IPLData, data.frame(
  TossWon = "1",
  BatFrist = "1",
  HomeMatch = "1",
  PPRuns = mean(PPRuns),
  PPWickets = mean(PPWickets),
  FourCount = mean(FourCount) + sd(FourCount),
  SixCount = mean(SixCount),
  WicketsLost = mean(WicketsLost),
  TotelRuns = mean(TotelRuns),
  Year = "A2019",
  Team = "KKR"
))

# predicted probability of winning at (mean + sd) FourCount
ProbH2 <- predict(Model2, Fdata, type = "response")
ProbH2

##         1 
## 0.6043903

Summary

# Win probabilities at mean - sd, mean and mean + sd of FourCount,
# rounded to two decimals and shown next to their labels.
FourCounts <- c("mean-sd", "mean", "mean + sd")
probability <- round(c(ProbH1, ProbF1, ProbH2), 2)
cbind(FourCounts, probability)

##   FourCounts  probability
## 1 "mean-sd"   "0.48"     
## 1 "mean"      "0.54"     
## 1 "mean + sd" "0.6"

Probability of Winning with (mean - sd) WicketsLost

# One-row scenario with WicketsLost one standard deviation below its mean;
# all other numeric predictors stay at their sample means and the categorical
# settings match the home-match scenario. with() looks the columns up in
# IPLData explicitly, so this chunk does not depend on IPLData being attached.
Fdata <- with(IPLData, data.frame(
  TossWon = "1",
  BatFrist = "1",
  HomeMatch = "1",
  PPRuns = mean(PPRuns),
  PPWickets = mean(PPWickets),
  FourCount = mean(FourCount),
  SixCount = mean(SixCount),
  WicketsLost = mean(WicketsLost) - sd(WicketsLost),
  TotelRuns = mean(TotelRuns),
  Year = "A2019",
  Team = "KKR"
))

# predicted probability of winning at (mean - sd) WicketsLost
ProbI1 <- predict(Model2, Fdata, type = "response")
ProbI1

##         1 
## 0.7710066

Probability of Winning with (mean + sd) WicketsLost

# One-row scenario with WicketsLost one standard deviation above its mean;
# all other numeric predictors stay at their sample means and the categorical
# settings match the home-match scenario. with() looks the columns up in
# IPLData explicitly, so this chunk does not depend on IPLData being attached.
Fdata <- with(IPLData, data.frame(
  TossWon = "1",
  BatFrist = "1",
  HomeMatch = "1",
  PPRuns = mean(PPRuns),
  PPWickets = mean(PPWickets),
  FourCount = mean(FourCount),
  SixCount = mean(SixCount),
  WicketsLost = mean(WicketsLost) + sd(WicketsLost),
  TotelRuns = mean(TotelRuns),
  Year = "A2019",
  Team = "KKR"
))

# predicted probability of winning at (mean + sd) WicketsLost
ProbI2 <- predict(Model2, Fdata, type = "response")
ProbI2

##         1 
## 0.2938655

Summary

# Win probabilities at mean - sd, mean and mean + sd of WicketsLost,
# rounded to two decimals and shown next to their labels.
WicketLost <- c("mean-sd", "mean", "mean + sd")
probability <- round(c(ProbI1, ProbF1, ProbI2), 2)
cbind(WicketLost, probability)

##   WicketLost  probability
## 1 "mean-sd"   "0.77"     
## 1 "mean"      "0.54"     
## 1 "mean + sd" "0.29"

MBASkills.In

Data Analytics Skills (for MBAs) using R programming

Logit Analysis

Reading Data into R Environment

Building Logistic Regression Model

Mixed Effect Logistic Regression Model

AIC, BIC and Log-likelihood test for the Model1 & Model2

Logistic regression Assumptions

Linearity Assumption

Multicollinearity

INFERENCE

Probability of winning for HomeMatch

Probability of winning for non-HomeMatch

Summary

Probability of Winning with (mean - sd) PPRuns

Probability of Winning with (mean + sd) PPRuns

Summary

Probability of Winning with (mean - sd) FourCount

Probability of Winning with (mean + sd) FourCount

Summary

Probability of Winning with (mean - sd) WicketsLost

Probability of Winning with (mean + sd) WicketsLost

Summary