# Read the IPL match data from CSV.
# stringsAsFactors = TRUE is set explicitly: since R 4.0.0 read.csv() leaves
# text columns as character by default, whereas the str()/summary() output
# recorded in this script shows Year, City and Team as factors.
IPLData <- read.csv("IPL Data (Prob of win).csv", stringsAsFactors = TRUE)
# size of the data frame, reported as (rows, columns)
c(nrow(IPLData), ncol(IPLData))
## [1] 438 14
# names of the variables (columns) held in the data frame
names(IPLData)
## [1] "Year" "City" "Match.No" "Team" "Won"
## [6] "TossWon" "BatFrist" "HomeMatch" "PPRuns" "PPWickets"
## [11] "FourCount" "SixCount" "WicketsLost" "TotelRuns"
# compact overview of the data frame: type and leading values of each column
str(object = IPLData)
## 'data.frame': 438 obs. of 14 variables:
## $ Year : Factor w/ 4 levels "A2019","B2018",..: 3 3 3 3 3 3 3 3 3 3 ...
## $ City : Factor w/ 15 levels "Bengaluru","Chandigarh",..: 5 5 12 12 14 14 6 6 1 1 ...
## $ Match.No : int 1 1 2 2 3 3 4 4 5 5 ...
## $ Team : Factor w/ 12 levels "CSK","DC","GL",..: 10 7 6 11 3 4 8 5 7 2 ...
## $ Won : int 1 0 0 1 0 1 0 1 1 0 ...
## $ TossWon : int 0 1 0 1 0 1 0 1 1 0 ...
## $ BatFrist : int 1 0 1 0 1 0 1 0 1 0 ...
## $ HomeMatch : int 1 0 0 1 0 0 0 0 1 0 ...
## $ PPRuns : int 59 54 61 59 52 73 50 50 41 43 ...
## $ PPWickets : int 1 1 1 1 1 0 1 1 2 2 ...
## $ FourCount : int 17 15 12 18 18 18 11 11 14 10 ...
## $ SixCount : int 9 6 11 6 8 8 7 7 7 5 ...
## $ WicketsLost: int 4 10 8 3 4 0 6 4 8 9 ...
## $ TotelRuns : int 207 172 184 187 183 184 163 164 157 142 ...
# Convert Year and the 0/1 indicator columns to factors.
# Columns are selected by name rather than by position (previously 1 and 5:8)
# so the conversion still targets the intended variables if the column order
# of the CSV changes; a missing column now fails loudly instead of silently
# converting the wrong variable.
# The vector keeps its original identifier `names`; a non-function binding
# does not interfere with calls to base::names().
names <- c("Year", "Won", "TossWon", "BatFrist", "HomeMatch")
IPLData[names] <- lapply(IPLData[names], factor)
# Optional: treat Year as an ordered factor. The level labels must match the
# values in the data ("A2019", "B2018", ...); unmatched labels would turn
# every entry into NA.
# IPLData$Year <- ordered(IPLData$Year, levels = c("A2019", "B2018", "C2017", "D2016"))
# structure of the data frame after the conversion
str(IPLData)
## 'data.frame': 438 obs. of 14 variables:
## $ Year : Factor w/ 4 levels "A2019","B2018",..: 3 3 3 3 3 3 3 3 3 3 ...
## $ City : Factor w/ 15 levels "Bengaluru","Chandigarh",..: 5 5 12 12 14 14 6 6 1 1 ...
## $ Match.No : int 1 1 2 2 3 3 4 4 5 5 ...
## $ Team : Factor w/ 12 levels "CSK","DC","GL",..: 10 7 6 11 3 4 8 5 7 2 ...
## $ Won : Factor w/ 2 levels "0","1": 2 1 1 2 1 2 1 2 2 1 ...
## $ TossWon : Factor w/ 2 levels "0","1": 1 2 1 2 1 2 1 2 2 1 ...
## $ BatFrist : Factor w/ 2 levels "0","1": 2 1 2 1 2 1 2 1 2 1 ...
## $ HomeMatch : Factor w/ 2 levels "0","1": 2 1 1 2 1 1 1 1 2 1 ...
## $ PPRuns : int 59 54 61 59 52 73 50 50 41 43 ...
## $ PPWickets : int 1 1 1 1 1 0 1 1 2 2 ...
## $ FourCount : int 17 15 12 18 18 18 11 11 14 10 ...
## $ SixCount : int 9 6 11 6 8 8 7 7 7 5 ...
## $ WicketsLost: int 4 10 8 3 4 0 6 4 8 9 ...
## $ TotelRuns : int 207 172 184 187 183 184 163 164 157 142 ...
library(psych)
# Per-variable descriptive statistics. Of the table describe() returns,
# positions 2-5 (n, mean, sd, median) and 8-9 (min, max) are displayed.
descriptives <- describe(IPLData)
descriptives[, c(2:5, 8:9)]
## n mean sd median min max
## Year* 438 2.47 1.11 2.0 1 4
## City* 438 7.17 4.01 7.0 1 15
## Match.No 438 28.07 16.05 28.0 1 56
## Team* 438 5.80 3.17 5.5 1 12
## Won* 438 1.50 0.50 1.0 1 2
## TossWon* 438 1.50 0.50 1.5 1 2
## BatFrist* 438 1.50 0.50 1.5 1 2
## HomeMatch* 438 1.45 0.50 1.0 1 2
## PPRuns 438 48.97 11.92 49.0 15 105
## PPWickets 438 1.34 1.07 1.0 0 5
## FourCount 438 13.82 4.19 14.0 3 31
## SixCount 438 6.32 3.56 6.0 0 20
## WicketsLost 438 5.76 2.44 6.0 0 10
## TotelRuns 438 162.01 31.91 164.0 41 248
# quartile summaries for the numeric columns, level counts for the factors
summary(object = IPLData)
## Year City Match.No Team Won TossWon
## A2019:112 Kolkata : 58 Min. : 1.00 KXIP : 56 0:220 0:219
## B2018:112 Delhi : 54 1st Qu.:14.00 MI : 56 1:218 1:219
## C2017:110 Hyderabad: 54 Median :28.00 RCB : 55
## D2016:104 Bengaluru: 52 Mean :28.07 DC : 54
## Mumbai : 50 3rd Qu.:42.00 KKR : 54
## Pune : 34 Max. :56.00 SRH : 52
## (Other) :136 (Other):111
## BatFrist HomeMatch PPRuns PPWickets FourCount
## 0:219 0:242 Min. : 15.00 Min. :0.000 Min. : 3.00
## 1:219 1:196 1st Qu.: 41.00 1st Qu.:1.000 1st Qu.:11.00
## Median : 49.00 Median :1.000 Median :14.00
## Mean : 48.97 Mean :1.345 Mean :13.82
## 3rd Qu.: 57.00 3rd Qu.:2.000 3rd Qu.:17.00
## Max. :105.00 Max. :5.000 Max. :31.00
##
## SixCount WicketsLost TotelRuns
## Min. : 0.00 Min. : 0.000 Min. : 41.0
## 1st Qu.: 4.00 1st Qu.: 4.000 1st Qu.:145.2
## Median : 6.00 Median : 6.000 Median :164.0
## Mean : 6.32 Mean : 5.758 Mean :162.0
## 3rd Qu.: 8.00 3rd Qu.: 8.000 3rd Qu.:181.8
## Max. :20.00 Max. :10.000 Max. :248.0
##
# Logistic regression for the probability of winning a match, using the
# match-level predictors plus Year and Team as fixed effects.
Model1 <- glm(
  formula = Won ~ TossWon + BatFrist + HomeMatch + PPRuns + PPWickets +
    FourCount + SixCount + WicketsLost + TotelRuns + Year + Team,
  family = binomial(),
  data = IPLData
)
# Coefficient table and deviance statistics of the fitted model.
# NOTE(review): in the recorded output TeamSH and TeamSPS have standard errors
# near 883, which suggests very sparsely populated Team levels (possibly
# mistyped team codes) -- verify against the raw data.
summary(Model1)
##
## Call:
## glm(formula = Won ~ TossWon + BatFrist + HomeMatch + PPRuns +
## PPWickets + FourCount + SixCount + WicketsLost + TotelRuns +
## Year + Team, family = binomial(), data = IPLData)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.2096 -0.8817 -0.2200 0.8023 2.5208
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.236521 1.062956 1.163 0.244714
## TossWon1 0.180688 0.320503 0.564 0.572916
## BatFrist1 -0.546096 0.326769 -1.671 0.094682 .
## HomeMatch1 0.854238 0.241622 3.535 0.000407 ***
## PPRuns 0.014309 0.012796 1.118 0.263454
## PPWickets -0.032909 0.139162 -0.236 0.813059
## FourCount 0.106101 0.046785 2.268 0.023339 *
## SixCount 0.115496 0.060711 1.902 0.057120 .
## WicketsLost -0.403997 0.062597 -6.454 1.09e-10 ***
## TotelRuns -0.008987 0.008670 -1.037 0.299940
## YearB2018 0.197266 0.321193 0.614 0.539105
## YearC2017 0.310452 0.360754 0.861 0.389478
## YearD2016 0.265715 0.366211 0.726 0.468097
## TeamDC -0.736414 0.569657 -1.293 0.196104
## TeamGL -0.968891 0.711015 -1.363 0.172981
## TeamKKR -0.922511 0.582714 -1.583 0.113392
## TeamKXIP -1.218000 0.577028 -2.111 0.034788 *
## TeamMI -0.303460 0.570386 -0.532 0.594708
## TeamRCB -1.074159 0.578840 -1.856 0.063495 .
## TeamRPS -0.778460 0.698206 -1.115 0.264875
## TeamRR -0.882995 0.639191 -1.381 0.167148
## TeamSH 12.532766 882.743599 0.014 0.988672
## TeamSPS 11.462631 882.743609 0.013 0.989640
## TeamSRH -0.253279 0.584689 -0.433 0.664880
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 607.19 on 437 degrees of freedom
## Residual deviance: 455.55 on 414 degrees of freedom
## AIC: 503.55
##
## Number of Fisher Scoring iterations: 13
library(lme4)
## Loading required package: Matrix
# Mixed-effects logistic regression: the same match-level predictors and Year
# as fixed effects, with Team entering as a random intercept.
# NOTE(review): this fit emits convergence / "rescale variables" warnings (see
# the recorded output); rescaling the continuous predictors is the usual
# remedy but is not applied here, so the estimates stay as recorded.
Model2 <- glmer(
  formula = Won ~ TossWon + BatFrist + HomeMatch + PPRuns + PPWickets +
    FourCount + SixCount + WicketsLost + TotelRuns + Year + (1 | Team),
  data = IPLData,
  family = binomial,
  control = glmerControl(optimizer = "bobyqa"),
  nAGQ = 1
)
## Warning in checkConv(attr(opt, "derivs"), opt$par, ctrl =
## control$checkConv, : Model failed to converge with max|grad| = 0.0113589
## (tol = 0.001, component 1)
## Warning in checkConv(attr(opt, "derivs"), opt$par, ctrl = control$checkConv, : Model is nearly unidentifiable: very large eigenvalue
## - Rescale variables?;Model is nearly unidentifiable: large eigenvalue ratio
## - Rescale variables?
# Print the model summary including the correlation matrix of the fixed
# effects. The argument name is spelled out in full rather than relying on
# partial matching of `corr` to `correlation`.
print(summary(Model2), correlation = TRUE)
## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: binomial ( logit )
## Formula:
## Won ~ TossWon + BatFrist + HomeMatch + PPRuns + PPWickets + FourCount +
## SixCount + WicketsLost + TotelRuns + Year + (1 | Team)
## Data: IPLData
## Control: glmerControl(optimizer = "bobyqa")
##
## AIC BIC logLik deviance df.resid
## 494.3 551.5 -233.2 466.3 424
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -3.9991 -0.6550 -0.1764 0.6473 3.8540
##
## Random effects:
## Groups Name Variance Std.Dev.
## Team (Intercept) 0.005052 0.07108
## Number of obs: 438, groups: Team, 12
##
## Fixed effects:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.733547 1.003974 0.731 0.464997
## TossWon1 0.210946 0.316997 0.665 0.505762
## BatFrist1 -0.499366 0.322256 -1.550 0.121239
## HomeMatch1 0.794848 0.234892 3.384 0.000715 ***
## PPRuns 0.011445 0.012477 0.917 0.358996
## PPWickets -0.042722 0.137850 -0.310 0.756621
## FourCount 0.096557 0.045686 2.114 0.034556 *
## SixCount 0.094762 0.058462 1.621 0.105038
## WicketsLost -0.413741 0.061367 -6.742 1.56e-11 ***
## TotelRuns -0.007134 0.008603 -0.829 0.406946
## YearB2018 0.217527 0.317939 0.684 0.493863
## YearC2017 0.211783 0.329044 0.644 0.519814
## YearD2016 0.107884 0.333012 0.324 0.745965
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of fixed effects could have been required in summary()
##
## Correlation of Fixed Effects:
## (Intr) TssWn1 BtFrs1 HmMtc1 PPRuns PPWckt ForCnt SixCnt WcktsL
## TossWon1 -0.289
## BatFrist1 -0.202 0.652
## HomeMatch1 -0.019 -0.051 -0.170
## PPRuns -0.313 -0.064 0.124 -0.018
## PPWickets -0.285 -0.086 0.046 -0.060 0.395
## FourCount 0.242 -0.098 -0.053 0.039 -0.188 -0.071
## SixCount 0.347 -0.128 -0.091 0.046 -0.052 -0.100 0.610
## WicketsLost -0.256 -0.026 -0.097 -0.037 -0.191 -0.378 0.099 0.126
## TotelRuns -0.597 0.149 -0.017 -0.049 -0.110 0.076 -0.723 -0.779 0.005
## YearB2018 -0.052 0.010 0.008 0.053 -0.057 -0.072 0.072 -0.015 -0.053
## YearC2017 -0.010 -0.001 -0.019 0.057 -0.106 -0.120 0.113 0.147 -0.026
## YearD2016 -0.136 -0.028 -0.024 0.045 0.033 -0.013 0.131 0.209 0.061
## TtlRns YB2018 YC2017
## TossWon1
## BatFrist1
## HomeMatch1
## PPRuns
## PPWickets
## FourCount
## SixCount
## WicketsLost
## TotelRuns
## YearB2018 -0.063
## YearC2017 -0.129 0.489
## YearD2016 -0.152 0.454 0.481
## convergence code: 0
## Model failed to converge with max|grad| = 0.0113589 (tol = 0.001, component 1)
## Model is nearly unidentifiable: very large eigenvalue
## - Rescale variables?
## Model is nearly unidentifiable: large eigenvalue ratio
## - Rescale variables?