## install packages and libraries
#install.packages("tidyverse")
#install.packages("ggplot2")
#install.packages("corrplot")
Logistic Regression with Cricket Dataset
Motivation
Cricket is a game watched and played by billions of people across the world. Second in global popularity only to football (soccer), it is extremely popular in South Asia, Australia, Africa, and Europe.
About Dataset
The asia_cup data set includes data from each cricket match played in all Asia Cup Tournaments from 1984 (the first one) to 2022. The Asia Cup is a tournament that now takes place every two years, alternating host cities in different countries throughout Asia. Kaggle-link
Column | Description | Data type |
Team | Team playing | character |
Opponent | The team played against | character |
Host | Venue played at | character |
Year | Tournament year played | int |
Toss | Coin toss to determine who starts batting or bowling (0 = lost, 1 = won) | factor |
Selection | Team’s selection after winning / losing the toss (0 = Batting, 1 = Bowling) | factor |
Run Scored | Total runs scored by that team | int |
Fours | Total scored fours for that team | int |
Sixes | Total scored sixes for that team | int |
Extras | Amount of extra runs scored by that team | int |
Highest Score | Highest individual number of runs scored | int |
Result | lost match, 1 = won match | factor |
Given Extras | Number of extra runs given up | int |
Lets Code
library(corrplot)
corrplot 0.95 loaded
library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr 1.1.4 ✔ readr 2.1.5
✔ forcats 1.0.0 ✔ stringr 1.5.1
✔ ggplot2 3.5.1 ✔ tibble 3.2.1
✔ lubridate 1.9.4 ✔ tidyr 1.3.1
✔ purrr 1.0.2
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag() masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggplot2)
STEP 1 - DATA EXPLORE
# read the data
<- read.csv("cricket_asia_cup.csv")
data head(data)
Team Opponent Host Year Toss Selection Run.Scored Fours Sixes
1 Pakistan Sri Lanka Sharjah 1984 0 0 187 9 3
2 Sri Lanka Pakistan Sharjah 1984 1 1 190 11 1
3 India Sri Lanka Sharjah 1984 1 1 97 9 0
4 Sri Lanka India Sharjah 1984 0 0 96 7 0
5 India Pakistan Sharjah 1984 1 0 188 13 3
6 Pakistan India Sharjah 1984 0 1 134 5 0
Extras.Scored Highest.Score Result Given.Extras
1 21 47 0 26
2 26 57 1 21
3 14 51 1 8
4 8 38 0 14
5 17 56 1 5
6 5 35 0 17
str(data)
'data.frame': 252 obs. of 13 variables:
$ Team : chr "Pakistan" "Sri Lanka" "India" "Sri Lanka" ...
$ Opponent : chr "Sri Lanka" "Pakistan" "Sri Lanka" "India" ...
$ Host : chr "Sharjah" "Sharjah" "Sharjah" "Sharjah" ...
$ Year : int 1984 1984 1984 1984 1984 1984 1986 1986 1986 1986 ...
$ Toss : int 0 1 1 0 1 0 1 0 0 1 ...
$ Selection : int 0 1 1 0 0 1 1 0 0 1 ...
$ Run.Scored : int 187 190 97 96 188 134 116 197 94 98 ...
$ Fours : int 9 11 9 7 13 5 10 14 0 4 ...
$ Sixes : int 3 1 0 0 3 0 0 3 0 0 ...
$ Extras.Scored: int 21 26 14 8 17 5 14 15 9 5 ...
$ Highest.Score: int 47 57 51 38 56 35 34 39 37 47 ...
$ Result : int 0 1 1 0 1 0 0 1 0 1 ...
$ Given.Extras : int 26 21 8 14 5 17 15 14 5 9 ...
dim(data)
[1] 252 13
STEP 2 - DATA CLEANING
sum(duplicated(data)) # NO DUPLIACTES
[1] 0
sum(is.na(data)) # NO NULLS
[1] 0
colSums(is.na(data))
Team Opponent Host Year Toss
0 0 0 0 0
Selection Run.Scored Fours Sixes Extras.Scored
0 0 0 0 0
Highest.Score Result Given.Extras
0 0 0
Insights -The data is clean
STEP 3 - Manipulation - Data wrangling
## lets rename the columns for better understanding..
<- data %>%
data rename("Playing_Team" = "Team",
"Opponent_Team" = "Opponent",
"Host_Ground" = "Host",
"Toss_Won" = "Toss",
"Batting_First" = "Selection",
"Total_Runs" = "Run.Scored",
"Extras_Scored" = "Extras.Scored",
"Highest_Individual_Score" = "Highest.Score",
"Match_Result" = "Result",
"Extras_Given" = "Given.Extras")
colnames(data)
[1] "Playing_Team" "Opponent_Team"
[3] "Host_Ground" "Year"
[5] "Toss_Won" "Batting_First"
[7] "Total_Runs" "Fours"
[9] "Sixes" "Extras_Scored"
[11] "Highest_Individual_Score" "Match_Result"
[13] "Extras_Given"
# change the data type
$Batting_First <- as.factor(data$Batting_First)
data$Toss_Won <- as.factor(data$Toss_Won)
data$Match_Result <- as.factor(data$Match_Result)
data
glimpse(data)
Rows: 252
Columns: 13
$ Playing_Team <chr> "Pakistan", "Sri Lanka", "India", "Sri Lanka"…
$ Opponent_Team <chr> "Sri Lanka", "Pakistan", "Sri Lanka", "India"…
$ Host_Ground <chr> "Sharjah", "Sharjah", "Sharjah", "Sharjah", "…
$ Year <int> 1984, 1984, 1984, 1984, 1984, 1984, 1986, 198…
$ Toss_Won <fct> 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, …
$ Batting_First <fct> 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, …
$ Total_Runs <int> 187, 190, 97, 96, 188, 134, 116, 197, 94, 98,…
$ Fours <int> 9, 11, 9, 7, 13, 5, 10, 14, 0, 4, 0, 0, 15, 1…
$ Sixes <int> 3, 1, 0, 0, 3, 0, 0, 3, 0, 0, 0, 0, 1, 4, 0, …
$ Extras_Scored <int> 21, 26, 14, 8, 17, 5, 14, 15, 9, 5, 19, 10, 9…
$ Highest_Individual_Score <int> 47, 57, 51, 38, 56, 35, 34, 39, 37, 47, 44, 4…
$ Match_Result <fct> 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, …
$ Extras_Given <int> 26, 21, 8, 14, 5, 17, 15, 14, 5, 9, 10, 19, 1…
STEP 4 - Describe and summary
# summary of all numeric varaibles
summary(data)
Playing_Team Opponent_Team Host_Ground Year
Length:252 Length:252 Length:252 Min. :1984
Class :character Class :character Class :character 1st Qu.:2000
Mode :character Mode :character Mode :character Median :2008
Mean :2007
3rd Qu.:2016
Max. :2022
Toss_Won Batting_First Total_Runs Fours Sixes
0:127 0:126 Min. : 38.0 Min. : 0.00 Min. : 0.000
1:125 1:126 1st Qu.:147.0 1st Qu.:10.00 1st Qu.: 1.000
Median :193.0 Median :15.00 Median : 2.000
Mean :201.9 Mean :15.63 Mean : 2.913
3rd Qu.:253.2 3rd Qu.:20.00 3rd Qu.: 4.250
Max. :385.0 Max. :41.00 Max. :14.000
Extras_Scored Highest_Individual_Score Match_Result Extras_Given
Min. : 0.00 Min. : 8.00 0:128 Min. : 0.00
1st Qu.: 8.00 1st Qu.: 47.00 1:124 1st Qu.: 8.00
Median :12.00 Median : 66.50 Median :12.00
Mean :13.24 Mean : 69.31 Mean :13.24
3rd Qu.:18.00 3rd Qu.: 85.00 3rd Qu.:18.00
Max. :38.00 Max. :183.00 Max. :38.00
Insights -
This gives us interesting insights like the min and max runs scored over the period of time. The min and max number of fours and sixes ever scored. The highest score an individual has ever scored is 183, etc
STEP 5 - Data Visualization
# Bar Plot of Wins vs. Losses – To check class imbalance
%>%
data ggplot(aes(Match_Result)) +
geom_bar(colour = "black", fill = "steelblue") +
geom_text(stat = "count", aes(label = ..count..), vjust = -0.5) +
scale_x_discrete(labels = c("0" = "Loss", "1" = "Win")) +
labs(title = "Wins vs. Losses", x = "Match Result", y = "Count") +
theme_minimal()
Warning: The dot-dot notation (`..count..`) was deprecated in ggplot2 3.4.0.
ℹ Please use `after_stat(count)` instead.
# the dataset is fairly balanced
#Box Plots of Run.Scored, Fours, Sixes vs. Result – To see how these features vary between wins and losses.
# Reshape data for boxplots
<- data %>%
data_long pivot_longer(cols = c(Total_Runs, Fours, Sixes),
names_to = "Metric",
values_to = "Value")
print(data_long)
# A tibble: 756 × 12
Playing_Team Opponent_Team Host_Ground Year Toss_Won Batting_First
<chr> <chr> <chr> <int> <fct> <fct>
1 Pakistan Sri Lanka Sharjah 1984 0 0
2 Pakistan Sri Lanka Sharjah 1984 0 0
3 Pakistan Sri Lanka Sharjah 1984 0 0
4 Sri Lanka Pakistan Sharjah 1984 1 1
5 Sri Lanka Pakistan Sharjah 1984 1 1
6 Sri Lanka Pakistan Sharjah 1984 1 1
7 India Sri Lanka Sharjah 1984 1 1
8 India Sri Lanka Sharjah 1984 1 1
9 India Sri Lanka Sharjah 1984 1 1
10 Sri Lanka India Sharjah 1984 0 0
# ℹ 746 more rows
# ℹ 6 more variables: Extras_Scored <int>, Highest_Individual_Score <int>,
# Match_Result <fct>, Extras_Given <int>, Metric <chr>, Value <int>
# Boxplot - between runs, fours and sixes wrt the match result
%>%
data_long ggplot(aes(x = Metric, y = Value, fill = Match_Result)) +
geom_boxplot() +
labs(title = "Boxplot of Runs, Fours, and Sixes by Match Result",
x = "Metric",
y = "Value") +
scale_x_discrete(labels = c("Total_Runs" = "Total Runs", "Fours" = "Fours", "Sixes" = "Sixes")) +
theme_minimal()
# Select numeric variables
<- data %>% select(Total_Runs, Fours, Sixes,
numeric_data
Extras_Scored, Highest_Individual_Score, Extras_Given)
# Compute the correlation matrix
<- cor(numeric_data) cor_matrix
# Plot the heatmap with correlation numbers inside the boxes
corrplot(cor_matrix, method = "color",
addCoef.col = "black", # Add correlation numbers inside the boxes
number.cex = 0.7, # Adjust number size for visibility
tl.col = "black", # Keep default variable names
tl.srt = 45, # Rotate the variable names for readability
diag = FALSE) # Optionally hide the diagonal
Insights - Total runs are pretty highly correlated with number of Fours anf Highest individual score.
#Scatter Plot (Run.Scored vs. Given.Extras, colored by Result) – To spot trends in performance.
%>%
data ggplot(aes(x = Total_Runs, y = Extras_Given, color = Match_Result)) +
geom_point() +
labs(title = "Scatter Plot of Runs vs. Extras Given by Match Result",
x = "Total Runs",
y = "Extras Given") +
scale_color_manual(values = c("red", "green")) +
theme_minimal()
#Toss Decision Impact (Bar Chart of Toss vs. Result) – To assess whether winning the toss influences match outcome.
%>%
data ggplot(aes(x = Toss_Won, fill = Match_Result)) +
geom_bar(position = "dodge", colour = "black") +
scale_x_discrete(labels = c("0" = "Toss Lost", "1" = "Toss Won")) +
labs(title = "Toss Decision vs. Match Result", x = "Toss Result", y = "Count") +
theme_minimal()
When a team losing a toss the probability of winning decreases compared to when they win a toss
STEP 6 - Logistic Regression
# create training and testing datasets
set.seed(123)
<- sample(1:nrow(data), 0.8*nrow(data)) row_number
<- data[row_number,]
train dim(train)
[1] 201 13
<- data[-row_number,]
test dim(test)
[1] 51 13
Model 1
# create a simple logistic regression model
<- glm(Match_Result ~ Total_Runs,
model data = train,
family = binomial)
summary(model)
Call:
glm(formula = Match_Result ~ Total_Runs, family = binomial, data = train)
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -2.083988 0.499787 -4.170 3.05e-05 ***
Total_Runs 0.010099 0.002375 4.251 2.12e-05 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 278.52 on 200 degrees of freedom
Residual deviance: 258.17 on 199 degrees of freedom
AIC: 262.17
Number of Fisher Scoring iterations: 4
Insights - the AIC and residuals must be low - lower AIC (Akaike Information Criterion) and lower residual deviance generally indicate a better-fitting model.
# Use the model to make predictions on the test data
<- predict(model, newdata = test, type = "response")
pred_probs
# Convert predicted probabilities to binary values (0 or 1) based on a threshold of 0.5
<- ifelse(pred_probs > 0.5, 1, 0) predictions
# Create a confusion matrix to evaluate model performance
<- table(Predicted = predictions, Actual = test$Match_Result)
confusion_matrix print(confusion_matrix)
Actual
Predicted 0 1
0 15 11
1 10 15
# R2 SCORE
# Round the calculated value and print it with the message
print(paste("R2 SCORE VALUE:",
round((15 + 15) / (15 + 15 + 11 + 10) * 100, 2), "%"))
[1] "R2 SCORE VALUE: 58.82 %"
Insights -
The confusion matrix describes that true positives and true negatives are 15 each the means the model correctly predicts the number of wins and loss.
On the other hand, the False negatives are 11, i.e the team won the match but was predicted as lost and the False postives show 10, i.e the team lost but still predicts win.
For our dataset, having a low FN and FP is both essential.
Lets try other variables.
STEP 7 - Multiple Logistic Regression
Model 2
<- glm(Match_Result ~ Total_Runs + Fours + Sixes +
model_all + Highest_Individual_Score +
Extras_Scored + Toss_Won + Batting_First,
Extras_Given data = train, family = binomial)
summary(model_all)
Call:
glm(formula = Match_Result ~ Total_Runs + Fours + Sixes + Extras_Scored +
Highest_Individual_Score + Extras_Given + Toss_Won + Batting_First,
family = binomial, data = train)
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -2.7848034 0.6806684 -4.091 4.29e-05 ***
Total_Runs -0.0001022 0.0051681 -0.020 0.9842
Fours 0.0411909 0.0357524 1.152 0.2493
Sixes 0.0626523 0.0756731 0.828 0.4077
Extras_Scored -0.0276529 0.0235280 -1.175 0.2399
Highest_Individual_Score 0.0210581 0.0091853 2.293 0.0219 *
Extras_Given 0.0131937 0.0249865 0.528 0.5975
Toss_Won1 0.9754408 0.3275128 2.978 0.0029 **
Batting_First1 0.4253359 0.3398583 1.252 0.2107
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 278.52 on 200 degrees of freedom
Residual deviance: 233.20 on 192 degrees of freedom
AIC: 251.2
Number of Fisher Scoring iterations: 4
## the AIC and Residual has reduced when we included all the variables
# Use the model to make predictions on the test data
<- predict(model_all, newdata = test, type = "response")
pred_probs_all
# Convert predicted probabilities to binary values (0 or 1) based on a threshold of 0.5
<- ifelse(pred_probs_all > 0.5, 1, 0) predictions_all
# Create a confusion matrix to evaluate model performance
<- table(Predicted = predictions_all, Actual = test$Match_Result)
confusion_matrix_all print(confusion_matrix_all)
Actual
Predicted 0 1
0 14 9
1 11 17
# R2 SCORE
# Round the calculated value and print it with the message
print(paste("R2 SCORE VALUE:",
round((14 + 17) / (14 + 17 + 11 + 9) * 100, 2), "%"))
[1] "R2 SCORE VALUE: 60.78 %"
# increase in R2 score value
Model3
# we will not consider the highly corrleated varaibles for this model
# total runs, fours and highest individual score are highly correlated
<- glm(Match_Result ~ Total_Runs + Sixes +
model_new +
Extras_Scored + Toss_Won + Batting_First,
Extras_Given data = train, family = binomial)
summary(model_new)
Call:
glm(formula = Match_Result ~ Total_Runs + Sixes + Extras_Scored +
Extras_Given + Toss_Won + Batting_First, family = binomial,
data = train)
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -2.818342 0.666696 -4.227 2.36e-05 ***
Total_Runs 0.010328 0.003021 3.419 0.000628 ***
Sixes 0.086806 0.073585 1.180 0.238127
Extras_Scored -0.039929 0.022707 -1.758 0.078674 .
Extras_Given 0.022382 0.023922 0.936 0.349470
Toss_Won1 0.828498 0.313693 2.641 0.008264 **
Batting_First1 0.575936 0.329360 1.749 0.080351 .
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 278.52 on 200 degrees of freedom
Residual deviance: 240.38 on 194 degrees of freedom
AIC: 254.38
Number of Fisher Scoring iterations: 4
# Use the model to make predictions on the test data
<- predict(model_new, newdata = test, type = "response")
pred_probs_new
# Convert predicted probabilities to binary values (0 or 1) based on a threshold of 0.5
<- ifelse(pred_probs_new > 0.5, 1, 0) predictions_new
# Create a confusion matrix to evaluate model performance
<- table(Predicted = predictions_new, Actual = test$Match_Result)
confusion_matrix_new print(confusion_matrix_new)
Actual
Predicted 0 1
0 15 6
1 10 20
# Round the calculated value and print it with the message
print(paste("R2 SCORE VALUE:",
round((15 + 20) / (15 + 20 + 6 + 10) * 100, 2), "%"))
[1] "R2 SCORE VALUE: 68.63 %"
# the R2 score incraesed to 68.63%
We can plot ROC curve to find if a different threshold value can help us increase the model accuracy. Or we can use other better regression models too.
In order to access this code in R, please check the [Github](GitHub Repository) repository.