# Import the required libraries
library(tidyverse)
library(dplyr)
# Load the match-level data; text columns are read in as factors
matches_path <- "~/Documents/Rdocs/matches.csv"
data <- read.csv(file = matches_path, stringsAsFactors = TRUE)
# Inspect the type of every column together with a preview of its values
str(data)
'data.frame': 1095 obs. of 20 variables:
$ id : int 335982 335983 335984 335985 335986 335987 335988 335989 335990 335991 ...
$ season : Factor w/ 17 levels "2007/08","2009",..: 1 1 1 1 1 1 1 1 1 1 ...
$ city : Factor w/ 36 levels "Abu Dhabi","Ahmedabad",..: 3 8 11 27 24 19 17 9 17 8 ...
$ date : Factor w/ 823 levels "2008-04-18","2008-04-19",..: 1 2 2 3 3 4 5 6 7 8 ...
$ match_type : Factor w/ 8 levels "3rd Place Play-Off",..: 5 5 5 5 5 5 5 5 5 5 ...
$ player_of_match: Factor w/ 291 levels "A Chandila","A Kumble",..: 38 151 152 175 58 263 278 158 288 114 ...
$ venue : Factor w/ 58 levels "Arun Jaitley Stadium",..: 24 41 17 56 15 47 43 28 43 41 ...
$ team1 : Factor w/ 19 levels "Chennai Super Kings",..: 17 7 4 11 9 14 2 1 2 7 ...
$ team2 : Factor w/ 19 levels "Chennai Super Kings",..: 9 1 14 17 2 7 4 11 14 11 ...
$ toss_winner : Factor w/ 19 levels "Chennai Super Kings",..: 17 1 14 11 2 7 2 11 14 11 ...
$ toss_decision : Factor w/ 2 levels "bat","field": 2 1 1 1 1 1 1 2 2 2 ...
$ winner : Factor w/ 19 levels "Chennai Super Kings",..: 9 1 4 17 9 14 4 1 14 7 ...
$ result : Factor w/ 4 levels "no result","runs",..: 2 2 4 4 4 4 4 2 4 2 ...
$ result_margin : int 140 33 9 5 5 6 9 6 3 66 ...
$ target_runs : int 223 241 130 166 111 167 143 209 215 183 ...
$ target_overs : num 20 20 20 20 20 20 20 20 20 20 ...
$ super_over : Factor w/ 2 levels "N","Y": 1 1 1 1 1 1 1 1 1 1 ...
$ method : Factor w/ 1 level "D/L": NA NA NA NA NA NA NA NA NA NA ...
$ umpire1 : Factor w/ 62 levels "A Deshmukh","A Nand Kishore",..: 8 35 6 52 11 6 25 19 8 6 ...
$ umpire2 : Factor w/ 62 levels "A Deshmukh","A Nand Kishore",..: 42 53 16 15 25 41 5 16 31 5 ...
Head of the data
# Preview the first six rows of the data
head(data, n = 6L)
1. Grouping the data
a) Group by match type and season
# Group the matches by match type and season, then compute the average result
# margin, the average target overs and the median target runs of each group.
# Missing values are dropped inside every aggregate: without `na.rm = TRUE` a
# single NA in a group would turn that group's whole summary into NA.
# `.groups = "drop"` returns an ungrouped table, so later steps do not
# silently inherit the grouping.
# NOTE(review): result_margin presumably counts runs for some matches and
# wickets for others, so this average mixes units - confirm whether the
# summary should also be split by `result`.
match_type_season_summary <- data |>
  group_by(match_type, season) |>
  summarise(
    Avg_result_margin = mean(result_margin, na.rm = TRUE),
    Avg_target_overs = mean(target_overs, na.rm = TRUE),
    Median_target_runs = median(target_runs, na.rm = TRUE),
    .groups = "drop"
  )
Let's create a visualization for better understanding.
# Heatmap of the average result margin: seasons on the x-axis, match types on
# the y-axis, and darker tiles for larger average margins.
heatmap_labels <- labs(
  title = "Average Result Margin of matches over seasons by match_type",
  x = "season",
  y = "match_type",
  fill = "Avg_result_margin"
)
match_type_season_densitymap <- match_type_season_summary |>
  ggplot(aes(x = season, y = match_type, fill = Avg_result_margin)) +
  geom_tile() +
  scale_fill_gradient(low = "lightblue", high = "darkblue") +
  heatmap_labels +
  theme_minimal()
# Render the heatmap
print(match_type_season_densitymap)

Finals have been close encounters: their average result margin is low
compared to that of the eliminators.
We can see that the eliminators include 2 close-range matches and 2
moderate-margin matches, whereas only one final has a high average
result margin.
b) Group by city and result
# Group the matches by city and result to see in which cities matches are
# mostly won by runs, summarising the average result margin, the average
# target overs and the median target runs of each city-result pair.
# Missing values are dropped inside every aggregate: without `na.rm = TRUE` a
# single NA in a group would turn that group's whole summary into NA.
# `.groups = "drop"` returns an ungrouped table.
city_result_summary <- data |>
  group_by(city, result) |>
  summarise(
    Avg_result_margin = mean(result_margin, na.rm = TRUE),
    Avg_target_overs = mean(target_overs, na.rm = TRUE),
    Median_target_runs = median(target_runs, na.rm = TRUE),
    .groups = "drop"
  )
Let's create a visualization for better understanding.
# Bubble chart with one point per city-result pair: average result margin on
# the x-axis, median target runs on the y-axis, point size for the average
# target overs and point colour for the result.
city_result_graph <- ggplot(
  city_result_summary,
  aes(
    x = Avg_result_margin,
    y = Median_target_runs,
    size = Avg_target_overs,
    color = result
  )
) +
  geom_point(alpha = 0.7) +
  scale_size_continuous(range = c(3, 5)) +
  labs(
    title = "Cities by results with Average result margin and Median target runs",
    x = "Avg_result_margin ",
    y = "Median_target_runs",
    size = "Avg_target_overs",
    # The colour aesthetic is mapped to `result`, so the legend is titled
    # accordingly (it used to read "Genre")
    color = "result"
  ) +
  theme_minimal()
print(city_result_graph)
Warning: Removed 14 rows containing missing values or values outside the scale range (`geom_point()`).

We can say that, across the various cities, more of the close-range
matches are won by wickets than by runs.
We can also say that, in various cities, more of the matches with
median target runs around 150 end up being won by runs.
c) Group by result margin and result
# Bin the result margin into ranges, then compute the average target runs and
# average target overs for every range-result pair.
# `cut()` uses right-closed intervals here: (0, 50], (50, 100], ...
# NOTE(review): the last bin is open-ended (200, Inf] although it is labelled
# "200-300"; the label is kept in case other code matches on it.
# Missing values are dropped inside every aggregate: without `na.rm = TRUE` a
# single NA in a group would turn that group's whole summary into NA.
# `.groups = "drop"` returns an ungrouped table.
result_margin_result_summary <- data |>
  mutate(
    result_margin_range = cut(
      result_margin,
      breaks = c(0, 50, 100, 150, 200, Inf),
      labels = c("0-50", "50-100", "100-150", "150-200", "200-300")
    )
  ) |>
  group_by(result_margin_range, result) |>
  summarise(
    Avg_target_runs = mean(target_runs, na.rm = TRUE),
    Avg_target_overs = mean(target_overs, na.rm = TRUE),
    .groups = "drop"
  )
Let's visualize the average target runs for each result margin range by result.
# Bar chart of the average target runs per result-margin range, split by
# result. The bars are placed side by side: stacking them would add the
# averages of the different results together, and that sum has no meaning.
result_margin_result_bar <- ggplot(
  result_margin_result_summary,
  aes(x = result_margin_range, y = Avg_target_runs, fill = result)
) +
  geom_col(position = "dodge") +
  labs(
    title = "Average target runs for result margin by result",
    x = "result_margin_range ",
    y = " Avg_target_runs",
    fill = "result"
  ) +
  theme_minimal()
# Display the grouped bar chart
print(result_margin_result_bar)
Warning: Removed 1 row containing missing values or values outside the scale range (`geom_bar()`).

From this, we can conclude that most of the matches won by runs have
average target runs of about 200.
Matches that were won by wickets are close-range encounters, as their
result margin is low.
Lowest Probabilities
# Share of all matches that belongs to each season; keep only the season with
# the smallest share (the first row after sorting in ascending order).
total_matches <- nrow(data)
lowest_probability <- data |>
  group_by(season) |>
  summarise(Probability = n() / total_matches) |>
  arrange(Probability) |>
  slice_head(n = 1)
# Attach the lowest-probability flag to the data: matches from the season
# found above are labelled "Lowest Probability", all others "Normal"
in_rarest_season <- data$season %in% lowest_probability$season
data <- data |>
  mutate(
    Lowest_Probability = ifelse(in_rarest_season, "Lowest Probability", "Normal")
  )
# Let's look at the first six rows to pick two categorical columns whose
# combinations we will compute probabilities for.
head(data, n = 6L)
Let's find the probability of specific combinations.
But first, let's check which combinations of season and result are
missing from the dataset, to avoid ambiguity.
# Season-result pairs that actually occur in the data
existing_combinations <- unique(select(data, season, result))
# Create a data frame of all possible combinations of season and result
all_combinations <- expand.grid(
  season = unique(data$season),
  result = unique(data$result)
)
# Find the missing combinations: pairs that are possible but never observed.
# The join keys are spelled out so the join does not rely on guessed columns.
missing_combinations <- anti_join(
  all_combinations,
  existing_combinations,
  by = c("season", "result")
)
print(missing_combinations)
There are 21 missing season-result combinations. Let's take each
season-result pair and count its occurrences.
# Count the matches for every season-result pair and sort the pairs from the
# most to the least frequent. `.groups = "drop"` returns an ungrouped table,
# so the result does not stay grouped by season.
combination_count <- data |>
  group_by(season, result) |>
  summarise(Count = n(), .groups = "drop") |>
  arrange(desc(Count))
print(combination_count)
Now we can see which combinations occur frequently and which rarely.
From this we can say that in 2012 there were 40 matches won by
wickets, and in 2015 there were only 24 matches won by wickets.

The most frequent combinations are 2023-runs and 2012-wickets.
Similarly, we can find the least probable combination.
From this graph, we can also say that the least probable set is
2016-runs.
If we try to find the combination 2012-tie, we cannot retrieve any
data. This may be because of one of several issues, such as:
data collection errors, data entry errors, or logical constraints
(e.g. the pair does not exist in the real world).
In our case, it could be any one of these issues.
