# Load the ball-by-ball records from the local CSV export into a data frame.
my_data <- read.csv(file = "C:/Users/dell/Downloads/Ball_By_Ball.csv")
# Print per-column summary statistics as a first look at the data.
summary(object = my_data)
## MatcH_id Over_id Ball_id Innings_No
## Min. : 335987 Min. : 1.00 Min. :1.000 Min. :1.000
## 1st Qu.: 419154 1st Qu.: 5.00 1st Qu.:2.000 1st Qu.:1.000
## Median : 548382 Median :10.00 Median :4.000 Median :1.000
## Mean : 636208 Mean :10.14 Mean :3.617 Mean :1.482
## 3rd Qu.: 829742 3rd Qu.:15.00 3rd Qu.:5.000 3rd Qu.:2.000
## Max. :1082650 Max. :20.00 Max. :9.000 Max. :4.000
##
## Team_Batting Team_Bowling Striker_Batting_Position
## Length:150451 Length:150451 Min. : 1.000
## Class :character Class :character 1st Qu.: 2.000
## Mode :character Mode :character Median : 3.000
## Mean : 3.584
## 3rd Qu.: 5.000
## Max. :11.000
## NA's :13861
## Extra_Type Runs_Scored Extra_runs Wides
## Length:150451 Min. :0.000 Min. :0.00000 Min. :0.0000
## Class :character 1st Qu.:0.000 1st Qu.:0.00000 1st Qu.:0.0000
## Mode :character Median :1.000 Median :0.00000 Median :0.0000
## Mean :1.222 Mean :0.06899 Mean :0.0375
## 3rd Qu.:1.000 3rd Qu.:0.00000 3rd Qu.:0.0000
## Max. :6.000 Max. :5.00000 Max. :5.0000
##
## Legbyes Byes Noballs Penalty
## Min. :0.00000 Min. :0.000000 Min. :0.00000 Min. :0.0e+00
## 1st Qu.:0.00000 1st Qu.:0.000000 1st Qu.:0.00000 1st Qu.:0.0e+00
## Median :0.00000 Median :0.000000 Median :0.00000 Median :0.0e+00
## Mean :0.02223 Mean :0.004885 Mean :0.00434 Mean :3.3e-05
## 3rd Qu.:0.00000 3rd Qu.:0.000000 3rd Qu.:0.00000 3rd Qu.:0.0e+00
## Max. :5.00000 Max. :4.000000 Max. :5.00000 Max. :5.0e+00
##
## Bowler_Extras Out_type Caught Bowled
## Min. :0.00000 Length:150451 Min. :0.00000 Min. :0.000000
## 1st Qu.:0.00000 Class :character 1st Qu.:0.00000 1st Qu.:0.000000
## Median :0.00000 Mode :character Median :0.00000 Median :0.000000
## Mean :0.04184 Mean :0.02907 Mean :0.009186
## 3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.:0.000000
## Max. :5.00000 Max. :1.00000 Max. :1.000000
##
## Run_out LBW Retired_hurt Stumped
## Min. :0.000000 Min. :0.000000 Min. :0.00e+00 Min. :0.000000
## 1st Qu.:0.000000 1st Qu.:0.000000 1st Qu.:0.00e+00 1st Qu.:0.000000
## Median :0.000000 Median :0.000000 Median :0.00e+00 Median :0.000000
## Mean :0.005018 Mean :0.003024 Mean :5.98e-05 Mean :0.001615
## 3rd Qu.:0.000000 3rd Qu.:0.000000 3rd Qu.:0.00e+00 3rd Qu.:0.000000
## Max. :1.000000 Max. :1.000000 Max. :1.00e+00 Max. :1.000000
##
## caught_and_bowled hit_wicket ObstructingFeild Bowler_Wicket
## Min. :0.000000 Min. :0.00e+00 Min. :0.0e+00 Min. :0.00000
## 1st Qu.:0.000000 1st Qu.:0.00e+00 1st Qu.:0.0e+00 1st Qu.:0.00000
## Median :0.000000 Median :0.00e+00 Median :0.0e+00 Median :0.00000
## Mean :0.001402 Mean :5.98e-05 Mean :6.6e-06 Mean :0.04435
## 3rd Qu.:0.000000 3rd Qu.:0.00e+00 3rd Qu.:0.0e+00 3rd Qu.:0.00000
## Max. :1.000000 Max. :1.00e+00 Max. :1.0e+00 Max. :1.00000
##
## Match_Date Season Striker Non_Striker
## Length:150451 Min. :2008 Min. : 1.0 Min. : 1.0
## Class :character 1st Qu.:2010 1st Qu.: 40.0 1st Qu.: 40.0
## Mode :character Median :2012 Median : 96.0 Median : 96.0
## Mean :2012 Mean :136.5 Mean :135.6
## 3rd Qu.:2015 3rd Qu.:208.0 3rd Qu.:208.0
## Max. :2017 Max. :497.0 Max. :497.0
##
## Bowler Player_Out Fielders Striker_match_SK
## Min. : 1.0 Min. : 1.0 Min. : 1.0 Min. :12694
## 1st Qu.: 77.0 1st Qu.: 41.0 1st Qu.: 47.0 1st Qu.:16173
## Median :174.0 Median :107.0 Median :111.0 Median :19672
## Mean :194.1 Mean :148.6 Mean :155.4 Mean :19675
## 3rd Qu.:310.0 3rd Qu.:236.0 3rd Qu.:237.5 3rd Qu.:23127
## Max. :497.0 Max. :497.0 Max. :497.0 Max. :26685
## NA's :143013 NA's :145100
## StrikerSK NonStriker_match_SK NONStriker_SK Fielder_match_SK
## Min. : 0.0 Min. :12694 Min. : 0.0 Min. : -1
## 1st Qu.: 39.0 1st Qu.:16173 1st Qu.: 39.0 1st Qu.: -1
## Median : 95.0 Median :19672 Median : 95.0 Median : -1
## Mean :135.5 Mean :19675 Mean :134.6 Mean : 690
## 3rd Qu.:207.0 3rd Qu.:23127 3rd Qu.:207.0 3rd Qu.: -1
## Max. :496.0 Max. :26685 Max. :496.0 Max. :26680
##
## Fielder_SK Bowler_match_SK BOWLER_SK PlayerOut_match_SK
## Min. : -1.000 Min. :12697 Min. : 0.0 Min. : -1.0
## 1st Qu.: -1.000 1st Qu.:16175 1st Qu.: 76.0 1st Qu.: -1.0
## Median : -1.000 Median :19674 Median :173.0 Median : -1.0
## Mean : 4.527 Mean :19677 Mean :193.1 Mean : 970.3
## 3rd Qu.: -1.000 3rd Qu.:23131 3rd Qu.:309.0 3rd Qu.: -1.0
## Max. :496.000 Max. :26685 Max. :496.0 Max. :26685.0
##
## BattingTeam_SK BowlingTeam_SK Keeper_Catch Player_out_sk
## Min. : 0.000 Min. : 0.000 Min. :0.000000 Min. : -1.000
## 1st Qu.: 2.000 1st Qu.: 2.000 1st Qu.:0.000000 1st Qu.: 0.000
## Median : 4.000 Median : 4.000 Median :0.000000 Median : 0.000
## Mean : 4.346 Mean : 4.333 Mean :0.000432 Mean : 1.101
## 3rd Qu.: 6.000 3rd Qu.: 6.000 3rd Qu.:0.000000 3rd Qu.: 0.000
## Max. :12.000 Max. :12.000 Max. :1.000000 Max. :496.000
##
## MatchDateSK
## Min. :20080418
## 1st Qu.:20100411
## Median :20120520
## Mean :20125288
## 3rd Qu.:20150420
## Max. :20170521
##
# Group by 'Team_Batting' and summarize 'Runs_Scored'.
# Groups the data by the 'Team_Batting' column and calculates the total runs, the average runs per row, and the maximum runs scored by each team while batting.
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Per batting team: total, mean and maximum of 'Runs_Scored'.
# .groups = "drop" returns an ungrouped result.
summary1 <- summarise(
  group_by(my_data, Team_Batting),
  Total_Runs = sum(Runs_Scored),
  Average_Runs = mean(Runs_Scored),
  Max_Runs = max(Runs_Scored),
  .groups = "drop"
)
# Show the per-team summary
print(summary1)
## # A tibble: 21 × 4
## Team_Batting Total_Runs Average_Runs Max_Runs
## <chr> <int> <dbl> <int>
## 1 1 18267 1.18 6
## 2 10 6040 1.11 6
## 3 11 8806 1.19 6
## 4 12 1962 1.24 6
## 5 13 2318 1.23 6
## 6 2 20470 1.27 6
## 7 3 19895 1.26 6
## 8 4 19720 1.23 6
## 9 5 16703 1.21 6
## 10 6 18595 1.20 6
## # ℹ 11 more rows
#Groups the data by the ‘Innings_No’ column and calculates the total wides and total no-balls in each inning.
library(dplyr)
# Per innings number: total of 'Wides' and total of 'Noballs'.
# .groups = "drop" returns an ungrouped result.
summary2 <- summarise(
  group_by(my_data, Innings_No),
  Total_Wides = sum(Wides),
  Total_Noballs = sum(Noballs),
  .groups = "drop"
)
# Show the per-innings summary
print(summary2)
## # A tibble: 4 × 3
## Innings_No Total_Wides Total_Noballs
## <int> <int> <int>
## 1 1 2904 342
## 2 2 2736 307
## 3 3 1 2
## 4 4 1 2
#Groups the data by the 'Season' and 'Team_Batting' columns and calculates the total runs scored and the total extra runs for each batting team in each season.
library(dplyr)
# Per (Season, Team_Batting) pair: total 'Runs_Scored' and total 'Extra_runs'.
# .groups = "drop" discards the grouping structure on the result.
summary3 <- summarise(
  group_by(my_data, Season, Team_Batting),
  Total_Runs = sum(Runs_Scored),
  Total_Extra_Runs = sum(Extra_runs),
  .groups = "drop"
)
# Show the per-season, per-team summary
print(summary3)
## # A tibble: 84 × 4
## Season Team_Batting Total_Runs Total_Extra_Runs
## <int> <chr> <int> <int>
## 1 2008 1 1795 141
## 2 2008 2 1865 118
## 3 2008 3 2365 155
## 4 2008 4 2340 124
## 5 2008 5 2420 177
## 6 2008 6 2000 118
## 7 2008 7 1897 183
## 8 2008 8 2117 112
## 9 2009 1 1645 90
## 10 2009 2 2166 114
## # ℹ 74 more rows
# Grouping Seasons and Team_Batting together
# For every (Season, Team_Batting) pair, total the runs scored and express that
# total as a share ("Probability") of all runs in the data. The pair with the
# smallest share is tagged "Anomaly"; every other pair is tagged "Normal".
library(dplyr)
# Calculate the probability for each group.
# .groups = 'drop' ungroups the summary, so sum(Total_Runs) in mutate() is the
# grand total over all groups rather than a per-group total.
result <- my_data %>%
  group_by(Season, Team_Batting) %>%
  summarise(Total_Runs = sum(Runs_Scored), .groups = 'drop') %>%
  mutate(Probability = Total_Runs / sum(Total_Runs))
# View() opens a data viewer and is only useful in an interactive session;
# skip it when the script is run non-interactively (e.g. via Rscript).
if (interactive()) {
  View(result)
}
# Identify the group with the lowest probability
lowest_probability_group <- result %>%
  filter(Probability == min(Probability))
# Assign an "anomaly" tag to the lowest probability group
result <- result %>%
  mutate(Tag = ifelse(Probability == min(Probability), "Anomaly", "Normal"))
# Translate this information back into the original data frame: every row
# of my_data gains the Total_Runs, Probability and Tag of its
# (Season, Team_Batting) group.
data_with_tags <- left_join(my_data, result, by = c("Season", "Team_Batting"))
# Example visualization (bar chart): number of rows per batting team,
# coloured by the tag of the (Season, Team_Batting) group each row belongs to.
library(ggplot2)
ggplot(data_with_tags, aes(x = Team_Batting, fill = Tag)) +
  geom_bar() +
  labs(title = "Rarity of Batting Teams", x = "Team Batting", y = "Frequency") +
  theme_minimal() +
  scale_fill_manual(values = c("Normal" = "blue", "Anomaly" = "red"))
# Grouping by Season and Innings_No
library(dplyr)
# Share of all runs ("Probability") contributed by each (Season, Innings_No)
# group; the summary is ungrouped so the denominator is the grand total.
innings_result <- my_data |>
  group_by(Season, Innings_No) |>
  summarise(Total_Runs = sum(Runs_Scored), .groups = "drop") |>
  mutate(Probability = Total_Runs / sum(Total_Runs))
# The group(s) holding the smallest share
lowest_probability_inning <- filter(
  innings_result,
  Probability == min(Probability)
)
# Tag the smallest-share group as "Anomaly" and all others as "Normal"
innings_result <- mutate(
  innings_result,
  Tag = ifelse(Probability == min(Probability), "Anomaly", "Normal")
)
# Attach each group's Total_Runs, Probability and Tag to every row of my_data
data_with_inning_tags <- my_data %>%
  left_join(innings_result, by = c("Season", "Innings_No"))
print(head(data_with_inning_tags))
## MatcH_id Over_id Ball_id Innings_No Team_Batting Team_Bowling
## 1 598028 15 6 1 5 2
## 2 598028 14 1 1 5 2
## 3 598028 14 2 1 5 2
## 4 598028 14 3 1 5 2
## 5 598028 14 4 1 5 2
## 6 598028 14 5 1 5 2
## Striker_Batting_Position Extra_Type Runs_Scored Extra_runs Wides Legbyes Byes
## 1 6 No Extras 4 0 0 0 0
## 2 5 No Extras 1 0 0 0 0
## 3 3 No Extras 1 0 0 0 0
## 4 5 No Extras 1 0 0 0 0
## 5 3 No Extras 0 0 0 0 0
## 6 3 No Extras 4 0 0 0 0
## Noballs Penalty Bowler_Extras Out_type Caught Bowled Run_out LBW
## 1 0 0 0 Not Applicable 0 0 0 0
## 2 0 0 0 Not Applicable 0 0 0 0
## 3 0 0 0 Not Applicable 0 0 0 0
## 4 0 0 0 Not Applicable 0 0 0 0
## 5 0 0 0 Not Applicable 0 0 0 0
## 6 0 0 0 Not Applicable 0 0 0 0
## Retired_hurt Stumped caught_and_bowled hit_wicket ObstructingFeild
## 1 0 0 0 0 0
## 2 0 0 0 0 0
## 3 0 0 0 0 0
## 4 0 0 0 0 0
## 5 0 0 0 0 0
## 6 0 0 0 0 0
## Bowler_Wicket Match_Date Season Striker Non_Striker Bowler Player_Out
## 1 0 4/20/2013 2013 277 104 83 NA
## 2 0 4/20/2013 2013 104 6 346 NA
## 3 0 4/20/2013 2013 6 104 346 NA
## 4 0 4/20/2013 2013 104 6 346 NA
## 5 0 4/20/2013 2013 6 104 346 NA
## 6 0 4/20/2013 2013 6 104 346 NA
## Fielders Striker_match_SK StrikerSK NonStriker_match_SK NONStriker_SK
## 1 NA 20336 276 20333 103
## 2 NA 20333 103 20328 5
## 3 NA 20328 5 20333 103
## 4 NA 20333 103 20328 5
## 5 NA 20328 5 20333 103
## 6 NA 20328 5 20333 103
## Fielder_match_SK Fielder_SK Bowler_match_SK BOWLER_SK PlayerOut_match_SK
## 1 -1 -1 20343 82 -1
## 2 -1 -1 20348 345 -1
## 3 -1 -1 20348 345 -1
## 4 -1 -1 20348 345 -1
## 5 -1 -1 20348 345 -1
## 6 -1 -1 20348 345 -1
## BattingTeam_SK BowlingTeam_SK Keeper_Catch Player_out_sk MatchDateSK
## 1 4 1 0 0 20130420
## 2 4 1 0 0 20130420
## 3 4 1 0 0 20130420
## 4 4 1 0 0 20130420
## 5 4 1 0 0 20130420
## 6 4 1 0 0 20130420
## Total_Runs Probability Tag
## 1 11282 0.0613549 Normal
## 2 11282 0.0613549 Normal
## 3 11282 0.0613549 Normal
## 4 11282 0.0613549 Normal
## 5 11282 0.0613549 Normal
## 6 11282 0.0613549 Normal
# Visualizations of the innings tags ----
# All plots below draw from data_with_inning_tags, which carries the
# (Season, Innings_No) group's Probability and Tag on every row of my_data.
# NOTE(review): the Probability plots therefore draw one mark per row, not one
# per group - consider plotting innings_result instead; confirm intent.
library(ggplot2)

# Bar chart: number of rows per innings number, coloured by tag.
ggplot(data = data_with_inning_tags, mapping = aes(x = Innings_No, fill = Tag)) +
  geom_bar() +
  scale_fill_manual(values = c("Normal" = "blue", "Anomaly" = "red")) +
  labs(title = "Rarity of Innings", x = "Innings_No", y = "Frequency") +
  theme_minimal()

# Violin plot: distribution of Probability within each tag.
ggplot(
  data = data_with_inning_tags,
  mapping = aes(x = Tag, y = Probability, fill = Tag)
) +
  geom_violin() +
  scale_fill_manual(values = c("Normal" = "blue", "Anomaly" = "red")) +
  labs(title = "Rarity of Innings (Violin Plot)", x = "Tag", y = "Probability") +
  theme_minimal()

# Scatterplot: Probability against innings number, each point labelled with
# its tag slightly above the point.
ggplot(
  data = data_with_inning_tags,
  mapping = aes(x = Innings_No, y = Probability, color = Tag)
) +
  geom_point() +
  geom_text(mapping = aes(label = Tag), nudge_y = 0.02, show.legend = FALSE) +
  scale_color_manual(values = c("Normal" = "blue", "Anomaly" = "red")) +
  labs(
    title = "Rarity of Innings (Scatterplot)",
    x = "Innings_No",
    y = "Probability"
  ) +
  theme_minimal()

# Box plot: distribution of Probability for "Normal" versus "Anomaly" rows.
ggplot(
  data = data_with_inning_tags,
  mapping = aes(x = Tag, y = Probability, fill = Tag)
) +
  geom_boxplot() +
  scale_fill_manual(values = c("Normal" = "yellow", "Anomaly" = "red")) +
  labs(title = "Rarity of Innings (Box Plot)", x = "Tag", y = "Probability") +
  theme_minimal()

# Time series plot: Probability by season, one line per tag, with a point
# drawn on top of the line for each row.
ggplot(
  data = data_with_inning_tags,
  mapping = aes(x = Season, y = Probability, color = Tag, group = Tag)
) +
  geom_line() +
  geom_point() +
  scale_color_manual(values = c("Normal" = "orange", "Anomaly" = "red")) +
  labs(
    title = "Change in Rarity of Innings Over Time",
    x = "Season",
    y = "Probability"
  ) +
  theme_minimal()
#Pick 2-3 categorical variables.
#These are two categorical variables for which we know all possible values:
#Team_Batting: This variable represents the batting team. Its possible values are the unique batting-team values in the dataset.
#Out_type: This variable represents the type of dismissal (e.g., "caught", "bowled", "run out"). Its possible values are the unique dismissal types in the dataset.
#Which combinations never show up? Why might that be?
# To identify combinations that never show up and understand why that might be the case, you can perform a cross-tabulation (contingency table) analysis of the two categorical variables: Team_Batting and Out_type. This will allow you to see which combinations are missing and potentially provide insights into why they are absent.
library(dplyr)
library(tidyr)
# Create a contingency table: one row per Team_Batting value and one column
# per Out_type value, each cell holding the number of rows with that pairing.
# count() only returns pairings that occur in the data, so values_fill = 0L
# supplies the count for pairings that never occur.
# pivot_wider() replaces the superseded spread(); names_sort = TRUE orders the
# Out_type columns by sorted name instead of by first appearance.
contingency_table <- my_data %>%
  count(Team_Batting, Out_type) %>%
  pivot_wider(
    names_from = Out_type,
    values_from = n,
    values_fill = 0L,
    names_sort = TRUE
  )
print(contingency_table)
## Team_Batting bowled caught caught and bowled hit wicket
## 1 1 134 356 17 0
## 2 10 65 139 11 0
## 3 11 63 180 8 3
## 4 12 10 29 2 0
## 5 13 24 45 3 0
## 6 2 168 360 19 2
## 7 3 105 369 24 1
## 8 4 149 417 24 0
## 9 5 137 314 24 1
## 10 6 136 411 17 0
## 11 7 149 422 23 1
## 12 8 91 247 13 0
## 13 9 20 35 2 0
## 14 Delhi Daredevils 20 42 5 0
## 15 Gujarat Lions 17 42 3 0
## 16 Kings XI Punjab 19 42 2 0
## 17 Kolkata Knight Riders 12 50 3 1
## 18 Mumbai Indians 12 56 3 0
## 19 Rising Pune Supergiants 19 39 4 0
## 20 Royal Challengers Bangalore 21 49 0 0
## 21 Sunrisers Hyderabad 11 34 4 0
## Keeper Catch lbw Not Applicable obstructing the field retired hurt run out
## 1 68 55 14680 1 0 83
## 2 24 16 5145 0 0 31
## 3 35 16 7020 0 0 43
## 4 10 1 1512 0 1 9
## 5 11 5 1789 0 0 9
## 6 80 53 15370 0 0 69
## 7 58 39 15129 0 0 71
## 8 79 48 15159 0 0 93
## 9 71 31 13159 0 0 78
## 10 71 44 14707 0 0 73
## 11 76 57 16148 0 6 78
## 12 38 27 8549 0 1 49
## 13 9 7 1496 0 0 11
## 14 11 7 1553 0 0 9
## 15 5 8 1589 0 0 8
## 16 9 8 1520 0 0 2
## 17 6 6 1678 0 0 6
## 18 10 11 1944 0 1 13
## 19 9 8 1810 0 0 8
## 20 9 6 1439 0 0 10
## 21 6 2 1617 0 0 2
## stumped
## 1 22
## 2 12
## 3 11
## 4 6
## 5 5
## 6 19
## 7 25
## 8 22
## 9 30
## 10 22
## 11 28
## 12 18
## 13 2
## 14 3
## 15 3
## 16 1
## 17 4
## 18 2
## 19 3
## 20 2
## 21 3
# Identify combinations with zero counts.
# contingency_table is wide: one row per Team_Batting and one count column per
# Out_type. Reshape it to one row per (Team_Batting, Out_type) pair and keep
# the pairs whose count is 0.
# The earlier filter_all(all_vars(. == 0)) kept only rows in which EVERY
# column, Team_Batting included, equalled 0, so it returned no rows and hid
# the zero cells that the printed table clearly contains.
missing_combinations <- contingency_table %>%
  pivot_longer(
    cols = -Team_Batting,
    names_to = "Out_type",
    values_to = "n"
  ) %>%
  filter(n == 0) %>%
  select(Team_Batting, Out_type)
print(missing_combinations)
# (Output not pasted here - re-run to regenerate it.)
# Conclusions
# contingency_table has 21 rows, one per Team_Batting value.
# The table printed above does contain zero cells, so some combinations of
# Team_Batting and Out_type never show up. Examples: "hit wicket",
# "obstructing the field" and "retired hurt" are 0 for most teams, and
# "caught and bowled" is 0 for Royal Challengers Bangalore.
# These are rare dismissal types, so a team can plausibly go through all of
# its recorded balls without one occurring.
# NOTE(review): Team_Batting mixes numeric codes (1-13) with full team names,
# which may split one team across two labels and inflate the number of
# missing combinations - confirm and normalise before interpreting.
# Create a contingency table of batting team (rows) against dismissal type
# (columns) with base R.
contingency_table <- table(my_data$Team_Batting, my_data$Out_type)
# Convert the table to a data frame; the team values become the row names and
# the dismissal types become the column names.
contingency_df <- as.data.frame.matrix(contingency_table)
# The row and column names are deliberately NOT reassigned from levels():
# Team_Batting and Out_type are character columns, so levels() returns NULL
# and assigning it would discard the labels used by the plot below.
# Create a grouped bar plot: one group of bars per dismissal type, one bar per
# batting team within each group, with the teams listed in the legend.
barplot(
  as.matrix(contingency_df),
  beside = TRUE,
  col = rainbow(nrow(contingency_df)),
  legend.text = rownames(contingency_df),
  xlab = "Out Type",
  ylab = "Frequency",
  main = "Combinations of Team Batting and Out Type"
)