R Markdown

# Load the ball-by-ball data from a local CSV export.
# NOTE(review): this is an absolute, machine-specific path; the script only
# runs on a machine where this exact file exists.
my_data <- read.csv(file = "C:/Users/dell/Downloads/Ball_By_Ball.csv")
# Per-column overview: quantiles for numeric columns, length/class for
# character columns, and NA counts where present.
summary(object = my_data)
##     MatcH_id          Over_id         Ball_id        Innings_No   
##  Min.   : 335987   Min.   : 1.00   Min.   :1.000   Min.   :1.000  
##  1st Qu.: 419154   1st Qu.: 5.00   1st Qu.:2.000   1st Qu.:1.000  
##  Median : 548382   Median :10.00   Median :4.000   Median :1.000  
##  Mean   : 636208   Mean   :10.14   Mean   :3.617   Mean   :1.482  
##  3rd Qu.: 829742   3rd Qu.:15.00   3rd Qu.:5.000   3rd Qu.:2.000  
##  Max.   :1082650   Max.   :20.00   Max.   :9.000   Max.   :4.000  
##                                                                   
##  Team_Batting       Team_Bowling       Striker_Batting_Position
##  Length:150451      Length:150451      Min.   : 1.000          
##  Class :character   Class :character   1st Qu.: 2.000          
##  Mode  :character   Mode  :character   Median : 3.000          
##                                        Mean   : 3.584          
##                                        3rd Qu.: 5.000          
##                                        Max.   :11.000          
##                                        NA's   :13861           
##   Extra_Type         Runs_Scored      Extra_runs          Wides       
##  Length:150451      Min.   :0.000   Min.   :0.00000   Min.   :0.0000  
##  Class :character   1st Qu.:0.000   1st Qu.:0.00000   1st Qu.:0.0000  
##  Mode  :character   Median :1.000   Median :0.00000   Median :0.0000  
##                     Mean   :1.222   Mean   :0.06899   Mean   :0.0375  
##                     3rd Qu.:1.000   3rd Qu.:0.00000   3rd Qu.:0.0000  
##                     Max.   :6.000   Max.   :5.00000   Max.   :5.0000  
##                                                                       
##     Legbyes             Byes             Noballs           Penalty       
##  Min.   :0.00000   Min.   :0.000000   Min.   :0.00000   Min.   :0.0e+00  
##  1st Qu.:0.00000   1st Qu.:0.000000   1st Qu.:0.00000   1st Qu.:0.0e+00  
##  Median :0.00000   Median :0.000000   Median :0.00000   Median :0.0e+00  
##  Mean   :0.02223   Mean   :0.004885   Mean   :0.00434   Mean   :3.3e-05  
##  3rd Qu.:0.00000   3rd Qu.:0.000000   3rd Qu.:0.00000   3rd Qu.:0.0e+00  
##  Max.   :5.00000   Max.   :4.000000   Max.   :5.00000   Max.   :5.0e+00  
##                                                                          
##  Bowler_Extras       Out_type             Caught            Bowled        
##  Min.   :0.00000   Length:150451      Min.   :0.00000   Min.   :0.000000  
##  1st Qu.:0.00000   Class :character   1st Qu.:0.00000   1st Qu.:0.000000  
##  Median :0.00000   Mode  :character   Median :0.00000   Median :0.000000  
##  Mean   :0.04184                      Mean   :0.02907   Mean   :0.009186  
##  3rd Qu.:0.00000                      3rd Qu.:0.00000   3rd Qu.:0.000000  
##  Max.   :5.00000                      Max.   :1.00000   Max.   :1.000000  
##                                                                           
##     Run_out              LBW            Retired_hurt         Stumped        
##  Min.   :0.000000   Min.   :0.000000   Min.   :0.00e+00   Min.   :0.000000  
##  1st Qu.:0.000000   1st Qu.:0.000000   1st Qu.:0.00e+00   1st Qu.:0.000000  
##  Median :0.000000   Median :0.000000   Median :0.00e+00   Median :0.000000  
##  Mean   :0.005018   Mean   :0.003024   Mean   :5.98e-05   Mean   :0.001615  
##  3rd Qu.:0.000000   3rd Qu.:0.000000   3rd Qu.:0.00e+00   3rd Qu.:0.000000  
##  Max.   :1.000000   Max.   :1.000000   Max.   :1.00e+00   Max.   :1.000000  
##                                                                             
##  caught_and_bowled    hit_wicket       ObstructingFeild  Bowler_Wicket    
##  Min.   :0.000000   Min.   :0.00e+00   Min.   :0.0e+00   Min.   :0.00000  
##  1st Qu.:0.000000   1st Qu.:0.00e+00   1st Qu.:0.0e+00   1st Qu.:0.00000  
##  Median :0.000000   Median :0.00e+00   Median :0.0e+00   Median :0.00000  
##  Mean   :0.001402   Mean   :5.98e-05   Mean   :6.6e-06   Mean   :0.04435  
##  3rd Qu.:0.000000   3rd Qu.:0.00e+00   3rd Qu.:0.0e+00   3rd Qu.:0.00000  
##  Max.   :1.000000   Max.   :1.00e+00   Max.   :1.0e+00   Max.   :1.00000  
##                                                                           
##   Match_Date            Season        Striker       Non_Striker   
##  Length:150451      Min.   :2008   Min.   :  1.0   Min.   :  1.0  
##  Class :character   1st Qu.:2010   1st Qu.: 40.0   1st Qu.: 40.0  
##  Mode  :character   Median :2012   Median : 96.0   Median : 96.0  
##                     Mean   :2012   Mean   :136.5   Mean   :135.6  
##                     3rd Qu.:2015   3rd Qu.:208.0   3rd Qu.:208.0  
##                     Max.   :2017   Max.   :497.0   Max.   :497.0  
##                                                                   
##      Bowler        Player_Out        Fielders      Striker_match_SK
##  Min.   :  1.0   Min.   :  1.0    Min.   :  1.0    Min.   :12694   
##  1st Qu.: 77.0   1st Qu.: 41.0    1st Qu.: 47.0    1st Qu.:16173   
##  Median :174.0   Median :107.0    Median :111.0    Median :19672   
##  Mean   :194.1   Mean   :148.6    Mean   :155.4    Mean   :19675   
##  3rd Qu.:310.0   3rd Qu.:236.0    3rd Qu.:237.5    3rd Qu.:23127   
##  Max.   :497.0   Max.   :497.0    Max.   :497.0    Max.   :26685   
##                  NA's   :143013   NA's   :145100                   
##    StrikerSK     NonStriker_match_SK NONStriker_SK   Fielder_match_SK
##  Min.   :  0.0   Min.   :12694       Min.   :  0.0   Min.   :   -1   
##  1st Qu.: 39.0   1st Qu.:16173       1st Qu.: 39.0   1st Qu.:   -1   
##  Median : 95.0   Median :19672       Median : 95.0   Median :   -1   
##  Mean   :135.5   Mean   :19675       Mean   :134.6   Mean   :  690   
##  3rd Qu.:207.0   3rd Qu.:23127       3rd Qu.:207.0   3rd Qu.:   -1   
##  Max.   :496.0   Max.   :26685       Max.   :496.0   Max.   :26680   
##                                                                      
##    Fielder_SK      Bowler_match_SK   BOWLER_SK     PlayerOut_match_SK
##  Min.   : -1.000   Min.   :12697   Min.   :  0.0   Min.   :   -1.0   
##  1st Qu.: -1.000   1st Qu.:16175   1st Qu.: 76.0   1st Qu.:   -1.0   
##  Median : -1.000   Median :19674   Median :173.0   Median :   -1.0   
##  Mean   :  4.527   Mean   :19677   Mean   :193.1   Mean   :  970.3   
##  3rd Qu.: -1.000   3rd Qu.:23131   3rd Qu.:309.0   3rd Qu.:   -1.0   
##  Max.   :496.000   Max.   :26685   Max.   :496.0   Max.   :26685.0   
##                                                                      
##  BattingTeam_SK   BowlingTeam_SK    Keeper_Catch      Player_out_sk    
##  Min.   : 0.000   Min.   : 0.000   Min.   :0.000000   Min.   : -1.000  
##  1st Qu.: 2.000   1st Qu.: 2.000   1st Qu.:0.000000   1st Qu.:  0.000  
##  Median : 4.000   Median : 4.000   Median :0.000000   Median :  0.000  
##  Mean   : 4.346   Mean   : 4.333   Mean   :0.000432   Mean   :  1.101  
##  3rd Qu.: 6.000   3rd Qu.: 6.000   3rd Qu.:0.000000   3rd Qu.:  0.000  
##  Max.   :12.000   Max.   :12.000   Max.   :1.000000   Max.   :496.000  
##                                                                        
##   MatchDateSK      
##  Min.   :20080418  
##  1st Qu.:20100411  
##  Median :20120520  
##  Mean   :20125288  
##  3rd Qu.:20150420  
##  Max.   :20170521  
## 

Group by:

#Group by ‘Team_Batting’ and summarize ‘Runs_Scored’: groups the data by the ‘Team_Batting’ column and calculates, for each batting team, the total runs, the average runs per row (delivery), and the maximum runs scored off a single delivery.

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Per-team batting summary: total, mean and maximum of Runs_Scored over all
# rows that share a Team_Batting value.
# NOTE(review): Team_Batting is read as a character column; confirm every team
# is encoded consistently before comparing groups.
summary1 <- summarise(
  group_by(my_data, Team_Batting),
  Total_Runs = sum(Runs_Scored),
  Average_Runs = mean(Runs_Scored),
  Max_Runs = max(Runs_Scored),
  .groups = "drop"
)

# Show the summary table
print(summary1)
## # A tibble: 21 × 4
##    Team_Batting Total_Runs Average_Runs Max_Runs
##    <chr>             <int>        <dbl>    <int>
##  1 1                 18267         1.18        6
##  2 10                 6040         1.11        6
##  3 11                 8806         1.19        6
##  4 12                 1962         1.24        6
##  5 13                 2318         1.23        6
##  6 2                 20470         1.27        6
##  7 3                 19895         1.26        6
##  8 4                 19720         1.23        6
##  9 5                 16703         1.21        6
## 10 6                 18595         1.20        6
## # ℹ 11 more rows

#Groups the data by the ‘Innings_No’ column and calculates the total wides and total no-balls in each inning.

library(dplyr)
# Total wides and no-balls per innings number.
# across() applies sum() to both columns; .names yields the output columns
# Total_Wides and Total_Noballs, in that order.
summary2 <- my_data %>%
  group_by(Innings_No) %>%
  summarise(
    across(c(Wides, Noballs), sum, .names = "Total_{.col}"),
    .groups = "drop"
  )

# Show the summary table
print(summary2)
## # A tibble: 4 × 3
##   Innings_No Total_Wides Total_Noballs
##        <int>       <int>         <int>
## 1          1        2904           342
## 2          2        2736           307
## 3          3           1             2
## 4          4           1             2

#Groups the data by the ‘Season’ and ‘Team_Batting’ columns and calculates the total runs scored and the total extra runs for each season–team combination.

library(dplyr)

# Runs off the bat and extra runs, totalled for every Season x Team_Batting
# combination. .groups = "drop" returns an ungrouped tibble.
summary3 <- summarise(
  group_by(my_data, Season, Team_Batting),
  Total_Runs = sum(Runs_Scored),
  Total_Extra_Runs = sum(Extra_runs),
  .groups = "drop"
)

# Show the summary table
print(summary3)
## # A tibble: 84 × 4
##    Season Team_Batting Total_Runs Total_Extra_Runs
##     <int> <chr>             <int>            <int>
##  1   2008 1                  1795              141
##  2   2008 2                  1865              118
##  3   2008 3                  2365              155
##  4   2008 4                  2340              124
##  5   2008 5                  2420              177
##  6   2008 6                  2000              118
##  7   2008 7                  1897              183
##  8   2008 8                  2117              112
##  9   2009 1                  1645               90
## 10   2009 2                  2166              114
## # ℹ 74 more rows

#Grouping Seasons and Team_Batting together

library(dplyr)

# Share of all runs contributed by each Season x Team_Batting group.
# "Probability" is the group's Total_Runs divided by the grand total of
# Total_Runs, so the column sums to 1 across groups.
result <- my_data %>%
  group_by(Season, Team_Batting) %>%
  summarise(Total_Runs = sum(Runs_Scored), .groups = "drop") %>%
  mutate(Probability = Total_Runs / sum(Total_Runs))

# View() opens a data viewer and is only meaningful in an interactive session;
# guard it so knitting / Rscript runs are not interrupted.
if (interactive()) {
  View(result)
}

# Group(s) with the smallest share of runs (ties are all returned)
lowest_probability_group <- result %>%
  filter(Probability == min(Probability))

# Tag the lowest-share group(s) as "Anomaly" and everything else as "Normal"
result <- result %>%
  mutate(Tag = ifelse(Probability == min(Probability), "Anomaly", "Normal"))

# Attach Total_Runs, Probability and Tag to every row of the original data
data_with_tags <- left_join(my_data, result, by = c("Season", "Team_Batting"))


# Bar chart: number of rows per Team_Batting value, with bars filled by the
# Normal / Anomaly tag of the row's Season x Team_Batting group.
library(ggplot2)
ggplot(data = data_with_tags, mapping = aes(x = Team_Batting, fill = Tag)) +
  geom_bar() +
  scale_fill_manual(values = c(Normal = "blue", Anomaly = "red")) +
  ggtitle("Rarity of Batting Teams") +
  xlab("Team Batting") +
  ylab("Frequency") +
  theme_minimal()

#Grouping by Innings_No

library(dplyr)

# Share of all runs contributed by each Season x Innings_No group
# (group total divided by the grand total of Total_Runs).
innings_result <- my_data |>
  group_by(Season, Innings_No) |>
  summarise(Total_Runs = sum(Runs_Scored), .groups = "drop") |>
  mutate(Probability = Total_Runs / sum(Total_Runs))

# Group(s) holding the smallest share of runs
lowest_probability_inning <- innings_result |>
  filter(Probability == min(Probability))

# Mark the smallest-share group(s) "Anomaly"; every other group is "Normal"
innings_result <- innings_result |>
  mutate(Tag = ifelse(Probability == min(Probability), "Anomaly", "Normal"))

# Carry Total_Runs, Probability and Tag back onto each row of the raw data
data_with_inning_tags <- left_join(
  my_data,
  innings_result,
  by = c("Season", "Innings_No")
)

print(head(data_with_inning_tags))
##   MatcH_id Over_id Ball_id Innings_No Team_Batting Team_Bowling
## 1   598028      15       6          1            5            2
## 2   598028      14       1          1            5            2
## 3   598028      14       2          1            5            2
## 4   598028      14       3          1            5            2
## 5   598028      14       4          1            5            2
## 6   598028      14       5          1            5            2
##   Striker_Batting_Position Extra_Type Runs_Scored Extra_runs Wides Legbyes Byes
## 1                        6  No Extras           4          0     0       0    0
## 2                        5  No Extras           1          0     0       0    0
## 3                        3  No Extras           1          0     0       0    0
## 4                        5  No Extras           1          0     0       0    0
## 5                        3  No Extras           0          0     0       0    0
## 6                        3  No Extras           4          0     0       0    0
##   Noballs Penalty Bowler_Extras       Out_type Caught Bowled Run_out LBW
## 1       0       0             0 Not Applicable      0      0       0   0
## 2       0       0             0 Not Applicable      0      0       0   0
## 3       0       0             0 Not Applicable      0      0       0   0
## 4       0       0             0 Not Applicable      0      0       0   0
## 5       0       0             0 Not Applicable      0      0       0   0
## 6       0       0             0 Not Applicable      0      0       0   0
##   Retired_hurt Stumped caught_and_bowled hit_wicket ObstructingFeild
## 1            0       0                 0          0                0
## 2            0       0                 0          0                0
## 3            0       0                 0          0                0
## 4            0       0                 0          0                0
## 5            0       0                 0          0                0
## 6            0       0                 0          0                0
##   Bowler_Wicket Match_Date Season Striker Non_Striker Bowler Player_Out
## 1             0  4/20/2013   2013     277         104     83         NA
## 2             0  4/20/2013   2013     104           6    346         NA
## 3             0  4/20/2013   2013       6         104    346         NA
## 4             0  4/20/2013   2013     104           6    346         NA
## 5             0  4/20/2013   2013       6         104    346         NA
## 6             0  4/20/2013   2013       6         104    346         NA
##   Fielders Striker_match_SK StrikerSK NonStriker_match_SK NONStriker_SK
## 1       NA            20336       276               20333           103
## 2       NA            20333       103               20328             5
## 3       NA            20328         5               20333           103
## 4       NA            20333       103               20328             5
## 5       NA            20328         5               20333           103
## 6       NA            20328         5               20333           103
##   Fielder_match_SK Fielder_SK Bowler_match_SK BOWLER_SK PlayerOut_match_SK
## 1               -1         -1           20343        82                 -1
## 2               -1         -1           20348       345                 -1
## 3               -1         -1           20348       345                 -1
## 4               -1         -1           20348       345                 -1
## 5               -1         -1           20348       345                 -1
## 6               -1         -1           20348       345                 -1
##   BattingTeam_SK BowlingTeam_SK Keeper_Catch Player_out_sk MatchDateSK
## 1              4              1            0             0    20130420
## 2              4              1            0             0    20130420
## 3              4              1            0             0    20130420
## 4              4              1            0             0    20130420
## 5              4              1            0             0    20130420
## 6              4              1            0             0    20130420
##   Total_Runs Probability    Tag
## 1      11282   0.0613549 Normal
## 2      11282   0.0613549 Normal
## 3      11282   0.0613549 Normal
## 4      11282   0.0613549 Normal
## 5      11282   0.0613549 Normal
## 6      11282   0.0613549 Normal
# Visualization:
# - Bar chart: number of rows per Innings_No, with bars filled by the
#   Normal / Anomaly tag of the row's Season x Innings_No group.
library(ggplot2)
ggplot(data = data_with_inning_tags, mapping = aes(x = Innings_No, fill = Tag)) +
  geom_bar() +
  scale_fill_manual(values = c(Normal = "blue", Anomaly = "red")) +
  ggtitle("Rarity of Innings") +
  xlab("Innings_No") +
  ylab("Frequency") +
  theme_minimal()

Violin plot to visualize the rarity of innings

library(ggplot2)

# Violin plot: distribution of the per-row Probability values, split by Tag.
ggplot(
  data = data_with_inning_tags,
  mapping = aes(x = Tag, y = Probability, fill = Tag)
) +
  geom_violin() +
  scale_fill_manual(values = c(Normal = "blue", Anomaly = "red")) +
  ggtitle("Rarity of Innings (Violin Plot)") +
  xlab("Tag") +
  ylab("Probability") +
  theme_minimal()

## Scatterplot to visualize the rarity of innings

library(ggplot2)

# Scatterplot of each Season x Innings_No group's run share against the
# innings number, labelled with its Normal / Anomaly tag.
# Plot the summarised table (one row per group) rather than the per-delivery
# join: the joined frame repeats the same Innings_No / Probability / Tag values
# on every delivery (~150k rows), which would draw thousands of identical
# points and text labels on top of each other.
ggplot(innings_result, aes(x = Innings_No, y = Probability, color = Tag)) +
  geom_point() +
  labs(title = "Rarity of Innings (Scatterplot)", x = "Innings_No", y = "Probability") +
  theme_minimal() +
  scale_color_manual(values = c("Normal" = "blue", "Anomaly" = "red")) +
  geom_text(aes(label = Tag), nudge_y = 0.02, show.legend = FALSE)  # Annotate points with "Tag"

Box Plot to visualize the rarity of innings

Time Series Plot to visualize the change in rarity over time

# Box plot: spread of the per-row Probability values for the "Normal" and
# "Anomaly" tags, drawn side by side.
ggplot(
  data = data_with_inning_tags,
  mapping = aes(x = Tag, y = Probability, fill = Tag)
) +
  geom_boxplot() +
  scale_fill_manual(values = c(Normal = "yellow", Anomaly = "red")) +
  ggtitle("Rarity of Innings (Box Plot)") +
  xlab("Tag") +
  ylab("Probability") +
  theme_minimal()

# Time series: run share of each innings number across seasons.
# Uses the summarised table (one row per Season x Innings_No) and draws one
# line per Innings_No. Grouping lines by Tag on the per-delivery data would
# chain different innings of the same season into a single zig-zag path and
# redraw every group once per delivery.
ggplot(innings_result, aes(x = Season, y = Probability, color = Tag)) +
  geom_line(aes(group = Innings_No)) +
  labs(title = "Change in Rarity of Innings Over Time", x = "Season", y = "Probability") +
  theme_minimal() +
  scale_color_manual(values = c("Normal" = "orange", "Anomaly" = "red")) +
  geom_point()  # Add a point for each Season x Innings_No group

#Pick 2-3 categorical variables.

#These are two categorical variables for which we know all possible values:

#Team_Batting: This variable represents the batting team. Its possible values are the unique batting-team entries in the dataset.

#Out_type: This variable represents the type of dismissal (e.g., "caught", "bowled", "run out"). Its possible values are the unique dismissal types in the dataset.

#Which combinations never show up? Why might that be?
# To identify combinations that never show up and understand why that might be the case, you can perform a cross-tabulation (contingency table) analysis of the two categorical variables: Team_Batting and Out_type. This will allow you to see which combinations are missing and potentially provide insights into why they are absent.
library(dplyr)
library(tidyr)

# Wide contingency table: one row per Team_Batting value, one column per
# Out_type value, cells holding the number of rows for that pair.
# count() omits pairs that never occur, so values_fill = 0L writes an explicit
# zero for them. pivot_wider() replaces the superseded spread();
# names_sort = TRUE orders the Out_type columns by name.
contingency_table <- my_data %>%
  count(Team_Batting, Out_type) %>%
  pivot_wider(
    names_from = Out_type,
    values_from = n,
    values_fill = 0L,
    names_sort = TRUE
  ) %>%
  as.data.frame()  # plain data frame so print() shows every row
print(contingency_table)
##                   Team_Batting bowled caught caught and bowled hit wicket
## 1                            1    134    356                17          0
## 2                           10     65    139                11          0
## 3                           11     63    180                 8          3
## 4                           12     10     29                 2          0
## 5                           13     24     45                 3          0
## 6                            2    168    360                19          2
## 7                            3    105    369                24          1
## 8                            4    149    417                24          0
## 9                            5    137    314                24          1
## 10                           6    136    411                17          0
## 11                           7    149    422                23          1
## 12                           8     91    247                13          0
## 13                           9     20     35                 2          0
## 14            Delhi Daredevils     20     42                 5          0
## 15               Gujarat Lions     17     42                 3          0
## 16             Kings XI Punjab     19     42                 2          0
## 17       Kolkata Knight Riders     12     50                 3          1
## 18              Mumbai Indians     12     56                 3          0
## 19     Rising Pune Supergiants     19     39                 4          0
## 20 Royal Challengers Bangalore     21     49                 0          0
## 21         Sunrisers Hyderabad     11     34                 4          0
##    Keeper Catch lbw Not Applicable obstructing the field retired hurt run out
## 1            68  55          14680                     1            0      83
## 2            24  16           5145                     0            0      31
## 3            35  16           7020                     0            0      43
## 4            10   1           1512                     0            1       9
## 5            11   5           1789                     0            0       9
## 6            80  53          15370                     0            0      69
## 7            58  39          15129                     0            0      71
## 8            79  48          15159                     0            0      93
## 9            71  31          13159                     0            0      78
## 10           71  44          14707                     0            0      73
## 11           76  57          16148                     0            6      78
## 12           38  27           8549                     0            1      49
## 13            9   7           1496                     0            0      11
## 14           11   7           1553                     0            0       9
## 15            5   8           1589                     0            0       8
## 16            9   8           1520                     0            0       2
## 17            6   6           1678                     0            0       6
## 18           10  11           1944                     0            1      13
## 19            9   8           1810                     0            0       8
## 20            9   6           1439                     0            0      10
## 21            6   2           1617                     0            0       2
##    stumped
## 1       22
## 2       12
## 3       11
## 4        6
## 5        5
## 6       19
## 7       25
## 8       22
## 9       30
## 10      22
## 11      28
## 12      18
## 13       2
## 14       3
## 15       3
## 16       1
## 17       4
## 18       2
## 19       3
## 20       2
## 21       3
# Identify Team_Batting x Out_type combinations that never occur.
# Reshape the wide count table back to one row per (team, dismissal type) cell
# and keep the cells whose count is 0. A row-wise "every column == 0" test
# cannot find these: it also compares the character Team_Batting column and
# requires all dismissal counts in a row to be zero at once.
missing_combinations <- contingency_table %>%
  pivot_longer(
    cols = -Team_Batting,
    names_to = "Out_type",
    values_to = "n"
  ) %>%
  filter(n == 0) %>%
  select(Team_Batting, Out_type)
print(missing_combinations)
##  [1] Team_Batting          bowled                caught               
##  [4] caught and bowled     hit wicket            Keeper Catch         
##  [7] lbw                   Not Applicable        obstructing the field
## [10] retired hurt          run out               stumped              
## <0 rows> (or 0-length row.names)

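#conclusions

#There are 21 rows in contingency_table (one per Team_Batting value), and a check for rows in which every column equals 0 returns zero rows.
# That check only shows that no team has an entirely empty row; it does not show that every combination of Team_Batting and Out_type occurs.
#The contingency table itself contains cells with a count of 0 (for example, many teams have 0 "hit wicket", "obstructing the field", or "retired hurt" dismissals), so some combinations are never observed in the dataset, most likely because those dismissal types are very rare.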
# Cross-tabulate batting team (rows) against dismissal type (columns)
contingency_table <- table(my_data$Team_Batting, my_data$Out_type)

# Convert the table to a wide data frame; the table's row and column labels
# are carried over as the data frame's row names and column names
contingency_df <- as.data.frame.matrix(contingency_table)

# No relabelling is needed. Team_Batting and Out_type are character columns,
# so levels() on them returns NULL, and assigning NULL to the row/column names
# would discard the team and dismissal labels used by the plot below.

# Grouped bar plot: one cluster per dismissal type, one bar per batting team
barplot(as.matrix(contingency_df),
        beside = TRUE,
        col = rainbow(nrow(contingency_df)),
        legend.text = rownames(contingency_df),
        xlab = "Out Type",
        ylab = "Frequency",
        main = "Combinations of Team Batting and Out Type")