# Load the matches dataset into the session; text columns are read as factors.

data <- read.csv(
  file = "~/Documents/Rdocs/matches.csv",
  stringsAsFactors = TRUE
)

# Load the libraries used throughout the analysis.

library(dplyr)         
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ readr     2.1.5
## ✔ ggplot2   3.5.1     ✔ stringr   1.5.1
## ✔ lubridate 1.9.3     ✔ tibble    3.2.1
## ✔ purrr     1.0.2     ✔ tidyr     1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

# Preview the dataset: head() prints the first six rows across all columns.

head(data, n = 6L)
##       id  season       city       date match_type player_of_match
## 1 335982 2007/08  Bangalore 2008-04-18     League     BB McCullum
## 2 335983 2007/08 Chandigarh 2008-04-19     League      MEK Hussey
## 3 335984 2007/08      Delhi 2008-04-19     League     MF Maharoof
## 4 335985 2007/08     Mumbai 2008-04-20     League      MV Boucher
## 5 335986 2007/08    Kolkata 2008-04-20     League       DJ Hussey
## 6 335987 2007/08     Jaipur 2008-04-21     League       SR Watson
##                                        venue                       team1
## 1                      M Chinnaswamy Stadium Royal Challengers Bangalore
## 2 Punjab Cricket Association Stadium, Mohali             Kings XI Punjab
## 3                           Feroz Shah Kotla            Delhi Daredevils
## 4                           Wankhede Stadium              Mumbai Indians
## 5                               Eden Gardens       Kolkata Knight Riders
## 6                     Sawai Mansingh Stadium            Rajasthan Royals
##                         team2                 toss_winner toss_decision
## 1       Kolkata Knight Riders Royal Challengers Bangalore         field
## 2         Chennai Super Kings         Chennai Super Kings           bat
## 3            Rajasthan Royals            Rajasthan Royals           bat
## 4 Royal Challengers Bangalore              Mumbai Indians           bat
## 5             Deccan Chargers             Deccan Chargers           bat
## 6             Kings XI Punjab             Kings XI Punjab           bat
##                        winner  result result_margin target_runs target_overs
## 1       Kolkata Knight Riders    runs           140         223           20
## 2         Chennai Super Kings    runs            33         241           20
## 3            Delhi Daredevils wickets             9         130           20
## 4 Royal Challengers Bangalore wickets             5         166           20
## 5       Kolkata Knight Riders wickets             5         111           20
## 6            Rajasthan Royals wickets             6         167           20
##   super_over method   umpire1        umpire2
## 1          N   <NA> Asad Rauf    RE Koertzen
## 2          N   <NA> MR Benson     SL Shastri
## 3          N   <NA> Aleem Dar GA Pratapkumar
## 4          N   <NA>  SJ Davis      DJ Harper
## 5          N   <NA> BF Bowden    K Hariharan
## 6          N   <NA> Aleem Dar      RB Tiffin

# summary() gives a per-column overview: numeric columns get the minimum,
# quartiles, median, mean, maximum and NA count; factor columns get the counts
# of their most frequent levels.

summary(object = data)
##        id              season           city             date     
##  Min.   : 335982   2013   : 76   Mumbai   :173   2008-04-19:   2  
##  1st Qu.: 548332   2012   : 74   Kolkata  : 93   2008-04-20:   2  
##  Median : 980961   2022   : 74   Delhi    : 90   2008-04-26:   2  
##  Mean   : 904828   2023   : 74   Chennai  : 85   2008-04-27:   2  
##  3rd Qu.:1254062   2011   : 73   Hyderabad: 77   2008-05-01:   2  
##  Max.   :1426312   2024   : 71   (Other)  :526   2008-05-03:   2  
##                    (Other):653   NA's     : 51   (Other)   :1083  
##        match_type         player_of_match
##  League     :1029   AB de Villiers: 25   
##  Final      :  17   CH Gayle      : 22   
##  Qualifier 1:  14   RG Sharma     : 19   
##  Qualifier 2:  14   DA Warner     : 18   
##  Eliminator :  11   V Kohli       : 18   
##  Semi Final :   6   (Other)       :988   
##  (Other)    :   4   NA's          :  5   
##                                        venue    
##  Eden Gardens                             : 77  
##  Wankhede Stadium                         : 73  
##  M Chinnaswamy Stadium                    : 65  
##  Feroz Shah Kotla                         : 60  
##  Rajiv Gandhi International Stadium, Uppal: 49  
##  MA Chidambaram Stadium, Chepauk          : 48  
##  (Other)                                  :723  
##                          team1                             team2    
##  Royal Challengers Bangalore:135   Mumbai Indians             :138  
##  Chennai Super Kings        :128   Kolkata Knight Riders      :130  
##  Mumbai Indians             :123   Rajasthan Royals           :120  
##  Kolkata Knight Riders      :121   Chennai Super Kings        :110  
##  Rajasthan Royals           :101   Royal Challengers Bangalore:105  
##  Kings XI Punjab            : 92   Kings XI Punjab            : 98  
##  (Other)                    :395   (Other)                    :394  
##                       toss_winner  toss_decision
##  Mumbai Indians             :143   bat  :391    
##  Chennai Super Kings        :122   field:704    
##  Kolkata Knight Riders      :122                
##  Rajasthan Royals           :120                
##  Royal Challengers Bangalore:113                
##  Sunrisers Hyderabad        : 88                
##  (Other)                    :387                
##                          winner          result    result_margin   
##  Mumbai Indians             :144   no result:  5   Min.   :  1.00  
##  Chennai Super Kings        :138   runs     :498   1st Qu.:  6.00  
##  Kolkata Knight Riders      :131   tie      : 14   Median :  8.00  
##  Royal Challengers Bangalore:116   wickets  :578   Mean   : 17.26  
##  Rajasthan Royals           :112                   3rd Qu.: 20.00  
##  (Other)                    :449                   Max.   :146.00  
##  NA's                       :  5                   NA's   :19      
##   target_runs     target_overs   super_over  method    
##  Min.   : 43.0   Min.   : 5.00   N:1081     D/L :  21  
##  1st Qu.:146.0   1st Qu.:20.00   Y:  14     NA's:1074  
##  Median :166.0   Median :20.00                         
##  Mean   :165.7   Mean   :19.76                         
##  3rd Qu.:187.0   3rd Qu.:20.00                         
##  Max.   :288.0   Max.   :20.00                         
##  NA's   :3       NA's   :3                             
##                   umpire1             umpire2   
##  AK Chaudhary         :115   S Ravi       : 83  
##  HDPK Dharmasena      : 79   VK Sharma    : 61  
##  KN Ananthapadmanabhan: 59   C Shamshuddin: 60  
##  CB Gaffaney          : 53   Nitin Menon  : 54  
##  Asad Rauf            : 51   SJA Taufel   : 54  
##  Nitin Menon          : 50   RJ Tucker    : 53  
##  (Other)              :688   (Other)      :730

# 1.a) Categorical summary: take the categorical columns `city` and `winner`
# and record, for each, its distinct values and how often each level occurs.

cat_summary <- data %>%
  summarise(
    # `city` and `winner` are factors (read with stringsAsFactors = TRUE).
    # Convert the distinct values to character; otherwise the list column
    # prints the factor's integer level codes instead of the names.
    city_Values = list(as.character(unique(city))),
    # table() counts per factor level, in level order, and leaves NA out,
    # so it can hold one entry fewer than the distinct values above.
    city_Counts = list(table(city)),

    # Same treatment for the 'winner' column.
    winner_Values = list(as.character(unique(winner))),
    winner_Counts = list(table(winner))
  )
print("Categorical Summary:")          # To print the text
## [1] "Categorical Summary:"
print(cat_summary)                     # To print the variable
##                                                                                                                                 city_Values
## 1 3, 8, 11, 27, 24, 19, 17, 9, 6, 30, 14, 7, 15, 20, 22, 5, 2, 10, 28, 12, 23, 18, 36, 31, 32, 34, 1, NA, 33, 21, 4, 13, 35, 29, 25, 16, 26
##                                                                                                                       city_Counts
## 1 37, 36, 65, 29, 2, 7, 12, 61, 85, 7, 90, 13, 13, 15, 3, 3, 77, 9, 57, 8, 4, 3, 5, 93, 14, 5, 173, 3, 9, 7, 51, 6, 10, 7, 10, 15
##                                                           winner_Values
## 1 9, 1, 4, 17, 14, 7, 2, 11, 12, 8, NA, 19, 16, 5, 15, 3, 13, 6, 10, 18
##                                                                  winner_Counts
## 1 138, 29, 48, 67, 13, 28, 88, 6, 131, 24, 144, 12, 24, 112, 10, 5, 116, 7, 88

# 1.b) Numeric summary: describe `result_margin` and `target_runs` with the
# same set of statistics (min, max, mean, median, SD and both quartiles),
# skipping missing values.
# NOTE(review): `result_margin` appears to pool run margins and wicket margins
# (see the `result` column) — confirm that summarising them together is
# intended.

numeric_summary <- data %>%
  summarise(
    across(
      c(result_margin, target_runs),
      list(
        Min = function(v) min(v, na.rm = TRUE),
        Max = function(v) max(v, na.rm = TRUE),
        Mean = function(v) mean(v, na.rm = TRUE),
        Median = function(v) median(v, na.rm = TRUE),
        SD = function(v) sd(v, na.rm = TRUE),
        Q1 = function(v) quantile(v, 0.25, na.rm = TRUE),
        Q3 = function(v) quantile(v, 0.75, na.rm = TRUE)
      ),
      # Produces result_margin_Min, ..., target_runs_Q3.
      .names = "{.col}_{.fn}"
    )
  )

print("Numeric_Summary:")
## [1] "Numeric_Summary:"
print(numeric_summary)
##   result_margin_Min result_margin_Max result_margin_Mean result_margin_Median
## 1                 1               146           17.25929                    8
##   result_margin_SD result_margin_Q1 result_margin_Q3 target_runs_Min
## 1         21.78744                6               20              43
##   target_runs_Max target_runs_Mean target_runs_Median target_runs_SD
## 1             288         165.6841                166       33.42705
##   target_runs_Q1 target_runs_Q3
## 1            146            187

# 1.c) Combined summary: bundle the categorical and numeric summaries into a
# single named list.

combined_summary <- list(
  Categorical = cat_summary,
  Numeric = numeric_summary
)
print("Combined Summary:")
## [1] "Combined Summary:"
print(combined_summary)
## $Categorical
##                                                                                                                                 city_Values
## 1 3, 8, 11, 27, 24, 19, 17, 9, 6, 30, 14, 7, 15, 20, 22, 5, 2, 10, 28, 12, 23, 18, 36, 31, 32, 34, 1, NA, 33, 21, 4, 13, 35, 29, 25, 16, 26
##                                                                                                                       city_Counts
## 1 37, 36, 65, 29, 2, 7, 12, 61, 85, 7, 90, 13, 13, 15, 3, 3, 77, 9, 57, 8, 4, 3, 5, 93, 14, 5, 173, 3, 9, 7, 51, 6, 10, 7, 10, 15
##                                                           winner_Values
## 1 9, 1, 4, 17, 14, 7, 2, 11, 12, 8, NA, 19, 16, 5, 15, 3, 13, 6, 10, 18
##                                                                  winner_Counts
## 1 138, 29, 48, 67, 13, 28, 88, 6, 131, 24, 144, 12, 24, 112, 10, 5, 116, 7, 88
## 
## $Numeric
##   result_margin_Min result_margin_Max result_margin_Mean result_margin_Median
## 1                 1               146           17.25929                    8
##   result_margin_SD result_margin_Q1 result_margin_Q3 target_runs_Min
## 1         21.78744                6               20              43
##   target_runs_Max target_runs_Mean target_runs_Median target_runs_SD
## 1             288         165.6841                166       33.42705
##   target_runs_Q1 target_runs_Q3
## 1            146            187

# Some columns hold values in inconsistent formats, so it is unclear which value to use, and this may introduce errors in calculations. For example, for the first 59 rows the season is recorded as 2007/08, so we don't know which year to consider.

# 2.a) In the numeric summary, what exactly is the target-runs requirement to win a match? The mean of target_runs does not always give an exact value, because some matches have no result.
# 2.b) What is the result_margin for matches that ended in a tie? The data documentation fails to mention that some columns contain NULL (missing) values.
# 2.c) My project's goal is to mutate the dataset as required and to gain insights into the IPL match summaries from 2008 to 2024. For missing values in the dataset, we can use the is.na() function to print the missing values, detect the entities that are less important to our dataset, and clean them to get a tidy version of the dataset.

# 3. Aggregation: mean `result_margin` for each `toss_winner`, ignoring NAs.
# Differently spelled team names (e.g. "Rising Pune Supergiant" and
# "Rising Pune Supergiants") are distinct factor levels and therefore form
# separate groups.

Aggr_mean <- summarise(
  group_by(data, toss_winner),
  Mean = mean(result_margin, na.rm = TRUE)
)
print("Aggragate Mean of toss winner by result margin")
## [1] "Aggragate Mean of toss winner by result margin"
print(Aggr_mean)
## # A tibble: 19 × 2
##    toss_winner                  Mean
##    <fct>                       <dbl>
##  1 Chennai Super Kings          18.6
##  2 Deccan Chargers              14.2
##  3 Delhi Capitals               20.2
##  4 Delhi Daredevils             18.7
##  5 Gujarat Lions                19  
##  6 Gujarat Titans               12.2
##  7 Kings XI Punjab              17.3
##  8 Kochi Tuskers Kerala         13  
##  9 Kolkata Knight Riders        14.7
## 10 Lucknow Super Giants         17.3
## 11 Mumbai Indians               18.2
## 12 Pune Warriors                21.8
## 13 Punjab Kings                 12.9
## 14 Rajasthan Royals             15.5
## 15 Rising Pune Supergiant       21.3
## 16 Rising Pune Supergiants      13.4
## 17 Royal Challengers Bangalore  21.6
## 18 Royal Challengers Bengaluru  14  
## 19 Sunrisers Hyderabad          14.7

# 4. Visual summaries (i.e., visualizations) of two or more columns of the data.
# First plot: distribution of target runs across matches, in 5-run bins.

ggplot(data = data, mapping = aes(x = target_runs)) +
  geom_histogram(binwidth = 5, colour = "black", fill = "red") +
  labs(
    x = "target runs",
    y = "Frequency of runs",
    title = "Frequency of target runs distribution"
  )
## Warning: Removed 3 rows containing non-finite outside the scale range
## (`stat_bin()`).

# From the histogram, we can say that target runs most often fall in the range of 150 to 170.

# Distribution of winners by season.
# `winner` and `season` are both factors, so a box plot of one against the
# other has no numeric variable to summarise. Count the wins for each
# season/team pair instead and show the counts as a labelled heat map, which
# keeps the same axes (winner on x, season on y).

data %>%
  filter(!is.na(winner)) %>%   # rows without a winner have no win to count
  count(season, winner, name = "wins") %>%
  ggplot(aes(x = winner, y = season, fill = wins)) +
  geom_tile(color = "white") +
  geom_text(aes(label = wins), size = 3) +
  # Light-to-dark scale so the printed counts stay legible on every tile.
  scale_fill_gradient(low = "white", high = "steelblue") +
  labs(title = "Wins per team by season",
       x = "Winner", y = "Season", fill = "Wins") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

# From this visualization, we can tell the number of times a team has won in
# a particular season.
# Next plot: distribution of a categorical variable (winner) against a
# continuous one (result_margin), one point per match.

ggplot(
  data = data,
  mapping = aes(x = winner, y = result_margin, colour = winner)
) +
  geom_point(size = 2) +
  labs(
    x = "winner",
    y = "result margin",
    title = "Scatter Plot of winner by result margin"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(hjust = 1, angle = 90))
## Warning: Removed 19 rows containing missing values or values outside the scale range
## (`geom_point()`).

# From this visualization, we can say that matches won by LSG and RCB tend to have the highest result margins. # With these visualizations, we have met our requirements.