# Load the matches CSV into `data`; text columns are read in as factors.
matches_path <- "~/Documents/Rdocs/matches.csv"
data <- read.csv(file = matches_path, stringsAsFactors = TRUE)
#import most used libraries.
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ readr 2.1.5
## ✔ ggplot2 3.5.1 ✔ stringr 1.5.1
## ✔ lubridate 1.9.3 ✔ tibble 3.2.1
## ✔ purrr 1.0.2 ✔ tidyr 1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Preview the dataset: head() prints the first six rows across all columns.
head(data, n = 6L)
## id season city date match_type player_of_match
## 1 335982 2007/08 Bangalore 2008-04-18 League BB McCullum
## 2 335983 2007/08 Chandigarh 2008-04-19 League MEK Hussey
## 3 335984 2007/08 Delhi 2008-04-19 League MF Maharoof
## 4 335985 2007/08 Mumbai 2008-04-20 League MV Boucher
## 5 335986 2007/08 Kolkata 2008-04-20 League DJ Hussey
## 6 335987 2007/08 Jaipur 2008-04-21 League SR Watson
## venue team1
## 1 M Chinnaswamy Stadium Royal Challengers Bangalore
## 2 Punjab Cricket Association Stadium, Mohali Kings XI Punjab
## 3 Feroz Shah Kotla Delhi Daredevils
## 4 Wankhede Stadium Mumbai Indians
## 5 Eden Gardens Kolkata Knight Riders
## 6 Sawai Mansingh Stadium Rajasthan Royals
## team2 toss_winner toss_decision
## 1 Kolkata Knight Riders Royal Challengers Bangalore field
## 2 Chennai Super Kings Chennai Super Kings bat
## 3 Rajasthan Royals Rajasthan Royals bat
## 4 Royal Challengers Bangalore Mumbai Indians bat
## 5 Deccan Chargers Deccan Chargers bat
## 6 Kings XI Punjab Kings XI Punjab bat
## winner result result_margin target_runs target_overs
## 1 Kolkata Knight Riders runs 140 223 20
## 2 Chennai Super Kings runs 33 241 20
## 3 Delhi Daredevils wickets 9 130 20
## 4 Royal Challengers Bangalore wickets 5 166 20
## 5 Kolkata Knight Riders wickets 5 111 20
## 6 Rajasthan Royals wickets 6 167 20
## super_over method umpire1 umpire2
## 1 N <NA> Asad Rauf RE Koertzen
## 2 N <NA> MR Benson SL Shastri
## 3 N <NA> Aleem Dar GA Pratapkumar
## 4 N <NA> SJ Davis DJ Harper
## 5 N <NA> BF Bowden K Hariharan
## 6 N <NA> Aleem Dar RB Tiffin
# summary() describes every column of the data frame. Numeric columns report
# min, 1st quartile, median, mean, 3rd quartile and max (plus the NA count);
# factor columns report the counts of their most frequent levels.
summary(object = data)
## id season city date
## Min. : 335982 2013 : 76 Mumbai :173 2008-04-19: 2
## 1st Qu.: 548332 2012 : 74 Kolkata : 93 2008-04-20: 2
## Median : 980961 2022 : 74 Delhi : 90 2008-04-26: 2
## Mean : 904828 2023 : 74 Chennai : 85 2008-04-27: 2
## 3rd Qu.:1254062 2011 : 73 Hyderabad: 77 2008-05-01: 2
## Max. :1426312 2024 : 71 (Other) :526 2008-05-03: 2
## (Other):653 NA's : 51 (Other) :1083
## match_type player_of_match
## League :1029 AB de Villiers: 25
## Final : 17 CH Gayle : 22
## Qualifier 1: 14 RG Sharma : 19
## Qualifier 2: 14 DA Warner : 18
## Eliminator : 11 V Kohli : 18
## Semi Final : 6 (Other) :988
## (Other) : 4 NA's : 5
## venue
## Eden Gardens : 77
## Wankhede Stadium : 73
## M Chinnaswamy Stadium : 65
## Feroz Shah Kotla : 60
## Rajiv Gandhi International Stadium, Uppal: 49
## MA Chidambaram Stadium, Chepauk : 48
## (Other) :723
## team1 team2
## Royal Challengers Bangalore:135 Mumbai Indians :138
## Chennai Super Kings :128 Kolkata Knight Riders :130
## Mumbai Indians :123 Rajasthan Royals :120
## Kolkata Knight Riders :121 Chennai Super Kings :110
## Rajasthan Royals :101 Royal Challengers Bangalore:105
## Kings XI Punjab : 92 Kings XI Punjab : 98
## (Other) :395 (Other) :394
## toss_winner toss_decision
## Mumbai Indians :143 bat :391
## Chennai Super Kings :122 field:704
## Kolkata Knight Riders :122
## Rajasthan Royals :120
## Royal Challengers Bangalore:113
## Sunrisers Hyderabad : 88
## (Other) :387
## winner result result_margin
## Mumbai Indians :144 no result: 5 Min. : 1.00
## Chennai Super Kings :138 runs :498 1st Qu.: 6.00
## Kolkata Knight Riders :131 tie : 14 Median : 8.00
## Royal Challengers Bangalore:116 wickets :578 Mean : 17.26
## Rajasthan Royals :112 3rd Qu.: 20.00
## (Other) :449 Max. :146.00
## NA's : 5 NA's :19
## target_runs target_overs super_over method
## Min. : 43.0 Min. : 5.00 N:1081 D/L : 21
## 1st Qu.:146.0 1st Qu.:20.00 Y: 14 NA's:1074
## Median :166.0 Median :20.00
## Mean :165.7 Mean :19.76
## 3rd Qu.:187.0 3rd Qu.:20.00
## Max. :288.0 Max. :20.00
## NA's :3 NA's :3
## umpire1 umpire2
## AK Chaudhary :115 S Ravi : 83
## HDPK Dharmasena : 79 VK Sharma : 61
## KN Ananthapadmanabhan: 59 C Shamshuddin: 60
## CB Gaffaney : 53 Nitin Menon : 54
## Asad Rauf : 51 SJA Taufel : 54
## Nitin Menon : 50 RJ Tucker : 53
## (Other) :688 (Other) :730
# 1.a) Categorical summary for the `city` and `winner` factor columns.
# For each column, *_Counts holds the frequency table (missing values are
# counted too, via useNA = "ifany") and *_Values holds the matching labels as
# character strings, in the same order as the counts.
# Labels are stored as character rather than factor because a factor inside a
# list column prints as its integer level codes, which makes the printed
# summary unreadable. Taking the labels from the same table as the counts
# keeps both lists aligned and of equal length.
cat_summary <- data %>%
  summarise(
    # For 'city' column
    city_Values = list(names(table(city, useNA = "ifany"))),
    city_Counts = list(table(city, useNA = "ifany")),
    # For 'winner' column
    winner_Values = list(names(table(winner, useNA = "ifany"))),
    winner_Counts = list(table(winner, useNA = "ifany"))
  )
print("Categorical Summary:") # To print the text
## [1] "Categorical Summary:"
print(cat_summary) # To print the variable
## city_Values
## 1 3, 8, 11, 27, 24, 19, 17, 9, 6, 30, 14, 7, 15, 20, 22, 5, 2, 10, 28, 12, 23, 18, 36, 31, 32, 34, 1, NA, 33, 21, 4, 13, 35, 29, 25, 16, 26
## city_Counts
## 1 37, 36, 65, 29, 2, 7, 12, 61, 85, 7, 90, 13, 13, 15, 3, 3, 77, 9, 57, 8, 4, 3, 5, 93, 14, 5, 173, 3, 9, 7, 51, 6, 10, 7, 10, 15
## winner_Values
## 1 9, 1, 4, 17, 14, 7, 2, 11, 12, 8, NA, 19, 16, 5, 15, 3, 13, 6, 10, 18
## winner_Counts
## 1 138, 29, 48, 67, 13, 28, 88, 6, 131, 24, 144, 12, 24, 112, 10, 5, 116, 7, 88
# 1.b) Numeric summary of `result_margin` and `target_runs`.
# Each statistic is applied to both columns with across(); the output columns
# are named "<column>_<statistic>" and every statistic ignores missing values.
numeric_summary <- data %>%
  summarise(
    across(
      c(result_margin, target_runs),
      list(
        Min = function(v) min(v, na.rm = TRUE),
        Max = function(v) max(v, na.rm = TRUE),
        Mean = function(v) mean(v, na.rm = TRUE),
        Median = function(v) median(v, na.rm = TRUE),
        SD = function(v) sd(v, na.rm = TRUE),
        Q1 = function(v) quantile(v, 0.25, na.rm = TRUE),
        Q3 = function(v) quantile(v, 0.75, na.rm = TRUE)
      ),
      .names = "{.col}_{.fn}"
    )
  )
print("Numeric_Summary:")
## [1] "Numeric_Summary:"
print(numeric_summary)
## result_margin_Min result_margin_Max result_margin_Mean result_margin_Median
## 1 1 146 17.25929 8
## result_margin_SD result_margin_Q1 result_margin_Q3 target_runs_Min
## 1 21.78744 6 20 43
## target_runs_Max target_runs_Mean target_runs_Median target_runs_SD
## 1 288 165.6841 166 33.42705
## target_runs_Q1 target_runs_Q3
## 1 146 187
# 1.c) Combined summary: bundle the categorical and numeric summaries into a
# single named list and print it.
combined_summary <- setNames(
  list(cat_summary, numeric_summary),
  c("Categorical", "Numeric")
)
print("Combined Summary:")
## [1] "Combined Summary:"
print(combined_summary)
## $Categorical
## city_Values
## 1 3, 8, 11, 27, 24, 19, 17, 9, 6, 30, 14, 7, 15, 20, 22, 5, 2, 10, 28, 12, 23, 18, 36, 31, 32, 34, 1, NA, 33, 21, 4, 13, 35, 29, 25, 16, 26
## city_Counts
## 1 37, 36, 65, 29, 2, 7, 12, 61, 85, 7, 90, 13, 13, 15, 3, 3, 77, 9, 57, 8, 4, 3, 5, 93, 14, 5, 173, 3, 9, 7, 51, 6, 10, 7, 10, 15
## winner_Values
## 1 9, 1, 4, 17, 14, 7, 2, 11, 12, 8, NA, 19, 16, 5, 15, 3, 13, 6, 10, 18
## winner_Counts
## 1 138, 29, 48, 67, 13, 28, 88, 6, 131, 24, 144, 12, 24, 112, 10, 5, 116, 7, 88
##
## $Numeric
## result_margin_Min result_margin_Max result_margin_Mean result_margin_Median
## 1 1 146 17.25929 8
## result_margin_SD result_margin_Q1 result_margin_Q3 target_runs_Min
## 1 21.78744 6 20 43
## target_runs_Max target_runs_Mean target_runs_Median target_runs_SD
## 1 288 165.6841 166 33.42705
## target_runs_Q1 target_runs_Q3
## 1 146 187
#Some columns have ambiguous values, and the system doesn’t know which value to use, so this might cause errors in calculations. #For example, for the first 59 rows the season is recorded as 2007/08, so we don’t know which year to consider.
#2.a)In the numeric summary, what exactly are the target runs required to win a match? #The mean of target runs doesn’t always give an exact value, as there are some matches in which no result was given. #2.b)What is the result_margin for matches that ended in a tie? #The data documentation fails to mention that there are NULL values present in some columns. #2.c)My project’s goal is to mutate the dataset as per requirements and gain insights from the IPL match summaries from 2008 to 2024. #For missing values in the dataset, we can use the is.na function to print the missing values, detect the entities that are less important to our dataset, and clean them to get a tidy version of the dataset.
# 3. Aggregation: mean result_margin for each toss_winner, ignoring missing
# margins.
toss_groups <- group_by(data, toss_winner)
Aggr_mean <- summarise(toss_groups, Mean = mean(result_margin, na.rm = TRUE))
print("Aggragate Mean of toss winner by result margin")
## [1] "Aggragate Mean of toss winner by result margin"
print(Aggr_mean)
## # A tibble: 19 × 2
## toss_winner Mean
## <fct> <dbl>
## 1 Chennai Super Kings 18.6
## 2 Deccan Chargers 14.2
## 3 Delhi Capitals 20.2
## 4 Delhi Daredevils 18.7
## 5 Gujarat Lions 19
## 6 Gujarat Titans 12.2
## 7 Kings XI Punjab 17.3
## 8 Kochi Tuskers Kerala 13
## 9 Kolkata Knight Riders 14.7
## 10 Lucknow Super Giants 17.3
## 11 Mumbai Indians 18.2
## 12 Pune Warriors 21.8
## 13 Punjab Kings 12.9
## 14 Rajasthan Royals 15.5
## 15 Rising Pune Supergiant 21.3
## 16 Rising Pune Supergiants 13.4
## 17 Royal Challengers Bangalore 21.6
## 18 Royal Challengers Bengaluru 14
## 19 Sunrisers Hyderabad 14.7
# 4. Visual summaries (i.e., visualizations) of 2 or more columns of the data.
# Histogram of target_runs across matches, using 5-run-wide bins.
ggplot(data = data, mapping = aes(x = target_runs)) +
  geom_histogram(binwidth = 5, colour = "black", fill = "red") +
  labs(
    title = "Frequency of target runs distribution",
    x = "target runs",
    y = "Frequency of runs"
  )
## Warning: Removed 3 rows containing non-finite outside the scale range
## (`stat_bin()`).
#From the data we can say that, most of the time, target runs are in the range of 150 to 170.
# Box plot of winner (x) against season (y), one fill colour per winner.
# X-axis labels are rotated 90 degrees so the long team names stay legible.
# NOTE(review): both winner and season appear to be categorical (the file is
# read with stringsAsFactors = TRUE) - confirm that a box plot is the intended
# chart here; a count-based chart may show wins per season more directly.
ggplot(data = data, mapping = aes(x = winner, y = season, fill = winner)) +
  geom_boxplot() +
  labs(
    title = "Box Plot of winners by season",
    x = "Winner",
    y = "Season"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
#From this visualization, we can tell the number of times a team has won in a particular season.
#Distribution of categorical (winner) vs continuous (result_margin) variables
# Scatter plot of result_margin for each winner, one point colour per winner.
# X-axis labels are rotated 90 degrees so the long team names stay legible.
ggplot(
  data = data,
  mapping = aes(x = winner, y = result_margin, colour = winner)
) +
  geom_point(size = 2) +
  labs(
    title = "Scatter Plot of winner by result margin",
    x = "winner",
    y = "result margin"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
## Warning: Removed 19 rows containing missing values or values outside the scale range
## (`geom_point()`).
#From this visualization, we can say that there is a trend: matches won by LSG and RCB have the highest result margins.
#By using these visualizations, we met our requirements.