library(readr)
# Import the match-level IPL data straight from the zipped CSV
ipl <- read_csv(file = "C:/Users/ASUS/Downloads/ipl.zip")
## Rows: 636 Columns: 18
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (12): city, date, team1, team2, toss_winner, toss_decision, result, winn...
## dbl (5): id, season, dl_applied, win_by_runs, win_by_wickets
## lgl (1): umpire3
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Load the required library
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.3.3
##
## Attaching package: 'dplyr'
##
## The following objects are masked from 'package:stats':
##
## filter, lag
##
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
# Preview the first six matches to sanity-check the import
print(head(ipl, n = 6L))
## # A tibble: 6 × 18
## id season city date team1 team2 toss_winner toss_decision result
## <dbl> <dbl> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 1 2017 Hyderabad 5/4/2017 Sunris… Roya… Royal Chal… field normal
## 2 2 2017 Pune 6/4/2017 Mumbai… Risi… Rising Pun… field normal
## 3 3 2017 Rajkot 7/4/2017 Gujara… Kolk… Kolkata Kn… field normal
## 4 4 2017 Indore 8/4/2017 Rising… King… Kings XI P… field normal
## 5 5 2017 Bangalore 8/4/2017 Royal … Delh… Royal Chal… bat normal
## 6 6 2017 Hyderabad 9/4/2017 Gujara… Sunr… Sunrisers … field normal
## # ℹ 9 more variables: dl_applied <dbl>, winner <chr>, win_by_runs <dbl>,
## # win_by_wickets <dbl>, player_of_match <chr>, venue <chr>, umpire1 <chr>,
## # umpire2 <chr>, umpire3 <lgl>
# Show the column types and a few leading values of every column
str(object = ipl)
## spc_tbl_ [636 × 18] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ id : num [1:636] 1 2 3 4 5 6 7 8 9 10 ...
## $ season : num [1:636] 2017 2017 2017 2017 2017 ...
## $ city : chr [1:636] "Hyderabad" "Pune" "Rajkot" "Indore" ...
## $ date : chr [1:636] "5/4/2017" "6/4/2017" "7/4/2017" "8/4/2017" ...
## $ team1 : chr [1:636] "Sunrisers Hyderabad" "Mumbai Indians" "Gujarat Lions" "Rising Pune Supergiant" ...
## $ team2 : chr [1:636] "Royal Challengers Bangalore" "Rising Pune Supergiant" "Kolkata Knight Riders" "Kings XI Punjab" ...
## $ toss_winner : chr [1:636] "Royal Challengers Bangalore" "Rising Pune Supergiant" "Kolkata Knight Riders" "Kings XI Punjab" ...
## $ toss_decision : chr [1:636] "field" "field" "field" "field" ...
## $ result : chr [1:636] "normal" "normal" "normal" "normal" ...
## $ dl_applied : num [1:636] 0 0 0 0 0 0 0 0 0 0 ...
## $ winner : chr [1:636] "Sunrisers Hyderabad" "Rising Pune Supergiant" "Kolkata Knight Riders" "Kings XI Punjab" ...
## $ win_by_runs : num [1:636] 35 0 0 0 15 0 0 0 97 0 ...
## $ win_by_wickets : num [1:636] 0 7 10 6 0 9 4 8 0 4 ...
## $ player_of_match: chr [1:636] "Yuvraj Singh" "SPD Smith" "CA Lynn" "GJ Maxwell" ...
## $ venue : chr [1:636] "Rajiv Gandhi International Stadium, Uppal" "Maharashtra Cricket Association Stadium" "Saurashtra Cricket Association Stadium" "Holkar Cricket Stadium" ...
## $ umpire1 : chr [1:636] "AY Dandekar" "A Nand Kishore" "Nitin Menon" "AK Chaudhary" ...
## $ umpire2 : chr [1:636] "NJ Llong" "S Ravi" "CK Nandan" "C Shamshuddin" ...
## $ umpire3 : logi [1:636] NA NA NA NA NA NA ...
## - attr(*, "spec")=
## .. cols(
## .. id = col_double(),
## .. season = col_double(),
## .. city = col_character(),
## .. date = col_character(),
## .. team1 = col_character(),
## .. team2 = col_character(),
## .. toss_winner = col_character(),
## .. toss_decision = col_character(),
## .. result = col_character(),
## .. dl_applied = col_double(),
## .. winner = col_character(),
## .. win_by_runs = col_double(),
## .. win_by_wickets = col_double(),
## .. player_of_match = col_character(),
## .. venue = col_character(),
## .. umpire1 = col_character(),
## .. umpire2 = col_character(),
## .. umpire3 = col_logical()
## .. )
## - attr(*, "problems")=<externalptr>
# Per-column summary statistics (five-number summary for numeric columns)
summary(object = ipl)
## id season city date
## Min. : 1.0 Min. :2008 Length:636 Length:636
## 1st Qu.:159.8 1st Qu.:2010 Class :character Class :character
## Median :318.5 Median :2012 Mode :character Mode :character
## Mean :318.5 Mean :2012
## 3rd Qu.:477.2 3rd Qu.:2015
## Max. :636.0 Max. :2017
## team1 team2 toss_winner toss_decision
## Length:636 Length:636 Length:636 Length:636
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## result dl_applied winner win_by_runs
## Length:636 Min. :0.00000 Length:636 Min. : 0.00
## Class :character 1st Qu.:0.00000 Class :character 1st Qu.: 0.00
## Mode :character Median :0.00000 Mode :character Median : 0.00
## Mean :0.02516 Mean : 13.68
## 3rd Qu.:0.00000 3rd Qu.: 20.00
## Max. :1.00000 Max. :146.00
## win_by_wickets player_of_match venue umpire1
## Min. : 0.000 Length:636 Length:636 Length:636
## 1st Qu.: 0.000 Class :character Class :character Class :character
## Median : 4.000 Mode :character Mode :character Mode :character
## Mean : 3.373
## 3rd Qu.: 7.000
## Max. :10.000
## umpire2 umpire3
## Length:636 Mode:logical
## Class :character NA's:636
## Mode :character
##
##
##
# ---- Data cleaning and feature preparation ----
# Handle missing values: label matches without a recorded city as "Unknown"
ipl$city[is.na(ipl$city)] <- "Unknown"

# Convert data types
# Dates are stored as day/month/year (e.g. "5/4/2017"); parsing them with
# "%Y-%m-%d" turned every value into NA. ISO-formatted values are still
# accepted as a fallback for rows that do not match the primary format.
ipl$date <- local({
  parsed <- as.Date(ipl$date, format = "%d/%m/%Y")
  iso <- as.Date(ipl$date, format = "%Y-%m-%d")
  unparsed <- is.na(parsed)
  parsed[unparsed] <- iso[unparsed]
  parsed
})
ipl$team1 <- as.factor(ipl$team1)
ipl$team2 <- as.factor(ipl$team2)

# Ensure that factors have more than one level
print(length(unique(ipl$team1)))
## [1] 14
print(length(unique(ipl$team2)))
## [1] 14

# Encode categorical data using one-hot encoding; model.matrix() needs at
# least two levels per factor, so guard against degenerate input
if (length(unique(ipl$team1)) > 1 && length(unique(ipl$team2)) > 1) {
  ipl <- cbind(ipl, model.matrix(~ team1 + team2 - 1, data = ipl))
} else {
  warning("Not enough levels to create dummy variables for teams.")
}

# Feature Engineering
# Compare the names as character: `==` on two factors raises an error when
# their level sets differ.
# NOTE(review): a fixture should never list the same side as team1 and team2,
# so this flag is expected to be 0 on every row - confirm the intended
# definition of "derby" (e.g. same home city) before relying on it.
ipl$is_derby <- as.integer(as.character(ipl$team1) == as.character(ipl$team2))

# Normalize or standardize numerical data
# scale() returns a one-column matrix. Store the z-scores in separate plain
# numeric columns and keep the raw margins intact, because the analyses below
# test `win_by_runs > 0` / `win_by_wickets > 0` and report medians, which are
# only meaningful on the original run and wicket counts.
ipl$win_by_runs_scaled <- as.numeric(scale(ipl$win_by_runs))
ipl$win_by_wickets_scaled <- as.numeric(scale(ipl$win_by_wickets))

# Remove unnecessary columns
ipl <- select(ipl, -c(umpire1, umpire2, umpire3))
# Q1: Number of matches played per season?
# Load the required library
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.3.3
# One bar per season, each annotated with its match count
ggplot(ipl, aes(x = factor(season), fill = factor(season))) +
  # Draw bars and outline them in black; the fill legend would only repeat the x axis
  geom_bar(stat = "count", color = "black", show.legend = FALSE) +
  # after_stat(count) replaces the `..count..` notation deprecated in ggplot2 3.4.0
  geom_text(
    stat = "count",
    aes(label = after_stat(count)),
    vjust = -0.5,
    position = position_stack(vjust = 0.5)
  ) +
  labs(title = "Count of Matches per Season", x = "Season", y = "Count") +
  scale_fill_viridis_d() + # Use the Viridis color scale
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 0, hjust = 0.5, vjust = 0.5))

# Q2: Number of matches in each venue?
# Load the required library
library(ggplot2)
# Bar chart counting the rows (matches) recorded for every venue
ggplot(data = ipl, mapping = aes(x = venue)) +
  geom_bar(fill = "steelblue") + # counting rows is geom_bar()'s default stat
  labs(title = "Match Count per Venue", x = "Venue", y = "Count") +
  theme_minimal() +
  # Venue names are long, so print them vertically
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))

# Q3: Number of matches played in each city?
# Load the required library
library(ggplot2)
# Build the per-city count plot; each city gets its own fill colour
p <- ggplot(data = ipl, mapping = aes(x = city, fill = city)) +
  geom_bar(colour = "black") + # black outline keeps neighbouring bars distinct
  scale_fill_viridis_d() + # discrete Viridis palette
  labs(title = "Match Count per City", x = "City", y = "Count") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) # vertical x-axis labels
# Render the plot
print(p)

# Q4: Who are the top 10 players with the most 'Player of the Match' awards?
# Load necessary libraries
library(dplyr)
# Count awards per player, skipping matches with no award recorded, then keep
# the ten highest counts. slice_max() replaces the superseded top_n(): the
# ranking column is named explicitly instead of being guessed from the last
# column, and the result comes back sorted in descending order.
top_players <- ipl %>%
  filter(!is.na(player_of_match)) %>%
  group_by(player_of_match) %>%
  summarise(awards_count = n()) %>%
  # with_ties = TRUE keeps every player tied with the 10th place, as before
  slice_max(order_by = awards_count, n = 10, with_ties = TRUE)
# Print the results
print(top_players)
## # A tibble: 10 × 2
## player_of_match awards_count
## <chr> <int>
## 1 CH Gayle 18
## 2 YK Pathan 16
## 3 AB de Villiers 15
## 4 DA Warner 15
## 5 RG Sharma 14
## 6 SK Raina 14
## 7 G Gambhir 13
## 8 MS Dhoni 13
## 9 AM Rahane 12
## 10 MEK Hussey 12
# Q5: What is the win percentage for teams when they win the toss?
# Load necessary library
library(dplyr)
# Per toss-winning team: how many tosses it won, how many of those matches it
# also won, and the resulting percentage (highest first)
toss_win_effect <- ipl %>%
  drop_na(toss_winner, winner) %>% # rows without a toss winner or match winner cannot be scored
  mutate(toss_winner_won = as.numeric(toss_winner == winner)) %>%
  group_by(toss_winner) %>%
  summarise(
    Total_Toss_Wins = n(),
    Wins_Post_Toss_Win = sum(toss_winner_won, na.rm = TRUE),
    Win_Percentage = Wins_Post_Toss_Win / Total_Toss_Wins * 100
  ) %>%
  arrange(desc(Win_Percentage))
# Show the table
print(toss_win_effect)
## # A tibble: 14 × 4
## toss_winner Total_Toss_Wins Wins_Post_Toss_Win Win_Percentage
## <chr> <int> <dbl> <dbl>
## 1 Rising Pune Supergiant 6 5 83.3
## 2 Gujarat Lions 15 10 66.7
## 3 Chennai Super Kings 66 42 63.6
## 4 Mumbai Indians 85 48 56.5
## 5 Kolkata Knight Riders 78 44 56.4
## 6 Rajasthan Royals 62 34 54.8
## 7 Royal Challengers Bangalore 69 35 50.7
## 8 Kochi Tuskers Kerala 8 4 50
## 9 Sunrisers Hyderabad 35 17 48.6
## 10 Delhi Daredevils 71 33 46.5
## 11 Deccan Chargers 43 19 44.2
## 12 Rising Pune Supergiants 7 3 42.9
## 13 Kings XI Punjab 68 28 41.2
## 14 Pune Warriors 20 3 15
# Q6: Selections made after winning the toss?
# Load necessary library
library(dplyr)
# Share (in percent) of each toss decision across all matches
toss_decision_counts <- table(ipl$toss_decision)
toss_decision_percentages <- 100 * prop.table(toss_decision_counts)
# One colour per decision
colors <- c("violet", "red")
# Slice labels of the form "<decision> <percent>%"
labels <- sprintf(
  "%s %.1f%%",
  names(toss_decision_percentages),
  as.vector(toss_decision_percentages)
)
# Draw the pie, starting at 12 o'clock and running clockwise
pie(
  x = toss_decision_percentages,
  labels = labels,
  col = colors,
  main = "Toss Decision Percentage",
  clockwise = TRUE,
  init.angle = 90
)

# Q7: Does the side batting second win more often than it loses?
# The chasing side is derived from the toss columns and compared with the
# recorded winner. This replaces the previous `win_by_wickets > 0` / `== 0`
# test, which counted every match not won by wickets as a loss and depended on
# whatever transformation had been applied to the margin columns earlier in
# the script. Only rows with result "normal" and a recorded winner are used.
chasing_side_won <- local({
  decided <- ipl[!is.na(ipl$winner) & ipl$result %in% "normal", ]
  first_listed <- as.character(decided$team1)
  second_listed <- as.character(decided$team2)
  # The side that lost the toss is whichever listed team is not the toss winner
  toss_loser <- ifelse(decided$toss_winner == first_listed, second_listed, first_listed)
  # Choosing "field" means the toss winner bats second; otherwise the toss loser does
  batting_second <- ifelse(decided$toss_decision == "field", decided$toss_winner, toss_loser)
  decided$winner == batting_second
})
# Calculate the number of wins and losses when batting second
no_of_wins <- sum(chasing_side_won, na.rm = TRUE)
no_of_loss <- sum(!chasing_side_won, na.rm = TRUE)
# Labels for the pie chart
labels <- c("Wins", "Loss")
# Total matches calculated for percentage computation
total <- no_of_wins + no_of_loss
stopifnot(total > 0) # avoid dividing by zero when no decided match is available
sizes <- c((no_of_wins / total) * 100, (no_of_loss / total) * 100)
# Define colors for the pie slices
colors <- c('red', 'green')
# Create labels with percentages for the pie chart
percentage_labels <- sprintf("%s %.1f%%", labels, sizes)
# Create the pie chart
pie(sizes, labels = percentage_labels, col = colors, main = "Win Percentage Batting Second",
    init.angle = 90, clockwise = TRUE)

# Q8: Which teams won the toss most often?
# Load necessary libraries
library(ggplot2)
library(dplyr)
# Prepare the data: number of toss wins per team, most frequent first
toss_winner_counts <- ipl %>%
  count(toss_winner) %>%
  arrange(desc(n))
# Horizontal bars ordered by toss wins
ggplot(toss_winner_counts, aes(x = n, y = reorder(toss_winner, n), fill = toss_winner)) +
  geom_col() + # bar length is the precomputed count
  # The ColorBrewer "Set1" palette only has 9 colours, fewer than the number
  # of teams, which raised a warning; the discrete Viridis scale has no such limit
  scale_fill_viridis_d() +
  labs(title = "Frequency of Toss Wins by Team",
       x = "Count of Toss Wins",
       y = "Team") +
  theme_minimal() +
  theme(legend.position = "bottom")

# Q9: Player of the match as per the number of matches?
# Summarise 'Player of the Match' awards for players with more than 10 awards.
# NOTE(review): this table used to compare the player's name with the winning
# team's name, which can never match, so it reported 0 wins and 0% for every
# player. The columns used here contain no player-to-team mapping, so whether
# the award winner's side won cannot be derived. The two win columns are kept
# so the table's shape is unchanged, but they are NA (unknown) rather than a
# misleading 0 until squad data is joined in.
player_of_match_influence <- ipl %>%
  group_by(player_of_match) %>%
  summarise(
    matches_won = NA_integer_,
    total_awards = n(),
    win_percentage = NA_real_
  ) %>%
  filter(total_awards > 10) %>% # Consider only players with more than 10 awards for a more robust analysis
  arrange(desc(total_awards)) # win_percentage is unknown, so rank by award count
# View the results
print(player_of_match_influence)
# Q10: Performance of teams that won by runs (the team which batted first)
# Load necessary library
library(dplyr)
library(ggplot2)
# Per winning team, the median of win_by_runs over rows where win_by_runs > 0,
# ordered from the largest median to the smallest
median_wins_by_runs <- ipl %>%
  filter(win_by_runs > 0) %>%
  group_by(winner) %>%
  summarise(
    median_runs = median(win_by_runs)
  ) %>%
  arrange(desc(median_runs))
## Warning: Using one column matrices in `filter()` was deprecated in dplyr 1.1.0.
## ℹ Please use one dimensional logical vectors instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# Show the table
print(median_wins_by_runs)
## # A tibble: 13 × 2
## winner median_runs
## <chr> <dbl>
## 1 Rajasthan Royals 1.23
## 2 Royal Challengers Bangalore 0.996
## 3 Delhi Daredevils 0.975
## 4 Chennai Super Kings 0.913
## 5 Deccan Chargers 0.724
## 6 Kolkata Knight Riders 0.724
## 7 Mumbai Indians 0.641
## 8 Kings XI Punjab 0.557
## 9 Rising Pune Supergiant 0.557
## 10 Rising Pune Supergiants 0.536
## 11 Pune Warriors 0.432
## 12 Sunrisers Hyderabad 0.348
## 13 Kochi Tuskers Kerala 0.139
# Keep only the rows where win_by_runs is positive
data_won_by_runs <- filter(ipl, win_by_runs > 0)
# Horizontal boxplot: one box per winning team showing the spread of win_by_runs
ggplot(data = data_won_by_runs, mapping = aes(x = win_by_runs, y = winner)) +
  geom_boxplot() +
  labs(title = "Winning by Runs - Team Performance", x = "Win by Runs", y = "Winner") +
  theme_minimal() +
  theme(axis.title.y = element_blank()) # team names already label the y axis

# Q11: Performance of teams that won by wickets (batting second)
# Load necessary library
library(dplyr)
library(ggplot2)
# Per winning team, the median of win_by_wickets over rows where
# win_by_wickets > 0, ordered from the largest median to the smallest
median_wins_by_wickets <- ipl %>%
  filter(win_by_wickets > 0) %>%
  group_by(winner) %>%
  summarise(
    median_wickets = median(win_by_wickets, na.rm = TRUE)
  ) %>%
  arrange(desc(median_wickets))
# Show the table
print(median_wins_by_wickets)
## # A tibble: 14 × 2
## winner median_wickets
## <chr> <dbl>
## 1 Kochi Tuskers Kerala 1.21
## 2 Delhi Daredevils 1.06
## 3 Kolkata Knight Riders 1.06
## 4 Rajasthan Royals 1.06
## 5 Rising Pune Supergiants 1.06
## 6 Royal Challengers Bangalore 1.06
## 7 Sunrisers Hyderabad 1.06
## 8 Chennai Super Kings 0.914
## 9 Pune Warriors 0.914
## 10 Deccan Chargers 0.768
## 11 Gujarat Lions 0.768
## 12 Kings XI Punjab 0.768
## 13 Mumbai Indians 0.768
## 14 Rising Pune Supergiant 0.768
# Keep only the rows where win_by_wickets is positive
data_won_by_wickets <- filter(ipl, win_by_wickets > 0)
# Horizontal boxplot: one coloured box per winning team showing the spread of win_by_wickets
ggplot(
  data = data_won_by_wickets,
  mapping = aes(x = win_by_wickets, y = winner, fill = winner)
) +
  geom_boxplot() +
  labs(title = "Winning by Wickets - Team Performance", x = "Win by Wickets", y = "Winner") +
  theme_minimal() +
  theme(
    axis.title.y = element_blank(),
    axis.text.y = element_text(size = 7), # small enough to fit every team name
    legend.position = "none" # the fill legend would duplicate the y axis
  )

# Q12: Is the toss winner's decision to bat correlated with the toss winner
# also winning the match? (visualised with corrplot)
# Load necessary libraries
library(dplyr)
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.3.3
## corrplot 0.92 loaded
# Two 0/1 indicators per match
ipl_data <- ipl %>%
  mutate(
    # 1 if the toss winner also won the match, 0 otherwise (NA when either is missing)
    toss_winner_is_match_winner = as.integer(toss_winner == winner),
    # 1 if the toss winner chose to bat, 0 for any other decision (e.g. "field")
    choose_to_bat = as.integer(toss_decision == "bat")
  )
# Pearson correlation between "toss winner won the match" and "toss winner chose to bat"
correlation_result <- cor(ipl_data$toss_winner_is_match_winner, ipl_data$choose_to_bat, method = "pearson", use = "complete.obs")
# Print the correlation result
print(correlation_result)
## [1] -0.09993806
# The same correlation as a 2 x 2 matrix, the input format corrplot() expects
correlation_matrix <- cor(ipl_data[, c("toss_winner_is_match_winner", "choose_to_bat")], use = "complete.obs")
# Visualizing the correlation matrix using corrplot
# `mar` reserves space above the plot; with corrplot's default zero margins
# the title is drawn outside the visible area and gets clipped
corrplot(correlation_matrix, method = "circle", type = "upper",
         title = "Correlation between Toss Outcome and Match Decisions",
         tl.col = "black", tl.cex = 0.8,
         mar = c(0, 0, 2, 0))
