library(readr)
# Location of the IPL matches CSV. Set the IPL_DATA_PATH environment variable
# to point at a copy stored elsewhere; when it is unset or empty the original
# download location is used.
ipl_path <- Sys.getenv("IPL_DATA_PATH", unset = "")
if (!nzchar(ipl_path)) {
  ipl_path <- "C:/Users/ASUS/Downloads/ipl.zip"
}
ipl <- read_csv(ipl_path)
## Rows: 636 Columns: 18
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (12): city, date, team1, team2, toss_winner, toss_decision, result, winn...
## dbl  (5): id, season, dl_applied, win_by_runs, win_by_wickets
## lgl  (1): umpire3
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Load the required library
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.3.3
## 
## Attaching package: 'dplyr'
## 
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## 
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)

# Preview the first six rows of the match data
print(head(ipl, n = 6L))
## # A tibble: 6 × 18
##      id season city      date     team1   team2 toss_winner toss_decision result
##   <dbl>  <dbl> <chr>     <chr>    <chr>   <chr> <chr>       <chr>         <chr> 
## 1     1   2017 Hyderabad 5/4/2017 Sunris… Roya… Royal Chal… field         normal
## 2     2   2017 Pune      6/4/2017 Mumbai… Risi… Rising Pun… field         normal
## 3     3   2017 Rajkot    7/4/2017 Gujara… Kolk… Kolkata Kn… field         normal
## 4     4   2017 Indore    8/4/2017 Rising… King… Kings XI P… field         normal
## 5     5   2017 Bangalore 8/4/2017 Royal … Delh… Royal Chal… bat           normal
## 6     6   2017 Hyderabad 9/4/2017 Gujara… Sunr… Sunrisers … field         normal
## # ℹ 9 more variables: dl_applied <dbl>, winner <chr>, win_by_runs <dbl>,
## #   win_by_wickets <dbl>, player_of_match <chr>, venue <chr>, umpire1 <chr>,
## #   umpire2 <chr>, umpire3 <lgl>
# Compact structure: one line per column with its type and first values
str(object = ipl)
## spc_tbl_ [636 × 18] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ id             : num [1:636] 1 2 3 4 5 6 7 8 9 10 ...
##  $ season         : num [1:636] 2017 2017 2017 2017 2017 ...
##  $ city           : chr [1:636] "Hyderabad" "Pune" "Rajkot" "Indore" ...
##  $ date           : chr [1:636] "5/4/2017" "6/4/2017" "7/4/2017" "8/4/2017" ...
##  $ team1          : chr [1:636] "Sunrisers Hyderabad" "Mumbai Indians" "Gujarat Lions" "Rising Pune Supergiant" ...
##  $ team2          : chr [1:636] "Royal Challengers Bangalore" "Rising Pune Supergiant" "Kolkata Knight Riders" "Kings XI Punjab" ...
##  $ toss_winner    : chr [1:636] "Royal Challengers Bangalore" "Rising Pune Supergiant" "Kolkata Knight Riders" "Kings XI Punjab" ...
##  $ toss_decision  : chr [1:636] "field" "field" "field" "field" ...
##  $ result         : chr [1:636] "normal" "normal" "normal" "normal" ...
##  $ dl_applied     : num [1:636] 0 0 0 0 0 0 0 0 0 0 ...
##  $ winner         : chr [1:636] "Sunrisers Hyderabad" "Rising Pune Supergiant" "Kolkata Knight Riders" "Kings XI Punjab" ...
##  $ win_by_runs    : num [1:636] 35 0 0 0 15 0 0 0 97 0 ...
##  $ win_by_wickets : num [1:636] 0 7 10 6 0 9 4 8 0 4 ...
##  $ player_of_match: chr [1:636] "Yuvraj Singh" "SPD Smith" "CA Lynn" "GJ Maxwell" ...
##  $ venue          : chr [1:636] "Rajiv Gandhi International Stadium, Uppal" "Maharashtra Cricket Association Stadium" "Saurashtra Cricket Association Stadium" "Holkar Cricket Stadium" ...
##  $ umpire1        : chr [1:636] "AY Dandekar" "A Nand Kishore" "Nitin Menon" "AK Chaudhary" ...
##  $ umpire2        : chr [1:636] "NJ Llong" "S Ravi" "CK Nandan" "C Shamshuddin" ...
##  $ umpire3        : logi [1:636] NA NA NA NA NA NA ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   id = col_double(),
##   ..   season = col_double(),
##   ..   city = col_character(),
##   ..   date = col_character(),
##   ..   team1 = col_character(),
##   ..   team2 = col_character(),
##   ..   toss_winner = col_character(),
##   ..   toss_decision = col_character(),
##   ..   result = col_character(),
##   ..   dl_applied = col_double(),
##   ..   winner = col_character(),
##   ..   win_by_runs = col_double(),
##   ..   win_by_wickets = col_double(),
##   ..   player_of_match = col_character(),
##   ..   venue = col_character(),
##   ..   umpire1 = col_character(),
##   ..   umpire2 = col_character(),
##   ..   umpire3 = col_logical()
##   .. )
##  - attr(*, "problems")=<externalptr>
# Per-column summary statistics
print(summary(ipl))
##        id            season         city               date          
##  Min.   :  1.0   Min.   :2008   Length:636         Length:636        
##  1st Qu.:159.8   1st Qu.:2010   Class :character   Class :character  
##  Median :318.5   Median :2012   Mode  :character   Mode  :character  
##  Mean   :318.5   Mean   :2012                                        
##  3rd Qu.:477.2   3rd Qu.:2015                                        
##  Max.   :636.0   Max.   :2017                                        
##     team1              team2           toss_winner        toss_decision     
##  Length:636         Length:636         Length:636         Length:636        
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##     result            dl_applied         winner           win_by_runs    
##  Length:636         Min.   :0.00000   Length:636         Min.   :  0.00  
##  Class :character   1st Qu.:0.00000   Class :character   1st Qu.:  0.00  
##  Mode  :character   Median :0.00000   Mode  :character   Median :  0.00  
##                     Mean   :0.02516                      Mean   : 13.68  
##                     3rd Qu.:0.00000                      3rd Qu.: 20.00  
##                     Max.   :1.00000                      Max.   :146.00  
##  win_by_wickets   player_of_match       venue             umpire1         
##  Min.   : 0.000   Length:636         Length:636         Length:636        
##  1st Qu.: 0.000   Class :character   Class :character   Class :character  
##  Median : 4.000   Mode  :character   Mode  :character   Mode  :character  
##  Mean   : 3.373                                                           
##  3rd Qu.: 7.000                                                           
##  Max.   :10.000                                                           
##    umpire2          umpire3       
##  Length:636         Mode:logical  
##  Class :character   NA's:636      
##  Mode  :character                 
##                                   
##                                   
## 
# Handle missing values

# Replace missing 'city' with 'Unknown'
ipl$city <- ifelse(is.na(ipl$city), "Unknown", ipl$city)

# Convert data types
# The dates are day/month/year strings (e.g. "5/4/2017"); parsing them with
# "%Y-%m-%d" yields NA for every row. Parse as d/m/Y first and fall back to
# ISO format for any value that does not match.
raw_date <- ipl$date
parsed_date <- as.Date(raw_date, format = "%d/%m/%Y")
iso_rows <- is.na(parsed_date) & !is.na(raw_date)
parsed_date[iso_rows] <- as.Date(raw_date[iso_rows], format = "%Y-%m-%d")
ipl$date <- parsed_date

ipl$team1 <- as.factor(ipl$team1)
ipl$team2 <- as.factor(ipl$team2)

# Ensure that factors have more than one level
print(length(unique(ipl$team1)))
## [1] 14
print(length(unique(ipl$team2)))
## [1] 14
# Encode the teams as indicator columns; model.matrix() needs at least two
# levels in each factor. Because the intercept is removed, team1 gets one
# column per level while team2 is treatment-coded (its first level is the
# reference and has no column).
if (length(unique(ipl$team1)) > 1 && length(unique(ipl$team2)) > 1) {
  ipl <- cbind(ipl, model.matrix(~ team1 + team2 - 1, data = ipl))
} else {
  warning("Not enough levels to create dummy variables for teams.")
}

# Feature Engineering
# Compare the team names as character: `==` on two factors stops with an
# error when their level sets differ.
ipl$is_derby <- as.integer(
  as.character(ipl$team1) == as.character(ipl$team2)
)

# Normalize or standardize numerical data
# scale() returns a one-column matrix of z-scores. Store it as a plain numeric
# vector in new columns so win_by_runs / win_by_wickets keep the actual
# margins that the later "> 0" filters and medians rely on.
ipl$win_by_runs_scaled <- as.numeric(scale(ipl$win_by_runs))
ipl$win_by_wickets_scaled <- as.numeric(scale(ipl$win_by_wickets))

# Remove unnecessary columns
ipl <- select(ipl, -c(umpire1, umpire2, umpire3))
# Q1: Number of matches played per season?
# Load the required library
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.3.3
# Create a bar plot for the 'season' column
ggplot(ipl, aes(x = factor(season), fill = factor(season))) +
  # Draw bars and outline them in black
  geom_bar(stat = "count", color = "black", show.legend = FALSE) +
  # Label each bar with its count. after_stat(count) replaces the `..count..`
  # dot-dot notation, which is deprecated since ggplot2 3.4.0.
  geom_text(
    stat = "count",
    aes(label = after_stat(count)),
    vjust = -0.5,
    position = position_stack(vjust = 0.5)
  ) +
  labs(title = "Count of Matches per Season", x = "Season", y = "Count") +
  scale_fill_viridis_d() +  # Use the Viridis color scale
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 0, hjust = 0.5, vjust = 0.5))

# Q2: Number of matches in each venue?

# ggplot2 provides the plotting functions used below
library(ggplot2)

# Bar chart of match counts per venue; geom_bar() counts rows by default.
# Venue names are long, so the x-axis labels are rotated to vertical.
venue_plot <- ggplot(data = ipl, mapping = aes(x = venue)) +
  geom_bar(fill = "steelblue") +
  labs(title = "Match Count per Venue", x = "Venue", y = "Count") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))
print(venue_plot)

# Q3: Number of matches played in each city?

# ggplot2 provides the plotting functions used below
library(ggplot2)

# One bar per city, filled by city and outlined in black; the x-axis labels
# are rotated to vertical so the city names do not overlap.
vertical_labels <- element_text(angle = 90, hjust = 1)
p <- ggplot(data = ipl, mapping = aes(x = city, fill = city)) +
  labs(title = "Match Count per City", x = "City", y = "Count") +
  geom_bar(color = "black") +
  scale_fill_viridis_d() +
  theme_minimal() +
  theme(axis.text.x = vertical_labels)

# Render the plot
print(p)

# Q4: Who are the top 10 players with the most 'Player of the Match' awards?

# Load necessary libraries
library(dplyr)

# Count awards per player and keep the 10 highest counts.
# slice_max() names the ranking column explicitly; the superseded top_n()
# picked the last column implicitly (hence its "Selecting by awards_count"
# message). Like top_n(), slice_max() keeps ties at the cut-off by default.
top_players <- ipl %>%
  filter(!is.na(player_of_match)) %>%  # Ignore matches without an award
  group_by(player_of_match) %>%
  summarise(awards_count = n()) %>%
  slice_max(awards_count, n = 10) %>%
  arrange(desc(awards_count), player_of_match)  # Ties listed alphabetically

# Print the results
print(top_players)
## # A tibble: 10 × 2
##    player_of_match awards_count
##    <chr>                  <int>
##  1 CH Gayle                  18
##  2 YK Pathan                 16
##  3 AB de Villiers            15
##  4 DA Warner                 15
##  5 RG Sharma                 14
##  6 SK Raina                  14
##  7 G Gambhir                 13
##  8 MS Dhoni                  13
##  9 AM Rahane                 12
## 10 MEK Hussey                12
# Q5: What is the win percentage for teams when they win the toss?

# dplyr supplies the data-manipulation verbs used below
library(dplyr)

# For each toss winner: how many tosses it won, how many of those matches it
# also won, and the resulting percentage. Rows without a toss winner or a
# match winner are excluded.
toss_win_effect <- ipl %>%
  filter(!is.na(toss_winner) & !is.na(winner)) %>%
  mutate(win_toss_and_match = as.numeric(toss_winner == winner)) %>%
  group_by(toss_winner) %>%
  summarise(
    Total_Toss_Wins = n(),
    Wins_Post_Toss_Win = sum(win_toss_and_match, na.rm = TRUE),
    Win_Percentage = Wins_Post_Toss_Win / Total_Toss_Wins * 100
  ) %>%
  arrange(desc(Win_Percentage))

# Show the table, highest win percentage first
print(toss_win_effect)
## # A tibble: 14 × 4
##    toss_winner                 Total_Toss_Wins Wins_Post_Toss_Win Win_Percentage
##    <chr>                                 <int>              <dbl>          <dbl>
##  1 Rising Pune Supergiant                    6                  5           83.3
##  2 Gujarat Lions                            15                 10           66.7
##  3 Chennai Super Kings                      66                 42           63.6
##  4 Mumbai Indians                           85                 48           56.5
##  5 Kolkata Knight Riders                    78                 44           56.4
##  6 Rajasthan Royals                         62                 34           54.8
##  7 Royal Challengers Bangalore              69                 35           50.7
##  8 Kochi Tuskers Kerala                      8                  4           50  
##  9 Sunrisers Hyderabad                      35                 17           48.6
## 10 Delhi Daredevils                         71                 33           46.5
## 11 Deccan Chargers                          43                 19           44.2
## 12 Rising Pune Supergiants                   7                  3           42.9
## 13 Kings XI Punjab                          68                 28           41.2
## 14 Pune Warriors                            20                  3           15
# Q6: Which decisions are made after winning the toss?

# dplyr is attached here as in the other sections; the code below is base R
library(dplyr)

# Share of each toss decision, in percent
toss_decision_counts <- table(ipl$toss_decision)
toss_decision_percentages <-
  toss_decision_counts / sum(toss_decision_counts) * 100

# One colour per slice
colors <- c("violet", "red")

# Slice labels of the form "<decision> <percentage>%"
labels <- sprintf(
  "%s %.1f%%",
  names(toss_decision_percentages),
  as.vector(toss_decision_percentages)
)

# Pie chart, first slice starting at 12 o'clock and drawn clockwise
pie(
  toss_decision_percentages,
  labels = labels,
  col = colors,
  main = "Toss Decision Percentage",
  init.angle = 90,
  clockwise = TRUE
)

# Q7: As we have seen above the toss decision, does it impact the match result?

# Identify the side that batted second from the toss columns: a toss winner
# who chose to field bats second, otherwise the other team does. Deriving it
# this way does not depend on how win_by_wickets has been transformed, and it
# avoids counting ties and no-results as losses (which `win_by_wickets == 0`
# did).
team1_name <- as.character(ipl$team1)
team2_name <- as.character(ipl$team2)
toss_winner_name <- as.character(ipl$toss_winner)
toss_loser_name <- ifelse(toss_winner_name == team1_name, team2_name, team1_name)
batting_second <- ifelse(
  ipl$toss_decision == "field",
  toss_winner_name,
  toss_loser_name
)

# Only matches with a regular result and a recorded winner give the side
# batting second a clear win or loss.
decided <- ipl$result %in% "normal" &
  !is.na(ipl$winner) &
  !is.na(batting_second)
chasing_won <- ipl$winner[decided] == batting_second[decided]

# Calculate the number of wins and losses when batting second
no_of_wins <- sum(chasing_won)
no_of_loss <- sum(!chasing_won)

# Labels for the pie chart
labels <- c("Wins", "Loss")

# Total matches calculated for percentage computation
total <- no_of_wins + no_of_loss
if (total == 0) {
  stop("No decided matches available to compute the batting-second win rate.")
}
sizes <- c((no_of_wins / total) * 100, (no_of_loss / total) * 100)

# Define colors for the pie slices
colors <- c("red", "green")

# Create labels with percentages for the pie chart
percentage_labels <- sprintf("%s %.1f%%", labels, sizes)

# Create the pie chart
pie(sizes, labels = percentage_labels, col = colors,
    main = "Win Percentage Batting Second",
    init.angle = 90, clockwise = TRUE)

# Q8: Which teams have won the most tosses?

# Load necessary libraries
library(ggplot2)
library(dplyr)

# Prepare the data: number of toss wins per team, most frequent first
toss_winner_counts <- ipl %>%
  count(toss_winner) %>%
  arrange(desc(n))

# Horizontal bar chart, teams ordered by their number of toss wins.
# The viridis discrete scale is used because the ColorBrewer "Set1" palette
# has at most 9 colours, fewer than the number of teams, so brewer.pal()
# warned that n was too large for the palette.
ggplot(
  toss_winner_counts,
  aes(x = n, y = reorder(toss_winner, n), fill = toss_winner)
) +
  geom_col() +
  scale_fill_viridis_d() +
  labs(title = "Frequency of Toss Wins by Team",
       x = "Count of Toss Wins",
       y = "Team") +
  theme_minimal() +
  theme(legend.position = "bottom")

# Q9: Player of the Match awards relative to the number of matches

# Summarise, per player, how many awards came in matches that one of the two
# teams is recorded as winning.
# NOTE(review): the data has no player-to-team column. The previous code
# compared the player's name with the winning team's name, which can never
# match, so every count was 0. matches_won below counts awards earned in
# matches with a recorded winner; reading it as "the player's team won"
# assumes the award went to someone on the winning side - TODO confirm
# against squad data.
player_of_match_influence <- ipl %>%
  mutate(
    has_winner = !is.na(winner) &
      (winner == as.character(team1) | winner == as.character(team2))
  ) %>%
  group_by(player_of_match) %>%
  summarise(
    matches_won = sum(has_winner),
    total_awards = n(),
    win_percentage = matches_won / total_awards * 100
  ) %>%
  # Consider only players with more than 10 awards for a more robust analysis
  filter(total_awards > 10) %>%
  arrange(desc(win_percentage), desc(total_awards))

# View the results
print(player_of_match_influence)
## # A tibble: 13 × 4
##    player_of_match matches_won total_awards win_percentage
##    <chr>                 <int>        <int>          <dbl>
##  1 AB de Villiers            0           15              0
##  2 AM Rahane                 0           12              0
##  3 CH Gayle                  0           18              0
##  4 DA Warner                 0           15              0
##  5 DR Smith                  0           11              0
##  6 G Gambhir                 0           13              0
##  7 MEK Hussey                0           12              0
##  8 MS Dhoni                  0           13              0
##  9 RG Sharma                 0           14              0
## 10 SK Raina                  0           14              0
## 11 V Kohli                   0           11              0
## 12 V Sehwag                  0           11              0
## 13 YK Pathan                 0           16              0
# Q10: Performance of teams that won by runs (the team that batted first)

# dplyr for the summary, ggplot2 for the boxplot further down
library(dplyr)
library(ggplot2)

# Keep the rows with win_by_runs > 0, then take the median win_by_runs per
# winning team and list the teams from the largest median to the smallest.
wins_by_runs <- filter(ipl, win_by_runs > 0)
median_wins_by_runs <- wins_by_runs %>%
  group_by(winner) %>%
  summarise(median_runs = median(win_by_runs)) %>%
  arrange(desc(median_runs))
## Warning: Using one column matrices in `filter()` was deprecated in dplyr 1.1.0.
## ℹ Please use one dimensional logical vectors instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# Show the per-team medians
print(median_wins_by_runs)
## # A tibble: 13 × 2
##    winner                      median_runs
##    <chr>                             <dbl>
##  1 Rajasthan Royals                  1.23 
##  2 Royal Challengers Bangalore       0.996
##  3 Delhi Daredevils                  0.975
##  4 Chennai Super Kings               0.913
##  5 Deccan Chargers                   0.724
##  6 Kolkata Knight Riders             0.724
##  7 Mumbai Indians                    0.641
##  8 Kings XI Punjab                   0.557
##  9 Rising Pune Supergiant            0.557
## 10 Rising Pune Supergiants           0.536
## 11 Pune Warriors                     0.432
## 12 Sunrisers Hyderabad               0.348
## 13 Kochi Tuskers Kerala              0.139
# Rows with win_by_runs > 0
data_won_by_runs <- filter(ipl, win_by_runs > 0)

# Horizontal boxplot: one box per winning team showing its win_by_runs values.
# The y-axis title is blanked because the team names are self-explanatory.
ggplot(data = data_won_by_runs, mapping = aes(x = win_by_runs, y = winner)) +
  labs(
    title = "Winning by Runs - Team Performance",
    x = "Win by Runs",
    y = "Winner"
  ) +
  geom_boxplot() +
  theme_minimal() +
  theme(axis.title.y = element_blank())

# Q11: Performance of teams that won by wickets (the team batting second)

# dplyr for the summary, ggplot2 for the boxplot further down
library(dplyr)
library(ggplot2)

# Keep the rows with win_by_wickets > 0, then take the median win_by_wickets
# per winning team and list the teams from the largest median to the smallest.
wins_by_wickets <- filter(ipl, win_by_wickets > 0)
median_wins_by_wickets <- wins_by_wickets %>%
  group_by(winner) %>%
  summarise(median_wickets = median(win_by_wickets, na.rm = TRUE)) %>%
  arrange(desc(median_wickets))

# Show the per-team medians
print(median_wins_by_wickets)
## # A tibble: 14 × 2
##    winner                      median_wickets
##    <chr>                                <dbl>
##  1 Kochi Tuskers Kerala                 1.21 
##  2 Delhi Daredevils                     1.06 
##  3 Kolkata Knight Riders                1.06 
##  4 Rajasthan Royals                     1.06 
##  5 Rising Pune Supergiants              1.06 
##  6 Royal Challengers Bangalore          1.06 
##  7 Sunrisers Hyderabad                  1.06 
##  8 Chennai Super Kings                  0.914
##  9 Pune Warriors                        0.914
## 10 Deccan Chargers                      0.768
## 11 Gujarat Lions                        0.768
## 12 Kings XI Punjab                      0.768
## 13 Mumbai Indians                       0.768
## 14 Rising Pune Supergiant               0.768
# Rows with win_by_wickets > 0
data_won_by_wickets <- filter(ipl, win_by_wickets > 0)

# Horizontal boxplot: one box per winning team, filled by team. The legend is
# hidden because it would repeat the y-axis labels, and the y-axis title is
# blanked; the team labels are drawn at size 7.
wickets_theme <- theme(
  axis.title.y = element_blank(),
  axis.text.y = element_text(size = 7),
  legend.position = "none"
)
ggplot(
  data = data_won_by_wickets,
  mapping = aes(x = win_by_wickets, y = winner, fill = winner)
) +
  labs(
    title = "Winning by Wickets - Team Performance",
    x = "Win by Wickets",
    y = "Winner"
  ) +
  geom_boxplot() +
  theme_minimal() +
  wickets_theme

# Q12: Is the toss winner also winning the match correlated with the decision
# to bat first? (visualised with corrplot)

# Load necessary libraries
library(dplyr)
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.3.3
## corrplot 0.92 loaded
# Two 0/1 indicators per match:
#   toss_winner_is_match_winner - 1 if the toss winner also won the match
#   choose_to_bat               - 1 if the toss winner chose to bat, 0 otherwise
# Matches without a recorded winner give NA and are dropped by
# use = "complete.obs" below.
ipl_data <- ipl %>%
  mutate(
    toss_winner_is_match_winner = as.integer(toss_winner == winner),
    choose_to_bat = as.integer(toss_decision == "bat")
  )

# Pearson correlation between the two indicators
correlation_result <- cor(
  ipl_data$toss_winner_is_match_winner,
  ipl_data$choose_to_bat,
  method = "pearson",
  use = "complete.obs"
)

# Print the correlation result
print(correlation_result)
## [1] -0.09993806
correlation_matrix <- cor(
  ipl_data[, c("toss_winner_is_match_winner", "choose_to_bat")],
  use = "complete.obs"
)

# Visualizing the correlation matrix using corrplot.
# corrplot() uses zero plot margins by default, which leaves no room for the
# title; `mar` reserves space at the top for it.
corrplot(correlation_matrix, method = "circle", type = "upper",
         title = "Correlation between Toss Outcome and Match Decisions",
         tl.col = "black", tl.cex = 0.8,
         mar = c(0, 0, 2, 0))