analysis on IPL 2023 auction data set.(source: kaggle, date:10-10-2023)

There are a total of 568 players, including players retained by their respective squads.

# Load the required packages
library(ggplot2)
library(tidyverse)
## Warning: package 'tidyr' was built under R version 4.3.2
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.2     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ lubridate 1.9.2     ✔ tibble    3.2.1
## ✔ purrr     1.0.1     ✔ tidyr     1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Loading the dataset 
df <- read.csv("C:/Users/abhishek/OneDrive/Desktop/ipl_2023_dataset.csv")

# Renaming the columns for easy access
colnames(df) <- c('name', 'base_price', 'type', 'cost_rs', 'cost_usd', '2022_s', '2023_s')
# Displaying the first few rows of the DataFrame
head(df)
##              name base_price         type cost_rs cost_usd 2022_s 2023_s
## 1     Shivam Mavi    4000000       BOWLER     6.0      720    KKR     GT
## 2   Joshua Little    5000000       BOWLER     4.4      528            GT
## 3 Kane Williamson   20000000      BATSMAN     2.0      240    SRH     GT
## 4     K.S. Bharat    2000000 WICKETKEEPER     1.2      144     DC     GT
## 5    Mohit Sharma    5000000       BOWLER     0.5       60            GT
## 6     Odean Smith    5000000  ALL-ROUNDER     0.5       60   PBKS     GT
# Getting the shape of the DataSet
dim(df)
## [1] 568   7
# Summary of the data
summary(df)
##      name            base_price            type              cost_rs       
##  Length:568         Length:568         Length:568         Min.   : 0.0000  
##  Class :character   Class :character   Class :character   1st Qu.: 0.0000  
##  Mode  :character   Mode  :character   Mode  :character   Median : 0.0000  
##                                                           Mean   : 0.6872  
##                                                           3rd Qu.: 0.2000  
##                                                           Max.   :18.5000  
##                                                           NA's   :325      
##     cost_usd          2022_s             2023_s         
##  Min.   :   0.00   Length:568         Length:568        
##  1st Qu.:   0.00   Class :character   Class :character  
##  Median :   0.00   Mode  :character   Mode  :character  
##  Mean   :  82.47                                        
##  3rd Qu.:  24.00                                        
##  Max.   :2220.00                                        
##  NA's   :325
# Checking the information of the DataFrame
str(df)
## 'data.frame':    568 obs. of  7 variables:
##  $ name      : chr  "Shivam Mavi" "Joshua Little" "Kane Williamson" "K.S. Bharat" ...
##  $ base_price: chr  "4000000" "5000000" "20000000" "2000000" ...
##  $ type      : chr  "BOWLER" "BOWLER" "BATSMAN" "WICKETKEEPER" ...
##  $ cost_rs   : num  6 4.4 2 1.2 0.5 0.5 0.2 0 0 0 ...
##  $ cost_usd  : int  720 528 240 144 60 60 24 0 0 0 ...
##  $ 2022_s    : chr  "KKR" "" "SRH" "DC" ...
##  $ 2023_s    : chr  "GT" "GT" "GT" "GT" ...
# Checking for missing values
colSums(is.na(df))
##       name base_price       type    cost_rs   cost_usd     2022_s     2023_s 
##          0          0          0        325        325          0          0

There are null values present in the data which implies the players who got unsold.

# Filtering rows where 2023_s is 'Unsold' and cost_rs is NA
subset_df <- df[is.na(df$cost_rs) & df$'2023_s' == 'Unsold', ]
# Getting the number of unique values in each column
sapply(df, function(x) length(unique(x)))
##       name base_price       type    cost_rs   cost_usd     2022_s     2023_s 
##        565          9          4         33         33         11         11
# Checking for duplicated rows
sum(duplicated(df))
## [1] 0

No duplicate values in this dataset.

# Counting the occurrences of each value in the base_price column
table(df$base_price)
## 
## 10000000 15000000  2000000 20000000  3000000  4000000  5000000  7500000 
##       20       11      274       19        4        7       61        9 
## Retained 
##      163

Maximum players have either been retained or have a base price of 2000000/-.

# Converting "Retained" to 0 in the base_price column
df$base_price <- ifelse(df$base_price == 'Retained', 0, as.numeric(df$base_price))
## Warning in ifelse(df$base_price == "Retained", 0, as.numeric(df$base_price)):
## NAs introduced by coercion
# Converting base_price to Crores
df$base_price <- df$base_price / 10000000
# Counting the occurrences of each value in the base_price column
table(df$base_price)
## 
##    0  0.2  0.3  0.4  0.5 0.75    1  1.5    2 
##  163  274    4    7   61    9   20   11   19
# Plotting a bar chart for base_price
barplot(table(df$base_price), xlab = 'Base Price')

This implies the above result.

# Counting the occurrences of each value in the type column
table(df$type)
## 
##  ALL-ROUNDER      BATSMAN       BOWLER WICKETKEEPER 
##          213           91          189           75

More players who were in the auction or retained are either all-rounders or bowlers.

# Plotting a histogram for cost_rs
hist(df$cost_rs, breaks = 18, col = "skyblue", border = 'blue')

More players were sold for less than 1cr rupees.

# Dropping the cost_usd column
df <- df[, -which(names(df) == 'cost_usd')]
# Checking the squads for 2022
table(df$'2022_s')
## 
##       CSK   DC   GT  KKR  LSG   MI PBKS  RCB   RR  SRH 
##  338   24   23   22   22   21   28   24   20   24   22

Apart from the players who were in the squads, the remaining players have came into this auction because either they were not picked in the 2022 auction or new entrants.

# Checking the squads for 2023
table(df$'2023_s')
## 
##    CSK     DC     GT    KKR    LSG     MI   PBKS    RCB     RR    SRH Unsold 
##     25     25     25     22     25     24     22     25     25     25    325

More players were unsold than the players in squads.

# Counting the players for 2023 and 2022
players_2023 <- sum(!is.na(df[df$'2023_s' != 'Unsold', '2023_s']))
players_2022 <- sum(df$'2022_s' != '')
paste('Year 2023 has', players_2023, 'players and year 2022 had', players_2022, 'players')
## [1] "Year 2023 has 243 players and year 2022 had 230 players"
# Analyzing base price by player type
# Total sum per player type
aggregate(df$base_price, by = list(type = df$type), sum)
##           type     x
## 1  ALL-ROUNDER 61.15
## 2      BATSMAN 33.90
## 3       BOWLER 53.75
## 4 WICKETKEEPER 21.75
# Average base price per player type
aggregate(df$base_price, by = list(type = df$type), mean)
##           type         x
## 1  ALL-ROUNDER 0.2870892
## 2      BATSMAN 0.3725275
## 3       BOWLER 0.2843915
## 4 WICKETKEEPER 0.2900000

Since more players were all-rounders or bowlers, hence they have highest cumulative base price, but have lower mean base price than batsmen and keepers.

# Standard deviation of base price per player type
aggregate(df$base_price, by = list(type = df$type),sd)
##           type         x
## 1  ALL-ROUNDER 0.4049050
## 2      BATSMAN 0.5381173
## 3       BOWLER 0.3883370
## 4 WICKETKEEPER 0.4119007
# Coefficient of variation of base price per player type
aggregate(df$base_price, by = list(type = df$type),function(x) sd(x) / mean(x))
##           type        x
## 1  ALL-ROUNDER 1.410381
## 2      BATSMAN 1.444504
## 3       BOWLER 1.365501
## 4 WICKETKEEPER 1.420347

Batsmen and keeper’s base prices are highely spread out, due to their higher standard deviation and coefficient of variation.

# Analyzing cost price by player type for year 2023
# Total sum cost per player type for year 2023
aggregate(subset_df$cost_rs, by = list(type = subset_df$type),sum)
##           type  x
## 1  ALL-ROUNDER NA
## 2      BATSMAN NA
## 3       BOWLER NA
## 4 WICKETKEEPER NA

All-rounders were in high demand in this auction, which is reflected in their sold prices.

# Average cost per player type for year 2023
aggregate(subset_df$cost_rs, by = list(type = subset_df$type),mean)
##           type  x
## 1  ALL-ROUNDER NA
## 2      BATSMAN NA
## 3       BOWLER NA
## 4 WICKETKEEPER NA

But batsmen and keepers have higher mean sold prices, due to their less strength.

# Standard deviation of cost per player type for year 2023
aggregate(subset_df$cost_rs, by = list(type = subset_df$type),sd)
##           type  x
## 1  ALL-ROUNDER NA
## 2      BATSMAN NA
## 3       BOWLER NA
## 4 WICKETKEEPER NA
# Coefficient of variation of cost per player type for year 2023
aggregate(subset_df$cost_rs, by = list(type = subset_df$type),function(x) sd(x) / mean(x))
##           type  x
## 1  ALL-ROUNDER NA
## 2      BATSMAN NA
## 3       BOWLER NA
## 4 WICKETKEEPER NA

All-rounders and keeper’s sold prices have spread out greatly, due to their higher standard deviation and coefficient of variation.

library(dplyr)
# Convert 'base_price' and 'cost_rs' to numeric
df$base_price <- as.numeric(df$base_price)
df$cost_rs <- as.numeric(df$cost_rs)

# Converting "Retained" to 0 in the base_price column
df$base_price <- ifelse(df$base_price == 'Retained', 0, as.numeric(df$base_price))
# Converting base_price to Crores
df$base_price <- df$base_price / 10000000

# Get a 5-number summary by player type
summary_by_type <- df %>%
  group_by(type) %>%
  summarise(
    count = n(),
    min_cost_rs = min(cost_rs, na.rm = TRUE),
    q1_cost_rs = quantile(cost_rs, 0.25, na.rm = TRUE),
    median_cost_rs = median(cost_rs, na.rm = TRUE),
    q3_cost_rs = quantile(cost_rs, 0.75, na.rm = TRUE),
    max_cost_rs = max(cost_rs, na.rm = TRUE)
  )

# Print the summary
print(summary_by_type)
## # A tibble: 4 × 7
##   type        count min_cost_rs q1_cost_rs median_cost_rs q3_cost_rs max_cost_rs
##   <chr>       <int>       <dbl>      <dbl>          <dbl>      <dbl>       <dbl>
## 1 ALL-ROUNDER   213           0          0              0      0.2          18.5
## 2 BATSMAN        91           0          0              0      0.2          13.2
## 3 BOWLER        189           0          0              0      0.2           6  
## 4 WICKETKEEP…    75           0          0              0      0.238        16

All-rounders were sold for a higher price and bowlers for lower prices.

# Create boxplot for 'cost_rs' by player type
df %>%
  ggplot(aes(x = type, y = cost_rs, fill = type)) +
  geom_boxplot() +
  labs(title = "Boxplot of Cost by Player Type") +
  theme_minimal()
## Warning: Removed 325 rows containing non-finite values (`stat_boxplot()`).

There are oultiers in each player type but we can’t remove these outliers because these all represent actual sold prices.