analysis on IPL 2023 auction data set.(source: kaggle, date:10-10-2023)
There are a total of 568 players, including players retained by their respective squads.
# Load the required packages
library(ggplot2)
library(tidyverse)
## Warning: package 'tidyr' was built under R version 4.3.2
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.2 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ lubridate 1.9.2 ✔ tibble 3.2.1
## ✔ purrr 1.0.1 ✔ tidyr 1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Loading the dataset
df <- read.csv("C:/Users/abhishek/OneDrive/Desktop/ipl_2023_dataset.csv")
# Renaming the columns for easy access
colnames(df) <- c('name', 'base_price', 'type', 'cost_rs', 'cost_usd', '2022_s', '2023_s')
# Displaying the first few rows of the DataFrame
head(df)
## name base_price type cost_rs cost_usd 2022_s 2023_s
## 1 Shivam Mavi 4000000 BOWLER 6.0 720 KKR GT
## 2 Joshua Little 5000000 BOWLER 4.4 528 GT
## 3 Kane Williamson 20000000 BATSMAN 2.0 240 SRH GT
## 4 K.S. Bharat 2000000 WICKETKEEPER 1.2 144 DC GT
## 5 Mohit Sharma 5000000 BOWLER 0.5 60 GT
## 6 Odean Smith 5000000 ALL-ROUNDER 0.5 60 PBKS GT
# Getting the shape of the DataSet
dim(df)
## [1] 568 7
# Summary of the data
summary(df)
## name base_price type cost_rs
## Length:568 Length:568 Length:568 Min. : 0.0000
## Class :character Class :character Class :character 1st Qu.: 0.0000
## Mode :character Mode :character Mode :character Median : 0.0000
## Mean : 0.6872
## 3rd Qu.: 0.2000
## Max. :18.5000
## NA's :325
## cost_usd 2022_s 2023_s
## Min. : 0.00 Length:568 Length:568
## 1st Qu.: 0.00 Class :character Class :character
## Median : 0.00 Mode :character Mode :character
## Mean : 82.47
## 3rd Qu.: 24.00
## Max. :2220.00
## NA's :325
# Checking the information of the DataFrame
str(df)
## 'data.frame': 568 obs. of 7 variables:
## $ name : chr "Shivam Mavi" "Joshua Little" "Kane Williamson" "K.S. Bharat" ...
## $ base_price: chr "4000000" "5000000" "20000000" "2000000" ...
## $ type : chr "BOWLER" "BOWLER" "BATSMAN" "WICKETKEEPER" ...
## $ cost_rs : num 6 4.4 2 1.2 0.5 0.5 0.2 0 0 0 ...
## $ cost_usd : int 720 528 240 144 60 60 24 0 0 0 ...
## $ 2022_s : chr "KKR" "" "SRH" "DC" ...
## $ 2023_s : chr "GT" "GT" "GT" "GT" ...
# Checking for missing values
colSums(is.na(df))
## name base_price type cost_rs cost_usd 2022_s 2023_s
## 0 0 0 325 325 0 0
There are null values present in the data which implies the players who got unsold.
# Filtering rows where 2023_s is 'Unsold' and cost_rs is NA
subset_df <- df[is.na(df$cost_rs) & df$'2023_s' == 'Unsold', ]
# Getting the number of unique values in each column
sapply(df, function(x) length(unique(x)))
## name base_price type cost_rs cost_usd 2022_s 2023_s
## 565 9 4 33 33 11 11
# Checking for duplicated rows
sum(duplicated(df))
## [1] 0
No duplicate values in this dataset.
# Counting the occurrences of each value in the base_price column
table(df$base_price)
##
## 10000000 15000000 2000000 20000000 3000000 4000000 5000000 7500000
## 20 11 274 19 4 7 61 9
## Retained
## 163
Maximum players have either been retained or have a base price of 2000000/-.
# Converting "Retained" to 0 in the base_price column
df$base_price <- ifelse(df$base_price == 'Retained', 0, as.numeric(df$base_price))
## Warning in ifelse(df$base_price == "Retained", 0, as.numeric(df$base_price)):
## NAs introduced by coercion
# Converting base_price to Crores
df$base_price <- df$base_price / 10000000
# Counting the occurrences of each value in the base_price column
table(df$base_price)
##
## 0 0.2 0.3 0.4 0.5 0.75 1 1.5 2
## 163 274 4 7 61 9 20 11 19
# Plotting a bar chart for base_price
barplot(table(df$base_price), xlab = 'Base Price')
This implies the above result.
# Counting the occurrences of each value in the type column
table(df$type)
##
## ALL-ROUNDER BATSMAN BOWLER WICKETKEEPER
## 213 91 189 75
More players who were in the auction or retained are either all-rounders or bowlers.
# Plotting a histogram for cost_rs
hist(df$cost_rs, breaks = 18, col = "skyblue", border = 'blue')
More players were sold for less than 1cr rupees.
# Dropping the cost_usd column
df <- df[, -which(names(df) == 'cost_usd')]
# Checking the squads for 2022
table(df$'2022_s')
##
## CSK DC GT KKR LSG MI PBKS RCB RR SRH
## 338 24 23 22 22 21 28 24 20 24 22
Apart from the players who were in the squads, the remaining players have came into this auction because either they were not picked in the 2022 auction or new entrants.
# Checking the squads for 2023
table(df$'2023_s')
##
## CSK DC GT KKR LSG MI PBKS RCB RR SRH Unsold
## 25 25 25 22 25 24 22 25 25 25 325
More players were unsold than the players in squads.
# Counting the players for 2023 and 2022
players_2023 <- sum(!is.na(df[df$'2023_s' != 'Unsold', '2023_s']))
players_2022 <- sum(df$'2022_s' != '')
paste('Year 2023 has', players_2023, 'players and year 2022 had', players_2022, 'players')
## [1] "Year 2023 has 243 players and year 2022 had 230 players"
# Analyzing base price by player type
# Total sum per player type
aggregate(df$base_price, by = list(type = df$type), sum)
## type x
## 1 ALL-ROUNDER 61.15
## 2 BATSMAN 33.90
## 3 BOWLER 53.75
## 4 WICKETKEEPER 21.75
# Average base price per player type
aggregate(df$base_price, by = list(type = df$type), mean)
## type x
## 1 ALL-ROUNDER 0.2870892
## 2 BATSMAN 0.3725275
## 3 BOWLER 0.2843915
## 4 WICKETKEEPER 0.2900000
Since more players were all-rounders or bowlers, hence they have highest cumulative base price, but have lower mean base price than batsmen and keepers.
# Standard deviation of base price per player type
aggregate(df$base_price, by = list(type = df$type),sd)
## type x
## 1 ALL-ROUNDER 0.4049050
## 2 BATSMAN 0.5381173
## 3 BOWLER 0.3883370
## 4 WICKETKEEPER 0.4119007
# Coefficient of variation of base price per player type
aggregate(df$base_price, by = list(type = df$type),function(x) sd(x) / mean(x))
## type x
## 1 ALL-ROUNDER 1.410381
## 2 BATSMAN 1.444504
## 3 BOWLER 1.365501
## 4 WICKETKEEPER 1.420347
Batsmen and keeper’s base prices are highely spread out, due to their higher standard deviation and coefficient of variation.
# Analyzing cost price by player type for year 2023
# Total sum cost per player type for year 2023
aggregate(subset_df$cost_rs, by = list(type = subset_df$type),sum)
## type x
## 1 ALL-ROUNDER NA
## 2 BATSMAN NA
## 3 BOWLER NA
## 4 WICKETKEEPER NA
All-rounders were in high demand in this auction, which is reflected in their sold prices.
# Average cost per player type for year 2023
aggregate(subset_df$cost_rs, by = list(type = subset_df$type),mean)
## type x
## 1 ALL-ROUNDER NA
## 2 BATSMAN NA
## 3 BOWLER NA
## 4 WICKETKEEPER NA
But batsmen and keepers have higher mean sold prices, due to their less strength.
# Standard deviation of cost per player type for year 2023
aggregate(subset_df$cost_rs, by = list(type = subset_df$type),sd)
## type x
## 1 ALL-ROUNDER NA
## 2 BATSMAN NA
## 3 BOWLER NA
## 4 WICKETKEEPER NA
# Coefficient of variation of cost per player type for year 2023
aggregate(subset_df$cost_rs, by = list(type = subset_df$type),function(x) sd(x) / mean(x))
## type x
## 1 ALL-ROUNDER NA
## 2 BATSMAN NA
## 3 BOWLER NA
## 4 WICKETKEEPER NA
All-rounders and keeper’s sold prices have spread out greatly, due to their higher standard deviation and coefficient of variation.
library(dplyr)
# Convert 'base_price' and 'cost_rs' to numeric
df$base_price <- as.numeric(df$base_price)
df$cost_rs <- as.numeric(df$cost_rs)
# Converting "Retained" to 0 in the base_price column
df$base_price <- ifelse(df$base_price == 'Retained', 0, as.numeric(df$base_price))
# Converting base_price to Crores
df$base_price <- df$base_price / 10000000
# Get a 5-number summary by player type
summary_by_type <- df %>%
group_by(type) %>%
summarise(
count = n(),
min_cost_rs = min(cost_rs, na.rm = TRUE),
q1_cost_rs = quantile(cost_rs, 0.25, na.rm = TRUE),
median_cost_rs = median(cost_rs, na.rm = TRUE),
q3_cost_rs = quantile(cost_rs, 0.75, na.rm = TRUE),
max_cost_rs = max(cost_rs, na.rm = TRUE)
)
# Print the summary
print(summary_by_type)
## # A tibble: 4 × 7
## type count min_cost_rs q1_cost_rs median_cost_rs q3_cost_rs max_cost_rs
## <chr> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 ALL-ROUNDER 213 0 0 0 0.2 18.5
## 2 BATSMAN 91 0 0 0 0.2 13.2
## 3 BOWLER 189 0 0 0 0.2 6
## 4 WICKETKEEP… 75 0 0 0 0.238 16
All-rounders were sold for a higher price and bowlers for lower prices.
# Create boxplot for 'cost_rs' by player type
df %>%
ggplot(aes(x = type, y = cost_rs, fill = type)) +
geom_boxplot() +
labs(title = "Boxplot of Cost by Player Type") +
theme_minimal()
## Warning: Removed 325 rows containing non-finite values (`stat_boxplot()`).
There are oultiers in each player type but we can’t remove these outliers because these all represent actual sold prices.