#Load neccessary libraries
library(readxl)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.4.3
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
#level-1 data cleaning and preprocessing
#load the dataset
data<-read_excel("C:/virat kohli analysis/testcareermatchwise.xlsx")
## New names:
## • `` -> `...8`
## • `` -> `...12`
View(data)
colnames(data) <- c("Bat1", "Bat2", "Runs", "BallsFaced", "StrikeRate", "Fours", "Sixes",
"Unused", "Opposition", "Ground", "StartDate", "TestNumber")
data <- data %>% select(-Unused)
data$Opposition <- gsub("^v\\s+", "", data$Opposition)
library(lubridate)
data$StartDate <- dmy(data$StartDate)
## Warning: All formats failed to parse. No formats found.
clean_innings <- function(score) {
score <- as.character(score)
score[score %in% c("DNB", "TDNB", "absent")] <- NA
score <- gsub("\\*", "", score) # Remove not-out asterisk
return(as.numeric(score))
}
not_out_flag <- function(score) {
score <- as.character(score)
return(grepl("\\*", score))
}
data <- data %>%
mutate(
Bat1_NotOut = not_out_flag(Bat1),
Bat2_NotOut = not_out_flag(Bat2),
Bat1 = clean_innings(Bat1),
Bat2 = clean_innings(Bat2)
)
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `Bat2 = clean_innings(Bat2)`.
## Caused by warning in `clean_innings()`:
## ! NAs introduced by coercion
# Combine total runs in both innings
data <- data %>%
mutate(
TotalRuns = rowSums(cbind(Bat1, Bat2), na.rm = TRUE),
TotalNotOuts = Bat1_NotOut + Bat2_NotOut,
PlayedInnings = (!is.na(Bat1)) + (!is.na(Bat2)),
Year = year(StartDate)
)
data <- data %>%
mutate(
Runs = as.numeric(Runs),
BallsFaced = as.numeric(BallsFaced),
StrikeRate = as.numeric(StrikeRate),
Fours = as.numeric(Fours),
Sixes = as.numeric(Sixes)
)
## Warning: There were 5 warnings in `mutate()`.
## The first warning was:
## ℹ In argument: `Runs = as.numeric(Runs)`.
## Caused by warning:
## ! NAs introduced by coercion
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 4 remaining warnings.
Add placeholder for match duration
data$MatchDuration <- 5 # By default, Test match = 5 days
str(data)
## tibble [123 × 18] (S3: tbl_df/tbl/data.frame)
## $ Bat1 : num [1:123] 4 0 30 52 11 23 44 116 58 103 ...
## $ Bat2 : num [1:123] 15 27 NA 63 0 9 75 22 NA 51 ...
## $ Runs : num [1:123] 19 27 30 115 11 32 119 138 58 154 ...
## $ BallsFaced : num [1:123] 64 109 53 225 22 65 217 275 107 275 ...
## $ StrikeRate : num [1:123] 29.7 24.8 56.6 51.1 50 ...
## $ Fours : num [1:123] 3 1 2 8 1 5 15 13 8 23 ...
## $ Sixes : num [1:123] 0 1 0 1 0 0 0 1 0 1 ...
## $ Opposition : chr [1:123] "v West Indies" "v West Indies" "v West Indies" "v West Indies" ...
## $ Ground : chr [1:123] "Kingston" "Bridgetown" "Roseau" "Wankhede" ...
## $ StartDate : Date[1:123], format: NA NA ...
## $ TestNumber : chr [1:123] "Test # 1997" "Test # 1998" "Test # 1999" "Test # 2019" ...
## $ Bat1_NotOut : logi [1:123] FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ Bat2_NotOut : logi [1:123] FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ TotalRuns : num [1:123] 19 27 30 115 11 32 119 138 58 154 ...
## $ TotalNotOuts : int [1:123] 0 0 0 0 0 0 0 0 0 1 ...
## $ PlayedInnings: int [1:123] 2 2 1 2 2 2 2 2 1 2 ...
## $ Year : num [1:123] NA NA NA NA NA NA NA NA NA NA ...
## $ MatchDuration: num [1:123] 5 5 5 5 5 5 5 5 5 5 ...
head(data)
## # A tibble: 6 × 18
## Bat1 Bat2 Runs BallsFaced StrikeRate Fours Sixes Opposition Ground
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <chr> <chr>
## 1 4 15 19 64 29.7 3 0 v West Indies Kingston
## 2 0 27 27 109 24.8 1 1 v West Indies Bridgetown
## 3 30 NA 30 53 56.6 2 0 v West Indies Roseau
## 4 52 63 115 225 51.1 8 1 v West Indies Wankhede
## 5 11 0 11 22 50 1 0 v Australia Melbourne
## 6 23 9 32 65 49.2 5 0 v Australia Sydney
## # ℹ 9 more variables: StartDate <date>, TestNumber <chr>, Bat1_NotOut <lgl>,
## # Bat2_NotOut <lgl>, TotalRuns <dbl>, TotalNotOuts <int>,
## # PlayedInnings <int>, Year <dbl>, MatchDuration <dbl>
after performing all cleanig check the data type of columns and retrieve first 8 data using head function
###Level -2 stastical analysis of my dataset # 1. Overall career batting average (runs per innings)
career_average <- sum(data$TotalRuns, na.rm = TRUE) / sum(data$PlayedInnings, na.rm = TRUE)
this average may differ from real average because in real average for cricket matches consider not out inning then calculate
not_out_percentage <- mean(data$TotalNotOuts > 0) * 100
highest_score <- max(c(data$Bat1, data$Bat2), na.rm = TRUE)
this score was made by kohli against south africa
total_runs <- sum(data$TotalRuns, na.rm = TRUE)
in test matches virat has scored total 9230 runs
centuries <- sum(data$Bat1 >= 100, na.rm = TRUE) + sum(data$Bat2 >= 100, na.rm = TRUE)
virat has scroed total 30 test centuries
fifties <- sum(data$Bat1 >= 50 & data$Bat1 < 100, na.rm = TRUE) +
sum(data$Bat2 >= 50 & data$Bat2 < 100, na.rm = TRUE)
virat has scored 31 total fifites
avg_runs_year <- data %>%
group_by(Year) %>%
summarise(RunsInYear = sum(TotalRuns, na.rm = TRUE)) %>%
mutate(AveragePerYear = round(RunsInYear / n(), 2))
avg_opposition <- data %>%
group_by(Opposition) %>%
summarise(Average = round(sum(TotalRuns, na.rm = TRUE) / sum(PlayedInnings, na.rm = TRUE), 2),
Matches = n())
this table shows batting average of virat kohli against each teams
avg_ground <- data %>%
group_by(Ground) %>%
summarise(Average = round(sum(TotalRuns, na.rm = TRUE) / sum(PlayedInnings, na.rm = TRUE), 2),
Matches = n())
this table shows average of different grounds
mean_strike_rate <- mean(data$StrikeRate, na.rm = TRUE)
max_strike_rate <- max(data$StrikeRate, na.rm = TRUE)
print(mean_strike_rate)
## [1] 51.20223
print(max_strike_rate)
## [1] 105.55
virat mean strike rate is 51.20223 virat max strike rate in test is 105.55
###Level-3 visualization of my dataset #loading required libraries visualization
library(ggplot2)
library(dplyr)
library(tidyr)
#virat kohli runs against opponents
ggplot(data %>% group_by(Opposition) %>% summarise(TotalRuns = sum(TotalRuns, na.rm = TRUE)),
aes(x = reorder(Opposition, -TotalRuns), y = TotalRuns)) +
geom_col(fill = "tomato") +
coord_flip() +
labs(title = "Total Runs vs Each Opposition", x = "Opposition", y = "Runs")
#vira kohli batting average per oppositon
data %>%
group_by(Opposition) %>%
summarise(Average = sum(TotalRuns, na.rm = TRUE) / sum(PlayedInnings, na.rm = TRUE)) %>%
ggplot(aes(x = reorder(Opposition, -Average), y = Average)) +
geom_col(fill = "purple") +
coord_flip() +
labs(title = "Batting Average vs Opposition", x = "Opposition", y = "Average")
data %>%
mutate(MatchNumber = row_number()) %>%
ggplot(aes(x = MatchNumber, y = TotalRuns)) +
geom_line(color = "blue") +
geom_smooth(se = FALSE, color = "red") +
labs(title = "Match-by-Match Total Runs", x = "Match Number", y = "Runs")
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
ggplot(data, aes(x = TotalRuns)) +
geom_histogram(binwidth = 10, fill = "orange", color = "black") +
labs(title = "Runs Distribution per Match", x = "Runs", y = "Frequency")
data %>%
mutate(MatchNumber = row_number()) %>%
ggplot(aes(x = MatchNumber, y = StrikeRate)) +
geom_line(color = "darkred") +
geom_point() +
labs(title = "Strike Rate Match-by-Match", x = "Match Number", y = "Strike Rate")
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_point()`).
#virat kohli runs by ground
data %>%
group_by(Ground) %>%
summarise(Runs = sum(TotalRuns, na.rm = TRUE)) %>%
arrange(desc(Runs)) %>%
head(10) %>%
ggplot(aes(x = reorder(Ground, -Runs), y = Runs)) +
geom_col(fill = "navy") +
coord_flip() +
labs(title = "Top 10 Grounds with Most Runs", x = "Ground", y = "Runs")
data %>%
summarise(TotalFours = sum(Fours, na.rm = TRUE),
TotalSixes = sum(Sixes, na.rm = TRUE)) %>%
pivot_longer(cols = everything(), names_to = "Type", values_to = "Count") %>%
ggplot(aes(x = Type, y = Count, fill = Type)) +
geom_col() +
labs(title = "Total Fours vs Sixes", x = "Type", y = "Count")
#virat kohli bat1 vs bat2 comparison using bar plot
data <- data %>%
mutate(Match = row_number())
data_long <- data %>%
select(Match, Bat1, Bat2) %>%
pivot_longer(cols = c(Bat1, Bat2), names_to = "Innings", values_to = "Runs")
data_long$Runs <- as.numeric(data_long$Runs)
ggplot(data_long, aes(x = Match, y = Runs, color = Innings)) +
geom_line(size = 1) +
labs(title = "Innings-wise Batting Comparison", x = "Match Number", y = "Runs Scored") +
theme_minimal()
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
ggplot(data, aes(x = Match, y = TotalRuns, fill = TotalRuns)) +
geom_col() +
scale_fill_gradient2(low = "skyblue", high = "darkred", midpoint = 100) +
geom_hline(yintercept = 50, linetype = "dashed", color = "orange") +
geom_hline(yintercept = 100, linetype = "dashed", color = "green") +
labs(title = "Test Scores with 50 & 100 Milestones", x = "Match", y = "Total Runs") +
theme_minimal()
##Runs Over Time
data <- data %>%
mutate(CumulativeRuns = cumsum(TotalRuns))
ggplot(data, aes(x = Match, y = CumulativeRuns)) +
geom_line(color = "darkgreen", size = 1) +
labs(title = "Cumulative Runs Over Test Career", x = "Match Number", y = "Cumulative Runs") +
theme_minimal()
this graph shows total runs have accumulayed across his
career