With the growth of Data Science in the field of Sports, it is essential to study and understand the historical data of Cricket Players. This Analysis helped us dispel some of the myths in the Cricket World. The following are several Insights that were analyzed and visualized.
Below are the data sources used for this Project.
Source Data - Data is primarily sourced from a CSV file downloaded from Kaggle. It contains every Innings each Player has played in Test and ODI Cricket games.
Reference Data - Rating Information is sourced from Wikipedia and Latitude/Longitude Information from Google. The CSV files are loaded into MongoDB and housed in MongoDB collections. This data is combined with the Source data for analysis and reporting.
The traditional Data Science workflow steps of Data Ingestion, Data Transformation, Data Analysis and Data Visualization are performed.
Below are the libraries referenced.
viridis leaflet dplyr plotly hrbrthemes tidyverse dplyr mongolite data.table ggplot2 ggmap RColorBrewer knitr ggplot2
## Loading required package: viridisLite
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
## Loading required package: ggplot2
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
## NOTE: Either Arial Narrow or Roboto Condensed fonts are required to use these themes.
## Please use hrbrthemes::import_roboto_condensed() to install Roboto Condensed and
## if Arial Narrow is not on your system, please see http://bit.ly/arialnarrow
## -- Attaching packages ----------------------------------------------------------------------------------- tidyverse 1.2.1 --
## v tibble 2.1.3 v purrr 0.3.2
## v tidyr 0.8.3 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.4.0
## -- Conflicts -------------------------------------------------------------------------------------- tidyverse_conflicts() --
## x plotly::filter() masks dplyr::filter(), stats::filter()
## x dplyr::lag() masks stats::lag()
##
## Attaching package: 'data.table'
## The following object is masked from 'package:purrr':
##
## transpose
## The following objects are masked from 'package:dplyr':
##
## between, first, last
## Google's Terms of Service: https://cloud.google.com/maps-platform/terms/.
## Please cite ggmap if you use it! See citation("ggmap") for details.
##
## Attaching package: 'ggmap'
## The following object is masked from 'package:plotly':
##
## wind
# Main Inputs ----
# Each raw CSV is downloaded from the project's GitHub folder and wrapped in a
# data.table; text columns stay character instead of being turned into factors.
read_remote_csv <- function(csv_url) {
  data.table(read.csv(csv_url, sep = ",", stringsAsFactors = FALSE))
}

# Innings-level records for Test and ODI cricket.
test_input <- read_remote_csv(
  "https://raw.githubusercontent.com/jey1987/DATA607/master/Final%20Project/Test%20Player%20Innings%20Stats%20-%20All%20Teams.csv"
)
odi_input <- read_remote_csv(
  "https://raw.githubusercontent.com/jey1987/DATA607/master/Final%20Project/ODI%20Player%20Innings%20Stats%20-%20All%20Teams.csv"
)

# Batsman and bowler rating tables.
bat_Ratings <- read_remote_csv(
  "https://raw.githubusercontent.com/jey1987/DATA607/master/Final%20Project/batsmen_ratings_all091217.csv"
)
bowl_Ratings <- read_remote_csv(
  "https://raw.githubusercontent.com/jey1987/DATA607/master/Final%20Project/bowler_ratings_all091217.csv"
)

# Location table (read from latitude.csv).
location <- read_remote_csv(
  "https://raw.githubusercontent.com/jey1987/DATA607/master/Final%20Project/latitude.csv"
)

# The reader is only needed for the five loads above.
rm(read_remote_csv)
# Reference Data ----
# One handle per MongoDB collection on the local server.
mongodb_bat <- mongo(
  collection = "BatsmanRating",
  db = "Rating",
  url = "mongodb://localhost",
  verbose = FALSE,
  options = ssl_options()
)
mongodb_bowl <- mongo(
  collection = "BowlerRating",
  db = "Rating",
  url = "mongodb://localhost",
  verbose = FALSE,
  options = ssl_options()
)
mongodb_loc <- mongo(
  collection = "locationCoord",
  db = "Country",
  url = "mongodb://localhost",
  verbose = FALSE,
  options = ssl_options()
)

# Drop any collection that already holds documents so the inserts below start
# from an empty collection.
if (mongodb_bat$count() > 0) {
  mongodb_bat$drop()
}
if (mongodb_bowl$count() > 0) {
  mongodb_bowl$drop()
}
if (mongodb_loc$count() > 0) {
  mongodb_loc$drop()
}

# Load the reference tables; each call echoes its insert summary.
mongodb_bat$insert(bat_Ratings, stop_on_error = TRUE)
## List of 5
## $ nInserted : num 189233
## $ nMatched : num 0
## $ nRemoved : num 0
## $ nUpserted : num 0
## $ writeErrors: list()
mongodb_bowl$insert(bowl_Ratings, stop_on_error = TRUE)
## List of 5
## $ nInserted : num 189233
## $ nMatched : num 0
## $ nRemoved : num 0
## $ nUpserted : num 0
## $ writeErrors: list()
mongodb_loc$insert(location, stop_on_error = TRUE)
## List of 5
## $ nInserted : num 245
## $ nMatched : num 0
## $ nRemoved : num 0
## $ nUpserted : num 0
## $ writeErrors: list()

# Read the collections back, splitting rate_date on "-" into year/month/day
# for the two rating tables.
bat_ratings_ds <- mongodb_bat$find() %>%
  separate(rate_date, into = c("rate_year", "rate_mon", "rate_day"), sep = "-")
bowl_ratings_ds <- mongodb_bowl$find() %>%
  separate(rate_date, into = c("rate_year", "rate_mon", "rate_day"), sep = "-")
location_ds <- mongodb_loc$find()

# The first column of each rating table becomes the player-name column used by
# the innings data.
names(bat_ratings_ds)[1] <- "Innings.Player"
names(bowl_ratings_ds)[1] <- "Innings.Player"
# Tidy Source Data ----
# Shape one raw innings table into the subset used by the analysis.
#   innings      : raw innings table as loaded from CSV.
#   cricket_type : label written to the Cricket_type column ("Test" / "ODI").
# Returns the selected columns with character fields trimmed, NA cells set to
# 0, runs and minutes coerced to numeric, and the 100s/50s columns renamed.
tidy_innings <- function(innings, cricket_type) {
  # The first column is renamed by position.
  names(innings)[1] <- "Innings.Player"

  tidied <- innings %>%
    select(
      Innings.Player, Innings.Runs.Scored.Num, Innings.Date, Ground,
      Innings.Number, X100.s, X50.s, Innings.Minutes.Batted,
      Innings.Wickets.Taken, Opposition, Country, Innings.Overs.Bowled
    ) %>%
    mutate(Cricket_type = cricket_type) %>%
    mutate_if(is.character, str_trim)

  tidied[is.na(tidied)] <- 0

  # Entries that are not numbers become NA here; the coercion warning is
  # expected and therefore silenced.
  tidied$Innings.Runs.Scored.Num <-
    suppressWarnings(as.numeric(tidied$Innings.Runs.Scored.Num))
  tidied$Innings.Minutes.Batted <-
    suppressWarnings(as.numeric(tidied$Innings.Minutes.Batted))

  # Rename by name rather than by index: in the selected column order,
  # positions 5 and 6 are Innings.Number and X100.s, so index-based renaming
  # labelled the wrong columns and left X50.s untouched.
  names(tidied)[names(tidied) == "X100.s"] <- "Centuries"
  names(tidied)[names(tidied) == "X50.s"] <- "Half Centuries"

  tidied
}

test_input_subset <- tidy_innings(test_input, "Test")
odi_input_subset <- tidy_innings(odi_input, "ODI")

# The raw tables and the helper are no longer needed.
rm(test_input)
rm(odi_input)
rm(tidy_innings)
# Calculate Match Count, Total Runs and Batting Average ----
# Attach per-player batting aggregates to every innings row of one format.
#   mtch_cnt        : rows per player that pass the minutes-batted filter
#   total_runs      : sum of Innings.Runs.Scored.Num, ignoring NA
#   batting_average : total_runs / mtch_cnt
# Players with no row passing the filter are dropped by the inner join.
with_batting_summary <- function(innings) {
  # Rows where the comparison yields NA (missing minutes) are removed by
  # filter() together with the zero-minute rows.
  innings_batted <- innings %>%
    select(Innings.Player, Innings.Runs.Scored.Num, Innings.Minutes.Batted) %>%
    filter(Innings.Minutes.Batted != "-" & Innings.Minutes.Batted != 0) %>%
    group_by(Innings.Player) %>%
    dplyr::summarise(mtch_cnt = n())

  runs_scored <- innings %>%
    select(Innings.Player, Innings.Runs.Scored.Num) %>%
    group_by(Innings.Player) %>%
    dplyr::summarise(total_runs = sum(Innings.Runs.Scored.Num, na.rm = TRUE))

  innings %>%
    inner_join(innings_batted, by = "Innings.Player") %>%
    inner_join(runs_scored, by = "Innings.Player") %>%
    mutate(batting_average = total_runs / (mtch_cnt))
}

test_input_subset <- with_batting_summary(test_input_subset)
odi_input_subset <- with_batting_summary(odi_input_subset)
rm(with_batting_summary)

# Stack both formats and split the innings date on "/" into year/mon/day.
test_odi_subset <- rbind(test_input_subset, odi_input_subset) %>%
  separate(Innings.Date, into = c("year", "mon", "day"), sep = "/")

rm(test_input_subset)
rm(odi_input_subset)
# Tidy Reference Data ----
# Keep only the rating rows where the player is ranked number one.
bat_ratings_rank <- bat_ratings_ds %>%
  select(Innings.Player, rank, rate_year, rate_mon) %>%
  filter(rank == 1)
bowl_ratings_rank <- bowl_ratings_ds %>%
  select(Innings.Player, rank, rate_year, rate_mon) %>%
  filter(rank == 1)
rm(bat_ratings_ds)
rm(bowl_ratings_ds)

# Batsman and bowler number-one rows share one table.
player_ratings <- rbind(bat_ratings_rank, bowl_ratings_rank)
rm(bat_ratings_rank)
rm(bowl_ratings_rank)

# Join key: the first space followed by two or more letters in the player
# name (the leading space is part of the extracted key on both sides).
player_ratings$corrected_name <-
  str_extract(player_ratings$Innings.Player, " [:alpha:]{2,}")
test_odi_subset$corrected_name <-
  str_extract(test_odi_subset$Innings.Player, " [:alpha:]{2,}")

# The join runs before Opposition_corr is added, so the joined table does not
# carry that column.
bat_ratings_rank_player <-
  inner_join(player_ratings, test_odi_subset, by = "corrected_name")

# Strip only the leading "v" marker. The previous pattern "[v]" removed every
# "v" anywhere in the opposition name. The character after the marker is left
# in place, as before.
test_odi_subset$Opposition_corr <-
  str_replace_all(test_odi_subset$Opposition, "^v", "")
rm(player_ratings)
# Most Runs in Grounds ----
# Polar bar charts of total runs per ground, one for Test and one for ODI.
# NOTE(review): theme_light() is a complete theme added after theme_minimal()
# and theme(), so it appears to replace their settings. "#6632666" has seven
# hex digits and is not a valid colour. Both are kept exactly as written;
# confirm the intended look before changing them.
ggplot(data = most_runs_ground_test, mapping = aes(x = Ground, y = total_Runs_gr)) +
  geom_bar(stat = "identity", fill = alpha("Red", 0.7)) +
  coord_polar(theta = "x") +
  theme_minimal() +
  theme(
    axis.line = element_blank(),
    axis.ticks = element_blank(),
    plot.margin = unit(rep(-1, 4), "cm"),
    plot.title = element_text(hjust = 0.5, color = "#6632666")
  ) +
  theme_light() +
  labs(x = "Ground", y = "Total Runs")

ggplot(data = most_runs_ground_odi, mapping = aes(x = Ground, y = total_Runs_gr)) +
  geom_bar(stat = "identity", fill = alpha("Green", 0.7)) +
  coord_polar(theta = "x") +
  theme_minimal() +
  theme(
    axis.line = element_blank(),
    axis.ticks = element_blank(),
    plot.margin = unit(rep(-1, 4), "cm"),
    plot.title = element_text(hjust = 0.5, color = "#6632666")
  ) +
  theme_light() +
  labs(x = "Ground", y = "Total Runs")
# Lollipop chart of avg_batting per year: a grey stem from 0 up to the value
# with a blue point on top, rendered interactively through plotly.
# Each year is placed on the date axis at 1 January of that year.
# NOTE(review): "#6632666" is not a valid hex colour; it is kept unchanged
# because no plot title is set here.
p <- ggplot(
  bat_perf_bat_avg,
  aes(
    x = as.Date(paste0(bat_perf_bat_avg$year, "/01", "/01")),
    y = avg_batting
  )
) +
  geom_segment(
    aes(
      x = as.Date(paste0(bat_perf_bat_avg$year, "/01", "/01")),
      xend = as.Date(paste0(bat_perf_bat_avg$year, "/01", "/01")),
      y = 0,
      yend = avg_batting
    ),
    color = "grey"
  ) +
  geom_point(color = "blue", size = 4) +
  theme_bw() +
  theme(
    axis.line = element_blank(),
    axis.ticks = element_blank(),
    plot.margin = unit(rep(-1, 4), "cm"),
    plot.title = element_text(hjust = 0.5, color = "#6632666")
  ) +
  labs(x = "Year", y = "Batting Average")
ggplotly(p)
## NULL
# Area/line chart of total_wickets per year, rendered through plotly.
# Fixes: the chain previously broke after xlab("Year") (missing `+`), so the
# y-axis label and theme_bw() were evaluated as a separate expression and never
# reached `p`; the label also read "Minutes Batted" although y is
# total_wickets. theme_bw() now comes before theme() so the tweaks below are
# applied on top of it instead of being reset by it.
# NOTE(review): ylim(1000, 4000) removes data outside that range, which may
# include the area's zero baseline — confirm the fill renders as intended.
# NOTE(review): "#6632666" is not a valid hex colour; harmless while no title
# is set, kept unchanged.
p <- ggplot(
  bowl_perf_wickets,
  aes(
    x = as.Date(paste0(bowl_perf_wickets$year, "/01", "/01")),
    y = total_wickets
  )
) +
  geom_area(fill = "#FFFFFF", alpha = 0.5) +
  theme_bw() +
  theme(
    axis.line = element_blank(),
    axis.ticks = element_blank(),
    plot.margin = unit(rep(-1, 4), "cm"),
    plot.title = element_text(hjust = 0.5, color = "#6632666")
  ) +
  geom_line(color = "#04F339") +
  ylim(1000, 4000) +
  xlab("Year") +
  ylab("Total Wickets")
ggplotly(p)
# Player Rank Snapshot ----
# One point per (rate_year, player) row of the number-one ratings table.
# Fixes:
#   * The colour "#F004F3" was inside aes(), where a constant is treated as a
#     data value and given a scale colour; it is now set on the layer so the
#     points are drawn in that colour.
#   * theme_bw() came after theme(legend.position = "none") and, being a
#     complete theme, restored the legend; the order is swapped so the legend
#     stays hidden.
# The constant size stays mapped through scale_size() so the rendered point
# size is unchanged.
p <- ggplot(
  bat_ratings_rank_player_domin,
  aes(x = rate_year, y = Innings.Player.x, size = 10)
) +
  geom_point(alpha = 0.7, color = "#F004F3") +
  scale_size(range = c(1.4, 20), name = "Player with Most ratings") +
  theme_bw() +
  theme(legend.position = "none") +
  xlab("Year") +
  ylab("Player Name")
pp <- ggplotly(p)
pp
# World map with one point per row of sachin_scores_loc, coloured by
# total_runs.
# Fix: theme_minimal() used to follow theme(plot.title = ...) and, being a
# complete theme, discarded the title centring; the complete theme is now
# applied first and the centring on top of it.
wmap <- borders("world", colour = "black", fill = "gray")
m <- ggplot() +
  wmap +
  geom_point(
    data = sachin_scores_loc,
    aes(
      x = as.numeric(longitude),
      y = as.numeric(latitude),
      colour = total_runs,
      size = 10000
    )
  ) +
  ggtitle("Sachin's Runs Across World") +
  xlab("Longitude") +
  ylab("Latitude") +
  scale_size(range = c(0, 25)) +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5))

# Palette generator kept as defined; it is not used by the scale below.
myPalette <- colorRampPalette(rev(brewer.pal(11, "Spectral")))

# Diverging blue -> green -> red colour scale for total_runs.
sc1 <- scale_color_gradient2(
  low = "blue", mid = "green",
  high = "red", space = "Lab"
)
m <- m + sc1
m
The Player Names across the Source and Reference data were not the same; some tidying was required to get clean names out of both Data Sources.
Joining Player Rating with the actual Players dataframe