# This is the R chunk for the required packages
library(readr)
library(dplyr)
Registered S3 method overwritten by 'dplyr':
method from
print.rowwise_df
Attaching package: 㤼㸱dplyr㤼㸲
The following objects are masked from 㤼㸱package:stats㤼㸲:
filter, lag
The following objects are masked from 㤼㸱package:base㤼㸲:
intersect, setdiff, setequal, union
library(outliers)
library(outliers)
library(tidyr)
library(readxl)
library(Hmisc)
Loading required package: lattice
Loading required package: survival
Loading required package: Formula
Loading required package: ggplot2
Registered S3 methods overwritten by 'htmltools':
method from
print.html tools:rstudio
print.shiny.tag tools:rstudio
print.shiny.tag.list tools:rstudio
Registered S3 method overwritten by 'htmlwidgets':
method from
print.htmlwidget tools:rstudio
Registered S3 method overwritten by 'data.table':
method from
print.data.table
Attaching package: 㤼㸱Hmisc㤼㸲
The following objects are masked from 㤼㸱package:dplyr㤼㸲:
src, summarize
The following objects are masked from 㤼㸱package:base㤼㸲:
format.pval, units
library(dplyr)
install.packages("forecast")
WARNING: Rtools is required to build R packages but is not currently installed. Please download and install the appropriate version of Rtools before proceeding:
https://cran.rstudio.com/bin/windows/Rtools/
Installing package into 㤼㸱C:/Users/thyagu/Documents/R/win-library/3.6㤼㸲
(as 㤼㸱lib㤼㸲 is unspecified)
trying URL 'https://cran.rstudio.com/bin/windows/contrib/3.6/forecast_8.13.zip'
Content type 'application/zip' length 2366760 bytes (2.3 MB)
downloaded 2.3 MB
package ‘forecast’ successfully unpacked and MD5 sums checked
Warning in install.packages :
cannot remove prior installation of package ‘forecast’
Warning in install.packages :
problem copying C:\Users\thyagu\Documents\R\win-library\3.6\00LOCK\forecast\libs\x64\forecast.dll to C:\Users\thyagu\Documents\R\win-library\3.6\forecast\libs\x64\forecast.dll: Permission denied
Warning in install.packages :
restored ‘forecast’
The downloaded binary packages are in
C:\Users\thyagu\AppData\Local\Temp\Rtmpmu9eqA\downloaded_packages
library(forecast)
Registered S3 method overwritten by 'quantmod':
method from
as.zoo.data.frame zoo
The aim of this task is to wrangle a real world dataset using tidyR principles,using all data wrangling tasks starting from preprocessing,finding outliers,imputing missing values and to transform skewed data The data is a collection of IPL related stats from 2008-2019(IPL is a league of cricket )and our primary aim here is to perform data wrangling tasks to this dataset (URL:https://www.kaggle.com/ramjidoolla/ipl-data-set) 1) Two datasets were taken and merged as required 2)The dataset’s goal is to predict the winner but for us the main task is to wrangle the data and get ready for modelling 3)Untidying the tidy dataset in hand 4)Mutate a couple of variables from the given data (strike rate and extras per match) and focus on those variables 5)Scan and handle missing values 6)Scan for outliers 7)Tranbsform skewed data if any
Even though the URL has 6 files ,we focus only on 2 files namely matches.csv and deliveries.csv 1) Matches.csv contains every match details 2)deliveries.csv contains ball by ball details of every match from 2008-2019 Since we are focussing mostly on finding new insights like strike rate and extras per match by mutating,I have taken only these 2 files from the URL 3)Use full join since we need both files data to explore and wrangle 4)179078 records and 38 variables in total after merging
# Importing matches.csv data
match_data <- read_csv("C:/Users/thyagu/Desktop/Final ipl/matches.csv")
Parsed with column specification:
cols(
id = [32mcol_double()[39m,
Season = [31mcol_character()[39m,
city = [31mcol_character()[39m,
date = [31mcol_character()[39m,
team1 = [31mcol_character()[39m,
team2 = [31mcol_character()[39m,
toss_winner = [31mcol_character()[39m,
toss_decision = [31mcol_character()[39m,
result = [31mcol_character()[39m,
dl_applied = [32mcol_double()[39m,
winner = [31mcol_character()[39m,
win_by_runs = [32mcol_double()[39m,
win_by_wickets = [32mcol_double()[39m,
player_of_match = [31mcol_character()[39m,
venue = [31mcol_character()[39m,
umpire1 = [31mcol_character()[39m,
umpire2 = [31mcol_character()[39m,
umpire3 = [31mcol_character()[39m
)
#Reading deliveries dataset:
deliveries_data <- read_csv("C:/Users/thyagu/Desktop/Final ipl/deliveries.csv")
Parsed with column specification:
cols(
.default = col_double(),
batting_team = [31mcol_character()[39m,
bowling_team = [31mcol_character()[39m,
batsman = [31mcol_character()[39m,
non_striker = [31mcol_character()[39m,
bowler = [31mcol_character()[39m,
player_dismissed = [31mcol_character()[39m,
dismissal_kind = [31mcol_character()[39m,
fielder = [31mcol_character()[39m
)
See spec(...) for full column specifications.
#Merging datasets
#To merge datasets,the key should be the same. Here the key is match_id which is given as id in our match_data. We will rename this and then merge
names(match_data)[names(match_data) == "id"] <- "match_id"
joined_data <- full_join(match_data,deliveries_data,by = "match_id")
head(joined_data)
NA
Main task here is to get full understanding of the data and datatypes using glimpse and converting the data types wherever necessary
#getting a glimpse of merged data
glimpse(joined_data)
Observations: 179,078
Variables: 38
$ match_id [3m[38;5;246m<dbl>[39m[23m 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
$ Season [3m[38;5;246m<chr>[39m[23m "IPL-2017", "IPL-2017", "IPL-2017", "IPL-2017...
$ city [3m[38;5;246m<chr>[39m[23m "Hyderabad", "Hyderabad", "Hyderabad", "Hyder...
$ date [3m[38;5;246m<chr>[39m[23m "05-04-2017", "05-04-2017", "05-04-2017", "05...
$ team1 [3m[38;5;246m<chr>[39m[23m "Sunrisers Hyderabad", "Sunrisers Hyderabad",...
$ team2 [3m[38;5;246m<chr>[39m[23m "Royal Challengers Bangalore", "Royal Challen...
$ toss_winner [3m[38;5;246m<chr>[39m[23m "Royal Challengers Bangalore", "Royal Challen...
$ toss_decision [3m[38;5;246m<chr>[39m[23m "field", "field", "field", "field", "field", ...
$ result [3m[38;5;246m<chr>[39m[23m "normal", "normal", "normal", "normal", "norm...
$ dl_applied [3m[38;5;246m<dbl>[39m[23m 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
$ winner [3m[38;5;246m<chr>[39m[23m "Sunrisers Hyderabad", "Sunrisers Hyderabad",...
$ win_by_runs [3m[38;5;246m<dbl>[39m[23m 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 3...
$ win_by_wickets [3m[38;5;246m<dbl>[39m[23m 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
$ player_of_match [3m[38;5;246m<chr>[39m[23m "Yuvraj Singh", "Yuvraj Singh", "Yuvraj Singh...
$ venue [3m[38;5;246m<chr>[39m[23m "Rajiv Gandhi International Stadium, Uppal", ...
$ umpire1 [3m[38;5;246m<chr>[39m[23m "AY Dandekar", "AY Dandekar", "AY Dandekar", ...
$ umpire2 [3m[38;5;246m<chr>[39m[23m "NJ Llong", "NJ Llong", "NJ Llong", "NJ Llong...
$ umpire3 [3m[38;5;246m<chr>[39m[23m NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
$ inning [3m[38;5;246m<dbl>[39m[23m 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
$ batting_team [3m[38;5;246m<chr>[39m[23m "Sunrisers Hyderabad", "Sunrisers Hyderabad",...
$ bowling_team [3m[38;5;246m<chr>[39m[23m "Royal Challengers Bangalore", "Royal Challen...
$ over [3m[38;5;246m<dbl>[39m[23m 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 3, ...
$ ball [3m[38;5;246m<dbl>[39m[23m 1, 2, 3, 4, 5, 6, 7, 1, 2, 3, 4, 5, 6, 7, 1, ...
$ batsman [3m[38;5;246m<chr>[39m[23m "DA Warner", "DA Warner", "DA Warner", "DA Wa...
$ non_striker [3m[38;5;246m<chr>[39m[23m "S Dhawan", "S Dhawan", "S Dhawan", "S Dhawan...
$ bowler [3m[38;5;246m<chr>[39m[23m "TS Mills", "TS Mills", "TS Mills", "TS Mills...
$ is_super_over [3m[38;5;246m<dbl>[39m[23m 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
$ wide_runs [3m[38;5;246m<dbl>[39m[23m 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
$ bye_runs [3m[38;5;246m<dbl>[39m[23m 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
$ legbye_runs [3m[38;5;246m<dbl>[39m[23m 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...
$ noball_runs [3m[38;5;246m<dbl>[39m[23m 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...
$ penalty_runs [3m[38;5;246m<dbl>[39m[23m 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
$ batsman_runs [3m[38;5;246m<dbl>[39m[23m 0, 0, 4, 0, 0, 0, 0, 1, 4, 0, 6, 0, 0, 4, 1, ...
$ extra_runs [3m[38;5;246m<dbl>[39m[23m 0, 0, 0, 0, 2, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, ...
$ total_runs [3m[38;5;246m<dbl>[39m[23m 0, 0, 4, 0, 2, 0, 1, 1, 4, 1, 6, 0, 0, 4, 1, ...
$ player_dismissed [3m[38;5;246m<chr>[39m[23m NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "...
$ dismissal_kind [3m[38;5;246m<chr>[39m[23m NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "...
$ fielder [3m[38;5;246m<chr>[39m[23m NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "...
#Changing Data types wherever needed
#Factors:
#Factorising
joined_data$Season <- joined_data$Season %>% factor(levels = c("IPL-2008",
"IPL-2009","IPL-2010","IPL-2011","IPL-2012", "IPL-2013", "IPL-2014", "IPL-2015","IPL-2016", "IPL-2017", "IPL-2018", "IPL-2019"),labels = c("IPL-2008", "IPL-2009", "IPL-2010", "IPL-2011","IPL-2012", "IPL-2013", "IPL-2014", "IPL-2015","IPL-2016", "IPL-2017", "IPL-2018", "IPL-2019"),ordered = TRUE)
joined_data$team1 <- joined_data$team1 %>% as.factor()
joined_data$team2 <- joined_data$team2 %>% as.factor()
joined_data$toss_winner <- joined_data$toss_winner %>% as.factor()
joined_data$winner <- joined_data$winner %>% as.factor()
joined_data$toss_decision <- joined_data$toss_decision %>% as.factor()
joined_data$result <- joined_data$result %>% as.factor()
joined_data$dl_applied <- joined_data$dl_applied %>% as.factor()
joined_data$over <- joined_data$over %>% as.factor()
joined_data$is_super_over <- joined_data$is_super_over %>% as.factor()
joined_data$dismissal_kind <- joined_data$dismissal_kind %>% as.factor()
joined_data$city <- joined_data$city %>% as.factor()
#as.Date to change to Date format
joined_data$date <- as.Date(joined_data$date)
head(joined_data)
#Checking structure of data
str(joined_data)
Classes ‘spec_tbl_df’, ‘tbl_df’, ‘tbl’ and 'data.frame': 179078 obs. of 38 variables:
$ match_id : num 1 1 1 1 1 1 1 1 1 1 ...
$ Season : Ord.factor w/ 12 levels "IPL-2008"<"IPL-2009"<..: 10 10 10 10 10 10 10 10 10 10 ...
$ city : Factor w/ 32 levels "Abu Dhabi","Ahmedabad",..: 15 15 15 15 15 15 15 15 15 15 ...
$ date : Date, format: "0005-04-20" "0005-04-20" ...
$ team1 : Factor w/ 15 levels "Chennai Super Kings",..: 15 15 15 15 15 15 15 15 15 15 ...
$ team2 : Factor w/ 15 levels "Chennai Super Kings",..: 14 14 14 14 14 14 14 14 14 14 ...
$ toss_winner : Factor w/ 15 levels "Chennai Super Kings",..: 14 14 14 14 14 14 14 14 14 14 ...
$ toss_decision : Factor w/ 2 levels "bat","field": 2 2 2 2 2 2 2 2 2 2 ...
$ result : Factor w/ 3 levels "no result","normal",..: 2 2 2 2 2 2 2 2 2 2 ...
$ dl_applied : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
$ winner : Factor w/ 15 levels "Chennai Super Kings",..: 15 15 15 15 15 15 15 15 15 15 ...
$ win_by_runs : num 35 35 35 35 35 35 35 35 35 35 ...
$ win_by_wickets : num 0 0 0 0 0 0 0 0 0 0 ...
$ player_of_match : chr "Yuvraj Singh" "Yuvraj Singh" "Yuvraj Singh" "Yuvraj Singh" ...
$ venue : chr "Rajiv Gandhi International Stadium, Uppal" "Rajiv Gandhi International Stadium, Uppal" "Rajiv Gandhi International Stadium, Uppal" "Rajiv Gandhi International Stadium, Uppal" ...
$ umpire1 : chr "AY Dandekar" "AY Dandekar" "AY Dandekar" "AY Dandekar" ...
$ umpire2 : chr "NJ Llong" "NJ Llong" "NJ Llong" "NJ Llong" ...
$ umpire3 : chr NA NA NA NA ...
$ inning : num 1 1 1 1 1 1 1 1 1 1 ...
$ batting_team : chr "Sunrisers Hyderabad" "Sunrisers Hyderabad" "Sunrisers Hyderabad" "Sunrisers Hyderabad" ...
$ bowling_team : chr "Royal Challengers Bangalore" "Royal Challengers Bangalore" "Royal Challengers Bangalore" "Royal Challengers Bangalore" ...
$ over : Factor w/ 20 levels "1","2","3","4",..: 1 1 1 1 1 1 1 2 2 2 ...
$ ball : num 1 2 3 4 5 6 7 1 2 3 ...
$ batsman : chr "DA Warner" "DA Warner" "DA Warner" "DA Warner" ...
$ non_striker : chr "S Dhawan" "S Dhawan" "S Dhawan" "S Dhawan" ...
$ bowler : chr "TS Mills" "TS Mills" "TS Mills" "TS Mills" ...
$ is_super_over : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
$ wide_runs : num 0 0 0 0 2 0 0 0 0 0 ...
$ bye_runs : num 0 0 0 0 0 0 0 0 0 0 ...
$ legbye_runs : num 0 0 0 0 0 0 1 0 0 0 ...
$ noball_runs : num 0 0 0 0 0 0 0 0 0 1 ...
$ penalty_runs : num 0 0 0 0 0 0 0 0 0 0 ...
$ batsman_runs : num 0 0 4 0 0 0 0 1 4 0 ...
$ extra_runs : num 0 0 0 0 2 0 1 0 0 1 ...
$ total_runs : num 0 0 4 0 2 0 1 1 4 1 ...
$ player_dismissed: chr NA NA NA NA ...
$ dismissal_kind : Factor w/ 9 levels "bowled","caught",..: NA NA NA NA NA NA NA NA NA NA ...
$ fielder : chr NA NA NA NA ...
#```{r}
# This is the R chunk for the Tidy & Manipulate Data II
Here we check for tidyR principles .Redundant variable names, datatypes conversion,ordering of data, making sure each variable has its own row are all part of tidying. The win_by_runs and win_by_wickets columns were tidied up using gather because the team on a particular match day can win only by runs or by wickets,cant be both. So i split using gather function
joined_data <- joined_data %>% gather(win_by_runs, win_by_wickets, key = "win_by",value = "count")
head(joined_data)
Mutate function was used to create 2 new variables. 1)Strike rate calculation 2) Extras per match calculation
joined_data <- joined_data %>% group_by(batsman) %>% mutate(batsman_totalrun = sum(total_runs), strike_rate = mean(batsman_runs) * 100)
head(joined_data)
joined_data <- joined_data %>% group_by(bowling_team,Season,date) %>% mutate(extras = sum(extra_runs))
head(joined_data)
NA
Missing values were checked using colsums and replaced using mode and other respective data
colSums(is.na(joined_data))
match_id Season city date
0 0 3400 0
team1 team2 toss_winner toss_decision
0 0 0 0
result dl_applied winner player_of_match
0 0 744 744
venue umpire1 umpire2 umpire3
0 1000 1000 301424
inning batting_team bowling_team over
0 0 0 0
ball batsman non_striker bowler
0 0 0 0
is_super_over wide_runs bye_runs legbye_runs
0 0 0 0
noball_runs penalty_runs batsman_runs extra_runs
0 0 0 0
total_runs player_dismissed dismissal_kind fielder
0 340488 340488 345260
win_by count batsman_totalrun strike_rate
0 0 0 0
extras
0
#Imputing missing values
joined_data$city <-impute(joined_data$city, fun = mode)
joined_data$toss_decision <-impute(joined_data$toss_decision, fun = mode)
joined_data$dismissal_kind <-impute(joined_data$dismissal_kind, fun = mode)
joined_data$dismissal_kind <- impute(joined_data$dismissal_kind , fun = mode )
joined_data$player_of_match [is.na(joined_data$player_of_match)] <- "No"
joined_data$winner [is.na(joined_data$winner)] <- "No"
invalid factor level, NA generated
joined_data$umpire1 [is.na(joined_data$umpire1)] <- "No data"
joined_data$umpire2 [is.na(joined_data$umpire2)] <- "No data"
joined_data$umpire3 [is.na(joined_data$umpire3)] <- "No data"
joined_data$player_dismissed [is.na(joined_data$player_dismissed)] <- "No"
joined_data$fielder [is.na(joined_data$fielder)] <- "Not caught"
colSums(is.na(joined_data))
match_id Season city date
0 0 0 0
team1 team2 toss_winner toss_decision
0 0 0 0
result dl_applied winner player_of_match
0 0 744 0
venue umpire1 umpire2 umpire3
0 0 0 0
inning batting_team bowling_team over
0 0 0 0
ball batsman non_striker bowler
0 0 0 0
is_super_over wide_runs bye_runs legbye_runs
0 0 0 0
noball_runs penalty_runs batsman_runs extra_runs
0 0 0 0
total_runs player_dismissed dismissal_kind fielder
0 0 0 0
win_by count batsman_totalrun strike_rate
0 0 0 0
extras
0
#checking for special values
is.special <- function(x){
if (is.numeric(x)) (is.infinite(x) | is.nan(x))
}
sapply(joined_data, function(x) sum( is.special(x)))
match_id Season city date
0 0 0 0
team1 team2 toss_winner toss_decision
0 0 0 0
result dl_applied winner player_of_match
0 0 0 0
venue umpire1 umpire2 umpire3
0 0 0 0
inning batting_team bowling_team over
0 0 0 0
ball batsman non_striker bowler
0 0 0 0
is_super_over wide_runs bye_runs legbye_runs
0 0 0 0
noball_runs penalty_runs batsman_runs extra_runs
0 0 0 0
total_runs player_dismissed dismissal_kind fielder
0 0 0 0
win_by count batsman_totalrun strike_rate
0 0 0 0
extras
0
colSums(is.na(joined_data))
match_id Season city date
0 0 0 0
team1 team2 toss_winner toss_decision
0 0 0 0
result dl_applied winner player_of_match
0 0 744 0
venue umpire1 umpire2 umpire3
0 0 0 0
inning batting_team bowling_team over
0 0 0 0
ball batsman non_striker bowler
0 0 0 0
is_super_over wide_runs bye_runs legbye_runs
0 0 0 0
noball_runs penalty_runs batsman_runs extra_runs
0 0 0 0
total_runs player_dismissed dismissal_kind fielder
0 0 0 0
win_by count batsman_totalrun strike_rate
0 0 0 0
extras
0
##Scan II Checking for outliers for strike rate,extras using boxplot since these are our variables of focus
#boxplot for strike rate
joined_data$strike_rate %>% boxplot(main = "Boxplot for strike rate of batsman",
ylab = "Per 100 balls")
#boxplot for batsman_total run
joined_data$batsman_totalrun %>% boxplot(main = "Boxplot for total runs by batsman",
ylab = "runs by batsman in total")
#boxplot for extras bowled
joined_data$extras %>% boxplot(main = "Boxplot for extra runs given",
ylab = "extra runs in match")
head(joined_data)
#using zscores to eliminate outliers but it did not work after a glimpse of changed data. So went for capping
z.scores <- joined_data$strike_rate %>% scores(type = "z")
joined_clean1<- joined_data$strike_rate[ - which( abs(z.scores) >3 )]
#boxplot for strike rate
joined_data$strike_rate %>% boxplot(main = "Boxplot for strike rate of batsman",
ylab = "Per 100 balls")
#Capping these 2 variables alone since outliers were present
capping <- function(x){
quantiles <- quantile( x, c(.05, 0.25, 0.75, .95 ) )
x[ x < quantiles[2] - 1.5*IQR(x) ] <- quantiles[1]
x[ x > quantiles[3] + 1.5*IQR(x) ] <- quantiles[4]
x
}
#capping the outliers present for home_win_percentage and away_win_percentage
joined_data$batsman_runs <- joined_data$batsman_runs %>% capping()
joined_data$extra_runs <- joined_data$extra_runs %>% capping()
joined_data <- joined_data %>% group_by(batsman) %>% mutate(batsman_totalrun = sum(total_runs), strike_rate = mean(batsman_runs) * 100)
joined_data <- joined_data %>% group_by(bowling_team,Season,date) %>% mutate(extras = sum(extra_runs))
head(joined_data)
NA
Using all transformation techniques for strike rate and extras. Min_max turned out to be the best
hist(joined_data$strike_rate)
hist(joined_data$extras)
hist(joined_data$batsman_totalrun)
log_strike <- log10(joined_data$strike_rate)
hist(log_strike)
log_strike <- log(joined_data$strike_rate)
hist(log_strike)
sqrt_strike <- sqrt(joined_data$strike_rate)
hist(sqrt_strike)
minMaxMethod <- function(x) {
(x-min(x)) / (max(x) - min(x))
}
#transforming the variable home_away_win_ratio using min-max transformation
joined_data$strike_rate <- joined_data$strike_rate %>% minMaxMethod()
hist(joined_data$strike_rate)