In this document, we are going to analyze IPL data till 2017.
We have two datasets available: “matches” and “deliveries”.
We joined both datasets, then cleaned and grouped them for analysis.

TOSS KA BOSS

# Tabulate how often each toss decision (bat / field) occurs in toss_r.
table(toss_r[["toss_decision"]])
## 
##   bat field 
##   273   363
# Bar chart of toss decisions, filled by toss-winning team, with one facet
# per city. Only the nine listed cities are kept; set membership is tested
# with %in% instead of a long chain of `city == ... |` comparisons.
toss_r %>%
  filter(
    city %in% c(
      "Mumbai", "Bangalore", "Delhi", "Chennai", "Kolkata",
      "Jaipur", "Hyderabad", "Chandigarh", "Pune"
    )
  ) %>%
  ggplot() +
  geom_bar(aes(x = toss_decision, fill = toss_winner)) +
  facet_wrap(~city)

How many times did the team that won the toss also win the match?

# Split the matches by whether the toss winner is also recorded as the match
# winner, then report the size of each split side by side.
# Inside filter(), `winner` is presumably the `winner` column of toss_r; the
# data frame created below merely shares its name -- confirm.
# NOTE(review): the two counts add up to the total number of tosses, so
# matches without a result appear to be counted as losses -- confirm.
winner <- toss_r %>%
  filter(toss_winner == winner)
match_winner_after_winning_toss <- nrow(winner)

loser <- toss_r %>%
  filter(toss_winner != winner)
loosing_match_after_wining_toss <- nrow(loser)

data.frame(match_winner_after_winning_toss, loosing_match_after_wining_toss)
##   match_winner_after_winning_toss loosing_match_after_wining_toss
## 1                             325                             311
  • The team winning the toss won 325 matches and lost 311, meaning the toss winner won the match 51% of the time. This shows a roughly 50-50 chance of winning a game, independent of the toss result.

Who won more matches, batting 1st or batting 2nd:-

# Show the two win counts (batting first vs. batting second) in one row.
data.frame(batting_1st, batting_2nd)
##   batting_1st batting_2nd
## 1         284         349
  • The team batting 2nd won 349 matches, compared with 284 for the team batting first; that is, the team batting 2nd won 55% of the time. This may be due to dew in 8 PM games, which makes bowling under lights difficult.

RUNS KI GAME

Maximum runs in a season:-

# Order the per-season batsman rows by runs scored, highest first, and
# preview the top of the ranking.
highest_runs <- by_players %>%
  arrange(desc(batsman_run))
head(highest_runs)
## # A tibble: 6 x 5
## # Groups:   season [3]
##   season batsman        batsman_run balls_played   srr
##    <int> <fct>                <int>        <int> <dbl>
## 1   2016 V Kohli                973          655  149.
## 2   2016 DA Warner              848          579  146.
## 3   2012 CH Gayle               733          472  155.
## 4   2013 MEK Hussey             733          580  126.
## 5   2013 CH Gayle               720          484  149.
## 6   2016 AB de Villiers         687          415  166.

Seasonwise highest run scorers in IPL history:-

# Preview the first six rows of max_scorer.
head(max_scorer, n = 6L)
## # A tibble: 6 x 6
##   season.x   top season.y batsman      balls_played   srr
##      <int> <int>    <int> <fct>               <int> <dbl>
## 1     2008   616     2008 SE Marsh              452  136.
## 2     2009   572     2009 ML Hayden             409  140.
## 3     2010   618     2010 SR Tendulkar          489  126.
## 4     2011   608     2011 CH Gayle              342  178.
## 5     2012   733     2012 CH Gayle              472  155.
## 6     2013   733     2013 MEK Hussey            580  126.
# Runs of each row in max_scorer against season: colour identifies the
# batsman, point size follows strike rate (srr), and each point is labelled
# with the batsman's name slightly above it.
ggplot(
  max_scorer,
  aes(x = factor(season.x), y = top, color = batsman, size = srr)
) +
  geom_count() +
  geom_text(aes(label = batsman), size = 5, nudge_y = 12) +
  xlab("season") +
  ylab("Total Runs") +
  coord_cartesian()

Now we will plot the top 10 batsmen with the highest strike rate (having played at least 100 balls):

# Plot the first ten rows of sttr: runs on x, strike rate (srrt) on y and as
# point size, with the batsman's name printed just above each point.
# Presumably sttr is already sorted by strike rate, highest first -- confirm.
ggplot(
  sttr[1:10, ],
  aes(x = run, y = srrt, color = batsman, size = srrt)
) +
  geom_count() +
  geom_text(aes(label = batsman), size = 4, nudge_y = 0.5)

Average change in batsman’s strike rate with balls faced in his IPL career:

# Strike rate (srrt) against balls faced for every row of sttr, with a
# smoothed trend line on top of the points.
ggplot(sttr, aes(x = balls, y = srrt, size = srrt)) +
  geom_point() +
  geom_smooth()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

Maximum run scorers in IPL history:

# Rank batsmen by runs and plot the first ten: balls on x, runs on y, point
# size by strike rate (srrt), labelled with the batsman's name.
# NOTE(review): `str` is presumably a data frame defined earlier that masks
# utils::str() -- confirm it is not a typo for `sttr`.
highest_runs <- str %>%
  arrange(desc(run))
ggplot(
  highest_runs[1:10, ],
  aes(x = balls, y = run, color = batsman, size = srrt)
) +
  geom_count() +
  geom_text(aes(label = batsman), size = 5, nudge_y = 15)

Runs scored by batsmen with balls played in their career

The black line represents a strike rate of 100, i.e. the line runs = balls (1 run per ball).

# Runs against balls faced, coloured by strike rate (srrt), with a smoothed
# trend. geom_abline() with no arguments draws the reference line y = x.
ggplot(sttr, aes(x = balls, y = run, color = srrt)) +
  geom_point() +
  geom_smooth() +
  geom_abline()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

  • We can clearly see that players who face more balls tend to score more runs, and at a better strike rate.

Runs scored by batsmen per innings:

# One row per batsman per innings of each match: total batsman runs and the
# number of delivery rows recorded for that batsman.
by_match <- ipl %>%
  group_by(season, match_id, inning, batsman) %>%
  summarise(
    bat_run = sum(batsman_runs),
    balls = n()
  )
head(by_match)
## # A tibble: 6 x 6
## # Groups:   season, match_id, inning [2]
##   season match_id inning batsman         bat_run balls
##    <int>    <int>  <int> <fct>             <int> <int>
## 1   2008       60      1 BB McCullum         158    77
## 2   2008       60      1 DJ Hussey            12    12
## 3   2008       60      1 Mohammad Hafeez       5     3
## 4   2008       60      1 RT Ponting           20    20
## 5   2008       60      1 SC Ganguly           10    12
## 6   2008       60      2 AA Noffke             9    12
# Individual innings scores against balls faced, coloured by innings number.
# Rows with inning > 2 are dropped (presumably super overs -- confirm).
by_match %>%
  filter(inning <= 2) %>%
  ggplot(
    aes(x = balls, y = bat_run, color = factor(inning), size = bat_run)
  ) +
  geom_point(alpha = 1 / 2) +
  geom_smooth()
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

  • The highest individual scores mostly come in the first innings, because there is less pressure in the first innings and, in the second innings, the target being chased is often too small to allow a big individual score.

  • In the second innings, players start with a better strike rate.

  • For innings of fewer than 50 balls, second-innings players have higher scores; these may be finishers.

Now let's see a comparison between the runs scored by teams batting 1st and 2nd in every over of a T20 game:

# One row per over bowled (per bowler, innings and match): runs off the bat,
# number of delivery rows in the over, and the sum of total_runs.
by_over <- ipl %>%
  group_by(season, match_id, inning, bowler, over) %>%
  summarise(
    bat_run = sum(batsman_runs),
    balls = n(),
    runs = sum(total_runs)
  )
head(by_over)
## # A tibble: 6 x 8
## # Groups:   season, match_id, inning, bowler [3]
##   season match_id inning bowler     over bat_run balls  runs
##    <int>    <int>  <int> <fct>     <int>   <int> <int> <int>
## 1   2008       60      1 AA Noffke     4      17     7    23
## 2   2008       60      1 AA Noffke     7       7     6     7
## 3   2008       60      1 AA Noffke    16       4     6     4
## 4   2008       60      1 AA Noffke    18       7     6     7
## 5   2008       60      1 CL White     15      22     7    24
## 6   2008       60      1 JH Kallis     9       4     6     4
# Distribution of runs per over, one box per over number and innings.
# cut_width(over, 1) bins the over number into width-1 intervals so that each
# over gets its own box; it overrides the plot-level x aesthetic for this
# layer. Rows with inning > 2 are dropped.
by_over %>%
  filter(inning <= 2) %>%
  ggplot(aes(x = over, y = runs, color = factor(inning))) +
  geom_boxplot(aes(x = cut_width(over, 1)))

  • The team batting first generally starts slower than the team batting second and scores fewer runs in the powerplay.

  • Both teams score approximately the same number of runs in the middle overs.

  • The team batting first scores more runs in the death overs.

Now we will compare India’s two best white-ball cricketers in the IPL through the years:

# Keep only the per-season rows of the two batsmen being compared.
rohit_kohli <- by_players %>%
  filter(batsman %in% c("RG Sharma", "V Kohli"))
head(rohit_kohli)
## # A tibble: 6 x 5
## # Groups:   season [3]
##   season batsman   batsman_run balls_played   srr
##    <int> <fct>           <int>        <int> <dbl>
## 1   2008 RG Sharma         404          276 146. 
## 2   2008 V Kohli           165          168  98.2
## 3   2009 RG Sharma         362          323 112. 
## 4   2009 V Kohli           246          225 109. 
## 5   2010 RG Sharma         404          310 130. 
## 6   2010 V Kohli           307          216 142.
# Season-by-season runs for the two batsmen: one path per batsman (colour),
# path width following the runs scored, each point labelled with its season.
# `season` is an integer column, so the x axis must be a continuous scale.
# Character limits passed to xlim() build a discrete scale whose "2008" and
# "2017" ticks do not line up with the integer season values; use numeric
# limits with one break per season instead.
ggplot(rohit_kohli, aes(y = batsman_run, x = season)) +
  geom_point() +
  scale_x_continuous(limits = c(2008, 2017), breaks = 2008:2017) +
  geom_path(aes(size = batsman_run, color = batsman)) +
  geom_text(aes(label = season), nudge_x = .2)

WHY BOWLERS ARE SO IMPORTANT

Most Expensive overs in IPL history:

# Rank overs by runs conceded and plot the first fifteen: over number on x,
# runs on y, coloured by season and labelled with the bowler's name.
most_expensive <- by_over %>%
  arrange(desc(runs))
most_expensive[1:15, ] %>%
  ggplot(aes(x = over, y = runs, color = factor(season))) +
  geom_count() +
  geom_text(aes(label = bowler), nudge_y = 0.5) +
  ylim(c(25, 40))

Now let's have a look at bowlers' economy rates (minimum 50 overs bowled):

# Career figures per bowler, built from the per-over table: number of over
# rows, delivery rows, runs, and economy as runs per over row.
# NOTE(review): every row of by_over counts as one over, and runs are the sum
# of total_runs -- confirm this matches the intended economy definition.
economy_history <- by_over %>%
  group_by(bowler) %>%
  summarise(
    overs_bowled = n(),
    balls_bowled = sum(balls),
    runs_conceded = sum(runs)
  ) %>%
  mutate(eco = runs_conceded / overs_bowled)

# Keep bowlers with at least 50 overs, most economical first.
economy_overall <- economy_history %>%
  filter(overs_bowled >= 50) %>%
  arrange(eco)
head(economy_overall)
## # A tibble: 6 x 5
##   bowler      overs_bowled balls_bowled runs_conceded   eco
##   <fct>              <int>        <int>         <int> <dbl>
## 1 SP Narine            323         1956          2085  6.46
## 2 R Ashwin             383         2359          2552  6.66
## 3 A Kumble             163          983          1089  6.68
## 4 GD McGrath            54          329           366  6.78
## 5 Rashid Khan           54          328           368  6.81
## 6 DL Vettori           131          785           894  6.82
# First fifteen rows of economy_overall: economy against overs bowled (as a
# discrete axis), coloured by economy and labelled with the bowler's name.
ggplot(
  economy_overall[1:15, ],
  aes(x = factor(overs_bowled), y = eco, color = eco)
) +
  geom_count() +
  geom_text(aes(label = bowler), nudge_y = 0.02) +
  xlab("Overs Bowled ") +
  ylab("Economy")

  • Sunil Narine has the best economy in IPL history, followed by R. Ashwin and Anil Kumble.
  • Spinners lead the list of best economies, maybe because they bowl in the middle overs.

Thank you so much for reading this analysis.