Comparing the performance of Club “Bayern Munchen” as a home team and as a visitor team using probability distribution for goals scored.
#install.packages("repmis")
library(repmis)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
Data has been imported from GitHub using the package “repmis”.
# Download the .rda file from the engsoccerdata GitHub repository and load it
# into the session; the call reports the name of the loaded object ("germany").
source_data("https://github.com/jalapic/engsoccerdata/blob/master/data/germany.rda?raw=True")
## Downloading data from: https://github.com/jalapic/engsoccerdata/blob/master/data/germany.rda?raw=True
## SHA-1 hash of the downloaded data file is:
## 67c0216617db354ac7a51a24362cecc9ac752984
## [1] "germany"
# Work on a plain data frame copy of the downloaded `germany` object
# (author's note: 16120 rows covering 1963-2016).
main_data <- as.data.frame(germany)
Variable Description
Two Dataframes are made by filtering imported data on home and visitor column for Bayern Munchen
# Club under study; the same name selects both its home and its away fixtures.
Home_Team <- Visitor_Team <- "Bayern Munchen"

# Matches in which the club appears in the `home` column.
Data_Club_as_Home <- main_data[main_data[["home"]] == Home_Team, ]
# Matches in which the club appears in the `visitor` column.
Data_Club_as_Visitor <- main_data[main_data[["visitor"]] == Visitor_Team, ]
Before proceeding, we need to investigate which distribution our data follows. Because our problem statement is about calculating the probability of scoring a given number of goals as home team and as visitor team, i.e. a count of events per match, we consider the Poisson distribution rather than any other distribution.
We worked on 3 cases to prove our consideration of Poisson distribution
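Case 1. Assumptions for Poisson Distribution: (i) K is the number of times an event occurs in a fixed interval (here, goals in a match) and takes values 0, 1, 2, …; (ii) the occurrence of one event does not affect the probability of another; (iii) the average rate at which events occur is constant; (iv) two events cannot occur at exactly the same instant.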
As our data meets the aforementioned assumptions, hence, K (number of goals in a match) is a Poisson random variable, and the distribution is a Poisson distribution.
Case 2. The mean of goals scored is almost same as variance of goals scored when Bayern Munchen played as Home Team and as Visitor Team, this also favours poisson distribution.
# Summary statistics (sum, mean, variance) of the club's goals per match,
# computed separately for home and away fixtures.

# Column 6 of the home fixtures (presumably `hgoal` -- confirm against the data).
Main_data_Bayern_home_goal <- Data_Club_as_Home[[6]]
home_mean <- round(mean(Main_data_Bayern_home_goal), 2)
home_var <- round(var(Main_data_Bayern_home_goal), 2)
home_sum <- round(sum(Main_data_Bayern_home_goal), 2)

# Column 7 of the away fixtures (presumably `vgoal` -- confirm against the data).
Main_data_Bayern_visitor_goal <- Data_Club_as_Visitor[[7]]
visitor_mean <- round(mean(Main_data_Bayern_visitor_goal), 2)
visitor_var <- round(var(Main_data_Bayern_visitor_goal), 2)
visitor_sum <- sum(Main_data_Bayern_visitor_goal)

# Stack each side's statistics into a 3 x 1 matrix, then bind them next to
# the statistic labels to form the display table.
stats_data_home <- rbind(home_sum, home_mean, home_var)
stats_data_visitor <- rbind(visitor_sum, visitor_mean, visitor_var)
statistic_name <- c("Sum", "Mean", "Variance")
stats_data <- as.data.frame(
  cbind(statistic_name, stats_data_home, stats_data_visitor)
)
colnames(stats_data) <- c("Statistic", "As Home", "As Visitor")
rownames(stats_data) <- seq_len(3)
stats_data
Case 3. We compare the theoretical distribution (Poisson probabilities from dpois with lambda = mean goals per match) with the empirical distribution (relative frequencies, e.g. number of matches in which 0 goals were scored / total number of matches, and so on up to 10 goals), separately for the club as home team and as visitor team.
—–> EMPIRICAL VS THEORETICAL DISTRIBUTION (AS HOME)
Empirical & Theoretical Probability Function as Home
# Empirical vs. Poisson probabilities (in %) of scoring 0..10 goals at home,
# plus the squared difference between the two for each goal count.
Home_Team_data <- Data_Club_as_Home

# Empirical: share of home matches with exactly g goals, as a percentage.
plot_df_practical_home <- data.frame(Goals = 0:10)
plot_df_practical_home[["Empirical Probability Scoring as Home"]] <- vapply(
  plot_df_practical_home$Goals,
  function(g) {
    matches_with_g <- length(Home_Team_data[Home_Team_data$hgoal == g, 6])
    round(matches_with_g / nrow(Home_Team_data) * 100, 2)
  },
  numeric(1)
)

# Theoretical: Poisson probability with lambda = mean home goals, as a percentage.
plot_df_theoretical_home <- data.frame(Goals = 0:10)
plot_df_theoretical_home[["Theoretical Probability Scoring as Home"]] <-
  round(dpois(0:10, home_mean) * 100, 2)

# Squared gap between theoretical and empirical percentages.
plot_df_theoretical_home_error <- data.frame(Goals = 0:10)
plot_df_theoretical_home_error[["Error Square"]] <-
  (plot_df_theoretical_home[[2]] - plot_df_practical_home[[2]])^2

# Side-by-side table of the three series.
df_plot_prac_theo_home <- data.frame(
  "Goals" = plot_df_practical_home[[1]],
  "Emp. Prob. Home" = plot_df_practical_home[[2]],
  "Theo. Prob. Home" = plot_df_theoretical_home[[2]],
  "Error Square" = plot_df_theoretical_home_error[[2]],
  check.names = FALSE
)
df_plot_prac_theo_home
Comparison of Empirical & Theoretical Distribution As Home on single plot
# Overlay the empirical (black) and Poisson (blue) goal distributions for the
# club's home matches on one chart.
plot(
  plot_df_practical_home$Goals, plot_df_practical_home[, 2],
  ylim = c(0, 60), ylab = "Probability (%)", xlab = "Number of Goals",
  type = "l", col = "black", lwd = 2,
  main = paste(Home_Team, " Empirical vs Theoretical Prob. Distribution (Home)")
)
# lines() adds to the existing plot: axis limits and labels come from plot()
# above and are not valid arguments here (they only trigger warnings).
lines(
  plot_df_theoretical_home$Goals, plot_df_theoretical_home[, 2],
  col = "#0066ff", lwd = 2
)
legend(
  "topright", c("Empirical Distribution", "Theoretical Distribution"),
  lty = 1, col = c("black", "#0066ff"), bty = "n", cex = .75
)
grid()
—–> EMPIRICAL VS THEORETICAL DISTRIBUTION (AS VISITOR)
Empirical & Theoretical Probability Function as Visitor
# Empirical vs. Poisson probabilities (in %) of scoring 0..10 goals as the
# visiting side, plus the squared difference for each goal count.
Visitor_Team_data <- Data_Club_as_Visitor

plot_prac_theo_visitor <- data.frame(Goals = 0:10)

# Empirical: share of away matches in which the club scored exactly g goals.
# The club's own goals as visitor are in column 7 -- the same column that
# feeds visitor_mean -- not in `hgoal`, which holds the opponent's goals here.
plot_prac_theo_visitor[["Emp. Prob. Visitor"]] <- vapply(
  plot_prac_theo_visitor$Goals,
  function(g) {
    matches_with_g <- sum(Visitor_Team_data[[7]] == g, na.rm = TRUE)
    round(matches_with_g / nrow(Visitor_Team_data) * 100, 2)
  },
  numeric(1)
)

# Theoretical: Poisson probability with lambda = mean away goals.
plot_prac_theo_visitor[["Theo. Prob. Visitor"]] <-
  round(dpois(plot_prac_theo_visitor$Goals, visitor_mean) * 100, 2)

# Squared gap between theoretical and empirical percentages.
plot_prac_theo_visitor[["Error Square"]] <-
  (plot_prac_theo_visitor[[3]] - plot_prac_theo_visitor[[2]])^2

plot_prac_theo_visitor
Comparison of Empirical & Theoretical Distribution As Visitor on single plot
# Overlay the empirical (black) and Poisson (gold) goal distributions for the
# club's away matches on one chart.
plot(
  plot_prac_theo_visitor$Goals, plot_prac_theo_visitor[, 2],
  ylim = c(0, 60), ylab = "Probability (%)", xlab = "Number of Goals",
  type = "l", col = "black", lwd = 2,
  # Visitor_Team holds the same club name as Home_Team; it is used here so
  # the title follows the variable that selected the away fixtures.
  main = paste(Visitor_Team, " Empirical vs Theoretical Prob. Distribution (Visitor)")
)
# lines() adds to the existing plot: axis limits and labels come from plot()
# above and are not valid arguments here (they only trigger warnings).
lines(
  plot_prac_theo_visitor$Goals, plot_prac_theo_visitor[, 3],
  col = "#cc9900", lwd = 2
)
legend(
  "topright", c("Empirical Distribution", "Theoretical Distribution"),
  lty = 1, col = c("black", "#cc9900"), bty = "n", cex = .75
)
grid()
Error Analysis
# Root-mean-square gap between the empirical and Poisson percentages over the
# goal counts 0..10, for home and away fixtures (column 4 = "Error Square").
Home_team_prob_Error <- round(sqrt(mean(df_plot_prac_theo_home[[4]])), 2)
Visitor_Team_prob_error <- round(sqrt(mean(plot_prac_theo_visitor[[4]])), 2)

# One-row display table holding both error figures.
Error_df <- data.frame(
  "As Home Team" = Home_team_prob_Error,
  "As Visitor Team" = Visitor_Team_prob_error,
  check.names = FALSE,
  row.names = "Error %"
)
Error_df
As all 3 cases broadly favour the Poisson distribution, we will use the Poisson distribution to interpret our problem statement.
Poisson (Theoretical) Distribution (As Home vs As Visitor)
# Compare the fitted Poisson goal distributions of the club at home and away.
Home_Team <- "Bayern Munchen"

# Despite its name, this table holds Poisson model probabilities (in %) for
# 0..10 goals, with lambda = mean goals scored at home / away respectively.
plot_df_practical <- data.frame(Goals = 0:10)
plot_df_practical[["Probability of Home Team Scoring"]] <-
  round(dpois(plot_df_practical$Goals, home_mean) * 100, 2)
plot_df_practical[["Probability of Visitor Team Scoring"]] <-
  round(dpois(plot_df_practical$Goals, visitor_mean) * 100, 2)

plot(
  plot_df_practical$Goals, plot_df_practical$`Probability of Home Team Scoring`,
  ylim = c(0, 60), ylab = "Probability (%)", xlab = "Number of Goals",
  type = "l", col = "#0066ff", lwd = 2,
  main = paste(Home_Team, " Probability Distribution as Home v/s Visitor")
)
# lines() adds to the existing plot: axis limits and labels come from plot()
# above and are not valid arguments here (they only trigger warnings).
lines(
  plot_df_practical$Goals, plot_df_practical$`Probability of Visitor Team Scoring`,
  col = "#cc9900", lwd = 2
)
legend(
  "topright", c("Probability As Home Team", "Probability As Visitor Team"),
  lty = 1, col = c("#0066ff", "#cc9900"), bty = "n", cex = .75
)
grid()
### Insights
Majorly 2 insights can be derived from the above plot,