library(ggplot2)
library(tidyr)
library(dplyr)
library(knitr)
library(highcharter)
library(httr)
library(psych)
# Load the 2015-2016 English Premier League results (one row per match)
# directly from the project's GitHub repository.
url <- "https://raw.githubusercontent.com/wheremagichappens/an.dy/master/DATA607/final-proj/E0.csv"
# Use TRUE rather than T: T is an ordinary variable that can be reassigned.
epl <- read.csv(url, sep = ",", header = TRUE)
# Inspect the column types and the first few rows.
str(epl)
head(epl)
# Split the matches so Leicester and the rest of the league can be compared
# like for like: Leicester at home, Leicester away, every Leicester match,
# and every match that did not involve Leicester.
lei_at_home <- epl$HomeTeam == "Leicester"
lei_away <- epl$AwayTeam == "Leicester"
epl_home_lei <- epl[lei_at_home, ]
epl_away_lei <- epl[lei_away, ]
epl_lei <- epl[lei_at_home | lei_away, ]
epl_not_lei <- epl[!lei_at_home & !lei_away, ]
The 2015-2016 season is particularly interesting since underdog Leicester City F.C. won the English Premier League title for the first time in their history. The team will be analyzed in particular detail (dividing their performances into home and away games compared to their counterparts) and will be compared to other teams to understand the particular strategies they used (e.g. did they rely more on corner kicks, or trigger other teams to receive more yellow cards by playing aggressively, etc.). Then statistical analyses such as hypothesis testing and regression will be used with particular variables to fully understand what helped Leicester win matches on average.
The goal is to understand which particular variables contributed to goal-scoring chances for Leicester.
Each case represents a match in EPL in 2015-2016. There are 380 observations in data set.
Data is submitted monthly. It is collected by Football-Data and API.
It is an observational study.
Data is collected by Football-Data and it is freely available: http://www.football-data.co.uk/italym.php. For this project, CSV file for 2015-2016 was downloaded and then uploaded into R. Second data source is from https://www.football-data.org. It provides API for major European matches between 2015 and 2016 and similar information as the above.
The response variable is number of full time goals in Home and Away games and they are numerical.
The explanatory variables are the number of corners taken (home/away), fouls committed (home/away), yellow cards received (home/away) and shots on goal (home/away), and they are numerical.
##Comparing Leicester (Home/Away) vs other teams (Home/Away)
#Leicester had higher number of corner kicks (median) than other teams when teams were playing home. For mean, they are the same.
#Leicester had a higher number of corner kicks (mean) than other teams when teams were playing away. For the median, they are the same.
#Leicester committed fewer fouls in both home and away games compared to other teams, on average, but received more yellow cards in away games.
#Leicester had less shots on target in home games, on average and in terms of median, compared to other teams when played home. Notice the max is 9 where as it is 14 for other teams which played home showing that HST for Leicester is smaller than other teams when they played home. When Leicester played away, they had both higher mean and median AST than other teams that played away.
#In summary, data shows that Leicester tended to play more aggressively than other teams in away games (higher AST, AY, AC on average) but less aggressively in home games. They took a different path; usually teams play a lot more aggressively in home games than in away games. Maybe this is the reason why they pulled off the biggest upset?
# Descriptive statistics for every match Leicester played; print only the
# corner, foul, yellow-card and shots-on-target rows.
stat_rows <- c("HC", "AC", "HF", "AF", "HY", "AY", "HST", "AST")
epl_lei_summary <- data.frame(describe(epl_lei))
epl_lei_summary[stat_rows, ]
## vars n mean sd median trimmed mad min max range
## HC 18 38 5.947368 4.006396 6.0 5.62500 4.4478 0 16 16
## AC 19 38 5.526316 2.738483 5.0 5.37500 2.9652 1 13 12
## HF 16 38 9.210526 3.669877 9.0 9.12500 2.9652 3 16 13
## AF 17 38 11.263158 3.326261 11.0 11.15625 2.9652 4 19 15
## HY 20 38 1.315789 1.141485 1.0 1.21875 1.4826 0 4 4
## AY 21 38 1.684211 1.232560 1.0 1.62500 1.4826 0 4 4
## HST 14 38 4.289474 2.025618 4.0 4.21875 2.9652 1 9 8
## AST 15 38 4.236842 2.283125 4.5 4.15625 2.2239 0 12 12
## skew kurtosis se
## HC 0.7495968 -0.05733627 0.6499233
## AC 0.5950119 0.15491122 0.4442406
## HF 0.1208187 -0.78601759 0.5953326
## AF 0.2475008 0.08478109 0.5395908
## HY 0.5507925 -0.46644403 0.1851733
## AY 0.4279195 -0.89687504 0.1999476
## HST 0.2413884 -0.63326946 0.3285987
## AST 0.7904077 1.52803031 0.3703718
# The same descriptive statistics for the matches that did not involve
# Leicester, restricted to the same eight rows.
epl_not_lei_summary <- epl_not_lei %>%
  describe() %>%
  data.frame()
epl_not_lei_summary[c("HC", "AC", "HF", "AF", "HY", "AY", "HST", "AST"), ]
## vars n mean sd median trimmed mad min max range
## HC 18 342 5.947368 3.171100 5 5.751825 2.9652 0 18 18
## AC 19 342 4.815789 2.489506 5 4.686131 2.9652 0 14 14
## HF 16 342 10.152047 3.408672 10 10.036496 2.9652 2 22 20
## AF 17 342 11.485380 3.422124 11 11.383212 2.9652 4 22 18
## HY 20 342 1.461988 1.152590 1 1.372263 1.4826 0 6 6
## AY 21 342 1.652047 1.274143 2 1.565693 1.4826 0 9 9
## HST 14 342 4.666667 2.637425 5 4.496350 2.9652 0 14 14
## AST 15 342 3.842105 2.152182 4 3.733577 1.4826 0 11 11
## skew kurtosis se
## HC 0.6812439 0.54151646 0.17147343
## AC 0.5111239 0.07461875 0.13461705
## HF 0.3363233 0.06067512 0.18431986
## AF 0.2805983 -0.13552178 0.18504725
## HY 0.6768259 0.23301126 0.06232491
## AY 0.8895367 2.33379574 0.06889779
## HST 0.6392195 0.29972495 0.14261562
## AST 0.5524674 0.19856027 0.11637668
# Full-time home goals (FTHG) and away goals (FTAG) in Leicester's matches.
goal_cols <- c("FTHG", "FTAG")
describe(epl_lei[, goal_cols])
## vars n mean sd median trimmed mad min max range skew kurtosis
## FTHG 1 38 1.39 1.08 1 1.31 1.48 0 4 4 0.58 -0.20
## FTAG 2 38 1.34 1.17 1 1.25 1.48 0 5 5 0.82 0.54
## se
## FTHG 0.18
## FTAG 0.19
# The same two columns for the matches that did not involve Leicester.
describe(epl_not_lei[, goal_cols])
## vars n mean sd median trimmed mad min max range skew kurtosis
## FTHG 1 342 1.50 1.28 1 1.38 1.48 0 6 6 0.81 0.46
## FTAG 2 342 1.19 1.15 1 1.05 1.48 0 6 6 0.93 0.70
## se
## FTHG 0.07
## FTAG 0.06
#We can say that Leicester, just like we expected from above analysis, had higher FTAG on average when it comes to comparing away games. However, note that mean of FTHG is smaller for Leicester.
#Let's visualize the Full time goal results
# Reshape to long form: the FTHG and FTAG columns become a label column
# (team_h_a) and a value column (goals); every other column is dropped.
long_cols <- c("team_h_a", "goals")
epl_lei_goal <- gather(epl_lei, key = team_h_a, value = goals, FTHG:FTAG)[long_cols]
epl_lei_not_goal <- gather(epl_not_lei, key = team_h_a, value = goals, FTHG:FTAG)[long_cols]
# Cross-tabulate label against goal count for Leicester's matches.
table(epl_lei_goal, useNA = "ifany")
## goals
## team_h_a 0 1 2 3 4 5
## FTAG 10 13 9 5 0 1
## FTHG 8 14 11 3 2 0
# The same cross-tabulation for the matches without Leicester.
table(epl_lei_not_goal, useNA = "ifany")
## goals
## team_h_a 0 1 2 3 4 5 6
## FTAG 113 112 72 32 10 2 1
## FTHG 84 107 80 47 17 4 3
# Percentage of matches at each full-time home goal count (FTHG), for
# Leicester's matches and for everyone else's, merged into one table on the
# shared "goals" column.
epl_lei_goal_h <- epl_lei_goal[epl_lei_goal$team_h_a == "FTHG", ]
epl_l_h <- setNames(
  data.frame(prop.table(table(epl_lei_goal_h$goals, useNA = "ifany")) * 100),
  c("goals", "freq.leicester")
)
epl_l_h$freq.leicester <- round(epl_l_h$freq.leicester, 2)

epl_lei_not_goal_h <- epl_lei_not_goal[epl_lei_not_goal$team_h_a == "FTHG", ]
epl_l_n_h <- setNames(
  data.frame(prop.table(table(epl_lei_not_goal_h$goals, useNA = "ifany")) * 100),
  c("goals", "freq.others")
)
epl_l_n_h$freq.others <- round(epl_l_n_h$freq.others, 2)

# all = TRUE keeps goal counts that appear on only one side (filled with NA).
a <- data.frame(merge(epl_l_h, epl_l_n_h, all = TRUE))
a
## goals freq.leicester freq.others
## 1 0 21.05 24.56
## 2 1 36.84 31.29
## 3 2 28.95 23.39
## 4 3 7.89 13.74
## 5 4 5.26 4.97
## 6 5 NA 1.17
## 7 6 NA 0.88
# Percentage of matches at each full-time away goal count (FTAG), for
# Leicester's matches and for everyone else's, merged into one table on the
# shared "goals" column.
epl_lei_goal_a <- epl_lei_goal[epl_lei_goal$team_h_a == "FTAG", ]
epl_l_a <- (prop.table(table(epl_lei_goal_a$goals, useNA = "ifany")) * 100) %>%
  data.frame() %>%
  setNames(c("goals", "freq.leicester"))
epl_l_a$freq.leicester <- round(epl_l_a$freq.leicester, 2)

epl_lei_not_goal_a <- epl_lei_not_goal[epl_lei_not_goal$team_h_a == "FTAG", ]
epl_l_n_a <- (prop.table(table(epl_lei_not_goal_a$goals, useNA = "ifany")) * 100) %>%
  data.frame() %>%
  setNames(c("goals", "freq.others"))
epl_l_n_a$freq.others <- round(epl_l_n_a$freq.others, 2)

# all = TRUE keeps goal counts that appear on only one side (filled with NA).
b <- data.frame(merge(epl_l_a, epl_l_n_a, all = TRUE))
b
## goals freq.leicester freq.others
## 1 0 26.32 33.04
## 2 1 34.21 32.75
## 3 2 23.68 21.05
## 4 3 13.16 9.36
## 5 5 2.63 0.58
## 6 4 NA 2.92
## 7 6 NA 0.29
## Leicester vs Others: percentage of matches by full-time home goals (FTHG)
# Build the column chart step by step: two y-axis definitions, one series per
# group, and the goal counts from `a` as the x-axis categories.
aa <- highchart()
aa <- hc_yAxis_multiples(aa, list(lineWidth = 3), list(lineWidth = 3))
aa <- hc_add_series(
  aa,
  name = "Leicester", data = a$freq.leicester, color = "blue", type = "column"
)
aa <- hc_add_series(
  aa,
  name = "Others", data = a$freq.others, color = "green", type = "column"
)
aa <- hc_xAxis(aa, categories = a$goals, title = list(text = "goals"))
aa
# `b$goals` is a factor, so sorting follows its level order (0 1 2 3 5 4 6)
# rather than numeric order; the output below shows the problem.
sort(b$goals, decreasing = FALSE)
## [1] 0 1 2 3 5 4 6
## Levels: 0 1 2 3 5 4 6
##Lei vs Others, Away FTAG
# Order the rows by the numeric goal count so the x-axis reads 0, 1, 2, ...
# instead of following the factor's level order. `b` itself is left untouched.
b_by_goals <- b[order(as.integer(as.character(b$goals))), ]
bb <- highchart() %>%
  hc_yAxis_multiples(
    list(lineWidth = 3),
    list(lineWidth = 3)
  ) %>%
  hc_add_series(
    name = "Leicester", data = b_by_goals$freq.leicester,
    color = "blue", type = "column"
  ) %>%
  hc_add_series(
    name = "Others", data = b_by_goals$freq.others,
    color = "green", type = "column"
  ) %>%
  hc_xAxis(
    categories = as.character(b_by_goals$goals),
    title = list(text = "goals")
  )
bb
#We can see that Leicester tend to score more when you compare teams in Away. Indeed, they are doing better than others in Away vs Away comparison.
#check if number of matches are indeed 380 from other data source
# Ask the football-data.org v1 API for the competitions of the 2015 season.
url <- "api.football-data.org/"
request <- "v1/competitions/?season=2015"
# NOTE(review): an API token is committed in this file. Prefer supplying it
# through the FOOTBALL_DATA_API_TOKEN environment variable; the literal is
# kept only as a fallback so the script behaves as before, and it should be
# rotated and removed.
api_token <- Sys.getenv(
  "FOOTBALL_DATA_API_TOKEN",
  unset = "6c904354c704496bae14af32ceeff5a8"
)
# add_headers() needs NAMED values (header name = value). The previous
# unnamed vectors did not produce "X-Auth-Token: ..." or
# "X-Response-Control: minified" headers.
headers <- c("X-Auth-Token" = api_token, "X-Response-Control" = "minified")
# Stored under its own name so the home-goal table `a` is not overwritten.
api_response <- GET(paste0(url, request), add_headers(.headers = headers))
# Surface HTTP errors (bad token, rate limit, ...) instead of parsing them.
warn_for_status(api_response)
repo <- content(api_response)
# Flatten the fifth entry of the response into a one-row data frame; in the
# recorded run that entry was the Premier League 2015/16 (id 398).
# NOTE(review): the recorded output was produced without the minified header
# taking effect, so the columns returned now may differ - confirm on re-run.
this.content.df <- do.call(
  what = "rbind",
  args = lapply(repo[5], as.data.frame)
)
this.content.df
## X_links.href
## 1 http://api.football-data.org/v1/competitions/398
## X_links.href.1
## 1 http://api.football-data.org/v1/competitions/398/teams
## X_links.href.2
## 1 http://api.football-data.org/v1/competitions/398/fixtures
## X_links.href.3 id
## 1 http://api.football-data.org/v1/competitions/398/leagueTable 398
## caption league year currentMatchday numberOfMatchdays
## 1 Premier League 2015/16 PL 2015 38 38
## numberOfTeams numberOfGames lastUpdated
## 1 20 380 2017-07-30T07:10:13Z
#The data from this API is very similar to the CSV data from football-data.co.uk. Either source can be used.
#From the analysis above, we know that Leicester tended to attack aggressively in the Away vs Away comparison. We want to know if they were attacking efficiently in away games. We know that FTHG is somewhat smaller for Leicester, but that does not mean they were doing worse than Others. In fact, we will have to figure out how "effectively" they were attacking compared to others; hence, we will have to examine the AST(HST)/FTAG(FTHG) ratio. If the ratio is generally lower for Leicester, we can say that they were doing better than Others when it comes to converting shots into actual goals. To figure this out, we will have to run multiple regressions.
###Away vs Away
# Leicester's away matches: regress full-time away goals on away corners,
# fouls, yellow cards and shots on target, then print the fit summary.
reg_l_a <- lm(formula = FTAG ~ AC + AF + AY + AST, data = epl_away_lei)
print(summary(reg_l_a))
##
## Call:
## lm(formula = FTAG ~ AC + AF + AY + AST, data = epl_away_lei)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.3124 -0.3038 -0.1364 0.5536 1.2102
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.500367 1.011674 -0.495 0.62856
## AC -0.037571 0.095860 -0.392 0.70101
## AF 0.068182 0.061348 1.111 0.28512
## AY 0.004028 0.182013 0.022 0.98266
## AST 0.336473 0.111423 3.020 0.00918 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7969 on 14 degrees of freedom
## Multiple R-squared: 0.4332, Adjusted R-squared: 0.2713
## F-statistic: 2.675 on 4 and 14 DF, p-value: 0.07578
#P-value for AST is less than 0.05, statistically significant. Coefficient is 0.336. Surprisingly, other variables are not statistically significant for FTAG (Leicester).
# Matches without Leicester: the same away-side regression, for comparison
# with Leicester's away fit.
reg_o_a <- lm(formula = FTAG ~ AC + AF + AY + AST, data = epl_not_lei)
print(summary(reg_o_a))
##
## Call:
## lm(formula = FTAG ~ AC + AF + AY + AST, data = epl_not_lei)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.1791 -0.6534 -0.1324 0.5626 3.2316
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.3689382 0.2169772 1.700 0.090 .
## AC -0.0643515 0.0215257 -2.990 0.003 **
## AF 0.0007526 0.0164621 0.046 0.964
## AY -0.0497662 0.0442732 -1.124 0.262
## AST 0.3142858 0.0249260 12.609 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.9492 on 337 degrees of freedom
## Multiple R-squared: 0.3212, Adjusted R-squared: 0.3131
## F-statistic: 39.87 on 4 and 337 DF, p-value: < 2.2e-16
#P-Value for AST and AC are less than 0.05, statistically significant. Coefficient for AC is -0.064 where as for AST, it is 0.314. Quite surprising that AC has negative relationship with FTAG (Others).
###Home vs Home
# Leicester's home matches: regress full-time home goals on home corners,
# fouls, yellow cards and shots on target, then print the fit summary.
reg_l_h <- lm(formula = FTHG ~ HC + HF + HY + HST, data = epl_home_lei)
print(summary(reg_l_h))
##
## Call:
## lm(formula = FTHG ~ HC + HF + HY + HST, data = epl_home_lei)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.55444 -0.27431 -0.06186 0.52084 1.09033
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.45059 0.67460 -0.668 0.515040
## HC 0.02579 0.06176 0.418 0.682549
## HF 0.04544 0.06190 0.734 0.474996
## HY -0.03119 0.21574 -0.145 0.887128
## HST 0.37765 0.08879 4.253 0.000803 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7706 on 14 degrees of freedom
## Multiple R-squared: 0.661, Adjusted R-squared: 0.5642
## F-statistic: 6.825 on 4 and 14 DF, p-value: 0.002894
#P-value for HST is less than 0.05, statistically significant. Coefficient is 0.378. Surprisingly, other variables are not statistically significant for FTHG (Leicester).
# Matches without Leicester: the same home-side regression, for comparison
# with Leicester's home fit.
reg_o_h <- lm(formula = FTHG ~ HC + HF + HY + HST, data = epl_not_lei)
print(summary(reg_o_h))
##
## Call:
## lm(formula = FTHG ~ HC + HF + HY + HST, data = epl_not_lei)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.4550 -0.7043 -0.0591 0.5541 3.7088
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.20629 0.22877 5.273 2.41e-07 ***
## HC -0.12950 0.01813 -7.143 5.66e-12 ***
## HF -0.02480 0.01778 -1.395 0.164
## HY -0.06699 0.05199 -1.289 0.198
## HST 0.30355 0.02136 14.211 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.001 on 337 degrees of freedom
## Multiple R-squared: 0.3938, Adjusted R-squared: 0.3866
## F-statistic: 54.73 on 4 and 337 DF, p-value: < 2.2e-16
#P-value for HST and HC are less than 0.05, statistically significant. Coefficient for HC is -0.1295 where as for HST, it is 0.304. Quite surprising that HC has negative relationship with FTHG (Others).
##shots on goal: Leicester vs Others, on average.
##Home vs Home
describe(epl_home_lei$HST)
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 19 4.68 2.31 5 4.65 2.97 1 9 8 0.21 -1.17 0.53
describe(epl_home_lei$FTHG)
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 19 1.84 1.17 2 1.82 1.48 0 4 4 0.29 -0.88 0.27
# Shots on target per goal for Leicester at home. Computed from the data
# instead of re-typing the rounded means printed above, so the value stays
# exact and follows the data if it changes.
shot_goal_ratio1 <- mean(epl_home_lei$HST, na.rm = TRUE) /
  mean(epl_home_lei$FTHG, na.rm = TRUE)
describe(epl_not_lei$HST)
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 342 4.67 2.64 5 4.5 2.97 0 14 14 0.64 0.3 0.14
describe(epl_not_lei$FTHG)
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 342 1.5 1.28 1 1.38 1.48 0 6 6 0.81 0.46 0.07
# The same ratio for the home sides in matches without Leicester.
shot_goal_ratio2 <- mean(epl_not_lei$HST, na.rm = TRUE) /
  mean(epl_not_lei$FTHG, na.rm = TRUE)
##Away vs Away
describe(epl_away_lei$AST)
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 19 4.84 1.74 5 4.88 2.97 2 7 5 -0.19 -1.36 0.4
describe(epl_away_lei$FTAG)
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 19 1.74 0.93 2 1.76 1.48 0 3 3 0.11 -1.32 0.21
# Shots on target per goal for Leicester away. Computed from the data
# instead of re-typing the rounded means printed above, so the value stays
# exact and follows the data if it changes.
shot_goal_ratio3 <- mean(epl_away_lei$AST, na.rm = TRUE) /
  mean(epl_away_lei$FTAG, na.rm = TRUE)
describe(epl_not_lei$AST)
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 342 3.84 2.15 4 3.73 1.48 0 11 11 0.55 0.2 0.12
describe(epl_not_lei$FTAG)
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 342 1.19 1.15 1 1.05 1.48 0 6 6 0.93 0.7 0.06
# The same ratio for the away sides in matches without Leicester.
shot_goal_ratio4 <- mean(epl_not_lei$AST, na.rm = TRUE) /
  mean(epl_not_lei$FTAG, na.rm = TRUE)
##Visualizing shots/goal
# Collect the four shots-per-goal ratios into a 2 x 2 table (rows: Leicester,
# Others; columns: home, away), rounded to two decimals.
home_ratio <- rbind(shot_goal_ratio1, shot_goal_ratio2)
away_ratio <- rbind(shot_goal_ratio3, shot_goal_ratio4)
cc <- round(
  data.frame(
    home = as.vector(home_ratio),
    away = as.vector(away_ratio),
    row.names = c("Leicester", "Others")
  ),
  2
)
# Column chart with one series per venue and one category per group.
ccc <- highchart()
ccc <- hc_yAxis_multiples(ccc, list(lineWidth = 3), list(lineWidth = 3))
ccc <- hc_add_series(
  ccc,
  name = "home", data = cc$home, color = "blue", type = "column"
)
ccc <- hc_add_series(
  ccc,
  name = "away", data = cc$away, color = "green", type = "column"
)
ccc <- hc_xAxis(
  ccc,
  categories = rownames(cc),
  title = list(text = "shots/goal")
)
ccc
##In conclusion, we know that for both Home and Away, Leicester had a higher coefficient for HST and AST on FTHG and FTAG, which indicates that Leicester's shots on target led to goals more often than those of Others. Indeed, Leicester were a very clinical and efficient team. Surprisingly, both HC and AC were not useful when it comes to goal scoring for Leicester. In fact, we know that Leicester had higher AC, on average, in the Away vs Away comparison, but this did not necessarily lead to goal scoring, as the P-value was larger than 0.05. Indeed, Leicester could score more goals and win matches by simply being more clinical and efficient in terms of the shots-on-goal per full-time-goal ratio.