Load libraries
# ggplot2 supplies the plotting functions used throughout this script
suppressMessages(library(ggplot2))
# ggthemes supplies theme_solarized(), which is set as the default theme below
suppressMessages(library(ggthemes))
# dplyr is loaded for data-frame manipulation
# NOTE(review): nothing in the visible part of this script appears to call
# dplyr -- confirm it is still needed
suppressMessages(library(dplyr))
# Make theme_solarized, with its first argument (the font size) set to 20,
# the default theme for every plot drawn afterwards
theme_set(theme_solarized(20))
Load data into a dataframe
# Read the match data; the first row of the CSV holds the column names
epldf <- read.csv(file = "EPL13-14.csv", header = TRUE)
Create a bar plot of the number of games officiated by each referee
# Column names are abbreviations; Header.txt lists what each one stands for.
# geom_bar() counts the rows for every Referee value, i.e. one bar per referee.
ggplot(data = epldf, mapping = aes(x = Referee)) +
  geom_bar(fill = "orange") +
  labs(x = "Referee", y = "Number of Games Officiated")

# No plot is named, so ggsave() writes the most recent plot
ggsave(filename = "refereestats.jpg")
## Saving 22 x 14 in image
Create a scatter plot of Home Team Shots vs. Away Team Shots
# HS = Home Team Shots, AS = Away Team Shots.
# Both axes get a tick every 2 shots, from 0 up to the largest observed value.
ggplot(data = epldf, mapping = aes(x = HS, y = AS)) +
  geom_point(color = "blue") +
  scale_x_continuous(breaks = seq(from = 0, to = max(epldf$HS), by = 2)) +
  scale_y_continuous(breaks = seq(from = 0, to = max(epldf$AS), by = 2)) +
  labs(x = "Home Team Shots", y = "Away Team Shots")

# No plot is named, so ggsave() writes the most recent plot
ggsave(filename = "homeawayshotsscatter.jpg")
## Saving 7 x 5 in image
Create a scatter plot of Home Team Shots vs. Away Team Shots (with some jitter added)
# HS = Home Team Shots, AS = Away Team Shots.
# Same axes as the plain scatter plot, but the points are jittered and a
# smoothed trend line is drawn on top of them.
ggplot(data = epldf, mapping = aes(x = HS, y = AS)) +
  geom_jitter(color = "#254117") +
  geom_smooth() +
  scale_x_continuous(breaks = seq(from = 0, to = max(epldf$HS), by = 2)) +
  scale_y_continuous(breaks = seq(from = 0, to = max(epldf$AS), by = 2)) +
  labs(x = "Home Team Shots", y = "Away Team Shots")
## geom_smooth: method="auto" and size of largest group is <1000, so using loess. Use 'method = x' to change the smoothing method.

# No plot is named, so ggsave() writes the most recent plot
ggsave(filename = "homeawayshotsjitter.jpg")
## Saving 7 x 5 in image
## geom_smooth: method="auto" and size of largest group is <1000, so using loess. Use 'method = x' to change the smoothing method.
# Pearson correlation test between home-team and away-team shots.
# The estimate printed below is negative (about -0.44): the relationship is
# statistically significant but only moderate in strength.
cor.test(x = epldf$HS, y = epldf$AS, method = "pearson")
##
## Pearson's product-moment correlation
##
## data: epldf$HS and epldf$AS
## t = -9.6058, df = 378, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.5203667 -0.3583190
## sample estimates:
## cor
## -0.4429534
This bar plot shows the total number of goals each team scored at home
# stat = "identity" takes the bar heights from the y aesthetic (FTHG) instead
# of counting rows; the per-match bars of one team are stacked, so each bar
# ends up as that team's total.
p1 <- ggplot(data = epldf, mapping = aes(x = HomeTeam, y = FTHG)) +
  geom_bar(stat = "identity", fill = "#6C2DC7") +
  labs(x = "Team", y = "Goals Scored at Home")
# Name the plot explicitly rather than relying on ggsave()'s last-plot default
ggsave(filename = "hometeamscored.jpg", plot = p1)
## Saving 7 x 5 in image
This bar plot shows the total number of goals each team conceded at home
# stat = "identity" takes the bar heights from the y aesthetic (FTAG) instead
# of counting rows; the per-match bars of one home team are stacked, so each
# bar ends up as the total that team conceded at home.
p2 <- ggplot(data = epldf, mapping = aes(x = HomeTeam, y = FTAG)) +
  geom_bar(stat = "identity", fill = "#6C2DC7") +
  labs(x = "Team", y = "Goals Conceded at Home")
# Name the plot explicitly rather than relying on ggsave()'s last-plot default
ggsave(filename = "hometeamconceded.jpg", plot = p2)
## Saving 7 x 5 in image
This bar plot shows the total number of goals each team scored away from home
# stat = "identity" takes the bar heights from the y aesthetic (FTAG) instead
# of counting rows; the per-match bars of one away team are stacked, so each
# bar ends up as the total that team scored away from home.
p3 <- ggplot(aes(x = AwayTeam, y = FTAG), data = epldf) +
  geom_bar(stat = "identity", fill = "#6C2DC7") +
  xlab("Team") +
  ylab("Goals Scored Away from Home")
# Write the away-goals plot to its own file. It was previously saved as
# 'hometeamscored.jpg', which overwrote the image produced for p1.
ggsave('awayteamscored.jpg', plot = p3)
## Saving 7 x 5 in image
Arrange the plots in a grid
# gridExtra provides grid.arrange(); load it quietly
suppressMessages(library("gridExtra"))
# Draw the three goal plots together on one page using the default layout
grid.arrange(p1, p2, p3)
