Load libraries

suppressMessages(library(ggplot2))
suppressMessages(library(ggthemes))
# This library allows us to split up a dataframe and apply a function
suppressMessages(library(dplyr))
# Use the solarized theme, with a base font size of 20, for every plot below
theme_set(theme_solarized(base_size = 20))

Load data into a dataframe

# Read the CSV into a data frame; the first row supplies the column names
epldf <- read.csv(file = "EPL13-14.csv", header = TRUE)

Create a bar plot of the number of games officiated by each referee

# Bar plot with one bar per Referee value; with no y aesthetic, geom_bar()
# uses the number of rows for each referee as the bar height.
# See the Header.txt files for the full names behind the variable acronyms.
ggplot(data = epldf, mapping = aes(x = Referee)) +
  geom_bar(fill = "orange") +
  labs(x = "Referee", y = "Number of Games Officiated")

# No plot is passed, so ggsave() writes the most recent plot
ggsave(filename = "refereestats.jpg")
## Saving 22 x 14 in image

Create a scatter plot of Home Team Shots vs. Away Team Shots

# HS: Home Team Shots
# AS: Away Team Shots
# One point per row of epldf; axis ticks are placed every 2 shots, from 0 up
# to the largest value observed on each axis.
ggplot(data = epldf, mapping = aes(x = HS, y = AS)) +
  geom_point(color = "blue") +
  scale_x_continuous(breaks = seq(from = 0, to = max(epldf$HS), by = 2)) +
  scale_y_continuous(breaks = seq(from = 0, to = max(epldf$AS), by = 2)) +
  labs(x = "Home Team Shots", y = "Away Team Shots")

# No plot is passed, so ggsave() writes the most recent plot
ggsave(filename = "homeawayshotsscatter.jpg")
## Saving 7 x 5 in image

Create a scatter plot of Home Team Shots vs. Away Team Shots (with some jitter added)

# HS: Home Team Shots
# AS: Away Team Shots
# Same scatter as before, but geom_jitter() nudges the points randomly so that
# overlapping observations become visible; geom_smooth() overlays a fitted
# trend line (method chosen automatically, see the message below).
ggplot(data = epldf, mapping = aes(x = HS, y = AS)) +
  geom_jitter(color = "#254117") +
  geom_smooth() +
  scale_x_continuous(breaks = seq(from = 0, to = max(epldf$HS), by = 2)) +
  scale_y_continuous(breaks = seq(from = 0, to = max(epldf$AS), by = 2)) +
  labs(x = "Home Team Shots", y = "Away Team Shots")
## geom_smooth: method="auto" and size of largest group is <1000, so using loess. Use 'method = x' to change the smoothing method.

# No plot is passed, so ggsave() writes the most recent plot
ggsave(filename = "homeawayshotsjitter.jpg")
## Saving 7 x 5 in image
## geom_smooth: method="auto" and size of largest group is <1000, so using loess. Use 'method = x' to change the smoothing method.
# Pearson correlation test between home-team shots (HS) and away-team shots (AS).
# The estimated coefficient is negative (about -0.44), i.e. rows with more home
# shots tend to have fewer away shots. The association is moderate rather than
# strong, although the very small p-value shows it is clearly non-zero.
cor.test(epldf$HS, epldf$AS, method = "pearson")
## 
##  Pearson's product-moment correlation
## 
## data:  epldf$HS and epldf$AS
## t = -9.6058, df = 378, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.5203667 -0.3583190
## sample estimates:
##        cor 
## -0.4429534

This bar plot shows the total number of goals each team scored at home

# Goals scored at home, per team.
# stat = "identity" makes each bar use the FTHG value as its height instead of
# a row count; rows that share a HomeTeam are stacked, so the bar height is
# the sum of FTHG over that team's rows.
p1 <- ggplot(aes(x = HomeTeam, y = FTHG), data = epldf) +
  geom_bar(stat = "identity", fill = "#6C2DC7") +
  xlab("Team") +
  ylab("Goals Scored at Home")
# Pass the plot explicitly: p1 is only assigned (never printed), so the saved
# file should not depend on whichever plot ggplot2 last remembered.
ggsave('hometeamscored.jpg', plot = p1)
## Saving 7 x 5 in image

This bar plot shows the total number of goals each team conceded at home

# Goals conceded at home, per team.
# stat = "identity" makes each bar use the FTAG value as its height instead of
# a row count; rows that share a HomeTeam are stacked, so the bar height is
# the sum of FTAG over that team's rows.
p2 <- ggplot(aes(x = HomeTeam, y = FTAG), data = epldf) +
  geom_bar(stat = "identity", fill = "#6C2DC7") +
  xlab("Team") +
  ylab("Goals Conceded at Home")
# Pass the plot explicitly: p2 is only assigned (never printed), so the saved
# file should not depend on whichever plot ggplot2 last remembered.
ggsave('hometeamconceded.jpg', plot = p2)
## Saving 7 x 5 in image

This bar plot shows the total number of goals each team scored away from home

# Goals scored away from home, per team.
# stat = "identity" makes each bar use the FTAG value as its height instead of
# a row count; rows that share an AwayTeam are stacked, so the bar height is
# the sum of FTAG over that team's rows.
p3 <- ggplot(aes(x = AwayTeam, y = FTAG), data = epldf) +
  geom_bar(stat = "identity", fill = "#6C2DC7") +
  xlab("Team") +
  ylab("Goals Scored Away from Home")
# Save under its own file name: this plot was previously written to
# 'hometeamscored.jpg', which overwrote the home-goals plot saved earlier.
# The plot is passed explicitly because p3 is only assigned, never printed.
ggsave('awayteamscored.jpg', plot = p3)
## Saving 7 x 5 in image

Arrange the plots in a grid

# Draw the three bar plots on one page, stacked in a single column
# (the same layout grid.arrange() picks by default for three plots)
suppressMessages(library(gridExtra))
grid.arrange(p1, p2, p3, ncol = 1)