#chunk of cleaning my data
#read the CSV of premier league matches
matches <- read.csv("~/Desktop/PremierLeagueMatches.csv")


#trims the last 90 rows of NAs
matches <- head(matches, -90)

# convert Date properly
matches$Date <- as.Date(matches$Date, format = "%m/%d/%y")

#converts Date, Time, Scores, Predicts, Attendence to workable numeric
matches <- matches %>%
  mutate(
    #makes date and time easier to work w
    Time = hm(Time),                          
    homeScore = as.numeric(homeScore),
    awayScore = as.numeric(awayScore),
    homeXG = as.numeric(homeXG),
    awayXG = as.numeric(awayXG),
    # remove commas
    Attendance = as.numeric(str_remove(Attendance, ","))
  )



#adds season names column, helpful when comparing across time
season_names <- c("Season 1","Season 2","Season 3")  # adjust as needed
matches <- matches %>%
  mutate(Season = case_when(
    Date >= as.Date("2022-08-05") & Date <= as.Date("2023-05-28") ~ 1,
    Date >= as.Date("2023-08-11") & Date <= as.Date("2024-05-19") ~ 2,
    Date >= as.Date("2024-08-11") & Date <= as.Date("2025-05-31") ~ 3,
  ))

#adds a column for away error and home error for each team
matches <- matches %>%
  mutate(
    HError = (homeScore - homeXG),
    AError = (awayScore - awayXG)
  )


# Define relegated teams by season
relegated_list <- list(
  "Season 1" = c("Leeds United", "Leicester City", "Southampton"),
  "Season 2" = c("Burnley", "Luton Town", "Sheffield Utd"),
  "Season 3" = c("Southampton", "Leicester City", "Ipswich Town")
)

# Add columns for home and away relegation status
matches <- matches %>%
  mutate(
    HomeRelegatedS = case_when(
      (Season == 1 & Home.Team %in% relegated_list[["Season 1"]]) |
      (Season == 2 & Home.Team %in% relegated_list[["Season 2"]]) |
      (Season == 3 & Home.Team %in% relegated_list[["Season 3"]]) ~ TRUE,
      TRUE ~ FALSE
    ),
    AwayRelegatedS = case_when(
      (Season == 1 & Away.Team %in% relegated_list[["Season 1"]]) |
      (Season == 2 & Away.Team %in% relegated_list[["Season 2"]]) |
      (Season == 3 & Away.Team %in% relegated_list[["Season 3"]]) ~ TRUE,
      TRUE ~ FALSE
    )
)

#computes total Match Error
matches <- matches %>%
  mutate(
    TotalMatchError = abs(HError) + abs(AError)
  )

write.csv(matches, "~/Desktop/matches.csv", row.names = FALSE)

##Output correlation of XG v. Home Score

#Outputs correlation v. homescore  
corrH <- cor(matches$homeScore, matches$homeXG, use = "complete.obs")
corrA <- cor(matches$awayScore, matches$awayXG, use = "complete.obs")

corrH
## [1] 0.5783566
corrA
## [1] 0.6121344

# creates Team Error csv, of each team separated by their home and away avg E
# used in tableau
#computes home average error
home_avg <- matches %>%
  group_by(Team = Home.Team) %>%
  summarise(Home = mean(HError, na.rm = TRUE), .groups = "drop")
#computes away average
away_avg <- matches %>%
  group_by(Team = Away.Team) %>%
  summarise(Away = mean(AError, na.rm = TRUE), .groups = "drop")
#combines the average away and home error
team_error <- full_join(home_avg, away_avg, by = "Team") %>%
  mutate(overall_mean = (Home + Away) / 2)
#exports this to use in tableau
write.csv(team_error, "~/Desktop/teamErrorR.csv", row.names = FALSE)

Tableau Introduction Map

# heatmap
ggplot(matches, aes(x = AError, y = HError)) +
  geom_bin2d(bins = 30) +
  scale_fill_gradient(low = "darkblue", high = "darkorange", name = "Match Count") +
  labs(
    title = "Distribution of Home vs Away Prediction Errors",
    x = "Away Error",
    y = "Home Error"
  ) +
  # point at the origin: no aes() for constants
  geom_point(x = 0, y = 0, shape = 4, color = "white", size = 4, stroke = 1) +
  theme_minimal(base_size = 12)

#plot of refferees
#used plotly website for references 
plot_ly(
  data = matches,
  x = ~Referee,
  y = ~TotalMatchError,
  type = "box",
  boxpoints = "outliers",
  marker = list(color = "darkblue"),
  line = list(color = "gray30"),
  fillcolor = "lightblue"
) %>%
  layout(
    title = "Total Prediction Error by Referee",
    xaxis = list(
      #makes the angled x axis, to be more readable
      title = "Referee",
      tickangle = 65
    ),
    yaxis = list(title = "Total Error Per Match"),
    #adds white space at the bottom so that 
    margin = list(b = 150),
    font = list(size = 12)
  )
##https://plotly.com/r/axes/

#Plotly about attendence

#had to take out the NAs, one NA in attn that was breaking code
attn <- matches %>% 
  filter(!is.na(Attendance), !is.na(TotalMatchError))

#creates the linear regression line, to show lack of relationship even more
fit <- lm(TotalMatchError ~ Attendance, data = attn)

#plotly scatter of attendance v. total match error
plot_ly(attn, x = ~Attendance, y = ~TotalMatchError, type = "scatter", 
        mode = "markers", marker = list(color = "navy", opacity = 0.8, size = 6)
        ) %>%
  #adds in the fitted regression that was made above
  #have to add in attn$Attendance, because plotly cannot read it otherwise
  add_lines(x = attn$Attendance,
            y = fitted(fit),
            line = list(color = "darkorange", opacity = 1, width = 4),
            ) %>%
  layout(
    title = "Attendance vs Total Match Error",
    xaxis = list(title = "Attendance"),
    yaxis = list(title = "Total Match Error"),
    font = list(size = 12),
    showlegend = FALSE
  )
## A marker object has been specified, but markers is not in the mode
## Adding markers to the mode...

animate TO HIGHLIGHT BOTTOM 3 and TOP 3 errors

#creates barbell plot of home v. away prediction error by team 
ggplot(team_error, aes(y = reorder(Team, overall_mean))) +
  #draws center gray segment
  geom_segment(aes(x = Away, xend = Home, yend = Team),
               color = "gray70", linewidth = 1.2) +
  #draws home and away points 
  geom_point(aes(x = Home, color = "Home"), size = 3) +
  geom_point(aes(x = Away, color = "Away"), size = 3) +
  geom_vline(xintercept = 0, linetype = "dashed", color = "gray50") +
  scale_color_manual(name = "Game Type",values = c("Home" = "darkblue",
               "Away" = "lightblue")) +
  theme_minimal(base_size = 12) +
  labs(
    title = "Home vs Away xG Prediction Error by Team",
    subtitle = "Error = Goals − xG",
    x = "Average Prediction Error (Goals − xG)",
    y = "Team"
  ) +
  theme(
    plot.title = element_text(face = "bold"),
    axis.text.y = element_text(size = 8)
  )

#https://r-graph-gallery.com/web-extended-dumbbell-plot-ggplot2.html

SHINY APP FOR AWAY

#change the colors here to be 
highlight_teams <- c("Arsenal", "Manchester City", "Tottenham",
                     "Leeds United", "Crystal Palace", "Leicester City")

team_error_filtered <- team_error %>% 
  filter(Team %in% highlight_teams)

ggplot(team_error_filtered, aes(y = reorder(Team, overall_mean))) +
  geom_segment(aes(x = Away, xend = Home, yend = Team),
               color = "gray70", linewidth = 1.2) +
  geom_point(aes(x = Home, color = "Home"), size = 3) +
  geom_point(aes(x = Away, color = "Away"), size = 3) +
  geom_vline(xintercept = 0, linetype = "dashed", color = "gray50") +
  scale_color_manual(values = c("Home" = "darkblue", "Away" = "lightblue")) +
  theme_minimal(base_size = 12) +
  labs(
    title = "Home vs Away xG Prediction Error (Highlighted Teams Only)",
    subtitle = "Error = Goals − xG",
    x = "Average Prediction Error (Goals − xG)",
    y = "Team"
  ) +
  theme(
    plot.title = element_text(face = "bold"),
    axis.text.y = element_text(size = 8)
  )           

#sep out the teams by highest and lowest error (hard v. easy to predict)
top_teams <- c("Arsenal", "Manchester City", "Tottenham")
bottom_teams <- c("Leeds United", "Crystal Palace", "Leicester City")
team_order <- c(top_teams, bottom_teams)

#makes a DF of just the 6 teams above of all their home games
#to build the home side of the plot
home_df <- matches %>%
  filter(Home.Team %in% team_order) %>%
  mutate(
    Team = factor(Home.Team, levels = team_order),
    ErrorValue = HError,
    #adds a column either H or E to predict, for coloring in the actual plot  
    Group = case_when(
      Home.Team %in% top_teams ~ "Hardest to Predict",
      Home.Team %in% bottom_teams ~ "Easiest to Predict"
    )
  )

#draws the home violin plot
p_home <- ggplot(home_df, aes(x = Team, y = ErrorValue, fill = Group)) +
  geom_violin(color = "black") +
  geom_boxplot(width = 0.15, outlier.shape = 1) +
  labs(
    title = "Home v. Away xG Prediction Errors",
    x = "Team",
    y = "Home Error (Goals − xG)"
  ) +
  scale_fill_manual(values = c(
      "Hardest to Predict" = "#3964DB",  
      "Easiest to Predict" = "darkorange"
    )) +
  theme_minimal(base_size = 12) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1),
  legend.position = "none")

#makes a DF of just the 6 teams above of all their away games
#to build the home side of the plot
away_df <- matches %>%
  filter(Away.Team %in% team_order) %>%
  mutate(
    Team = factor(Away.Team, levels = team_order),
    ErrorValue = AError,
    #adds a column either H or E to predict, for coloring in the actual plot  
    Group = case_when(
      Away.Team %in% top_teams ~ "Hardest to Predict",
      Away.Team %in% bottom_teams ~ "Easiest to Predict"
    )
  )
  
#draws the away violin plot
p_away <- ggplot(away_df, aes(x = Team, y = ErrorValue, fill = Group)) +
  geom_violin(color = "black") +
  geom_boxplot(width = 0.15, outlier.shape = 1) +
  labs(
    x = "Team",
    y = "Away Error (Goals − xG)"
  ) +
    scale_fill_manual(values = c(
      "Hardest to Predict" = "#3964DB",  
      "Easiest to Predict" = "darkorange"
    )) +
  theme_minimal(base_size = 12) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

#combines both using patchwork
p_home + p_away

Their medians are quite similar, what might be causing them to be more volitile

#creates a small multiples graph of the goals v. prediction error for each team
#for HOME games 
ggplot(home_df, aes(x = homeScore, y = HError, color = Group)) +
  geom_point(size = 2, alpha = 0.8) +
  geom_smooth(method = "lm", se = FALSE, linewidth = 1) +
  scale_color_manual(
    values = c(
      "Hardest to Predict" = "darkblue",
      "Easiest to Predict" = "darkorange"
    )
  ) +
  #facet wrap makes it draw based on the home team, separating it
  facet_wrap(~ Home.Team, ncol = 3) +
  labs(
    title = "Relationship Between Goals Scored and Prediction Error (Home)",
    x = "Home Goals Scored",
    y = "Home Error",
    color = "Team Group"
  ) +
  theme_minimal(base_size = 12) +
  theme(
    panel.background = element_rect(fill = "white", color = NA),
    )
## `geom_smooth()` using formula = 'y ~ x'

Tableau Animation

```

SHINY APP 2- Interactive

Now that we know that number of goals are the impactful piece, lets understand that more via interactive shiny apps

Which teams are unpredictable

What makes matches unpredictable