# Load libraries (they need to be installed on the first run via install.packages)
# You do not need to use these libraries, though
library(RSQLite)
library(stringr)
library(ggplot2)
library(dplyr)
library(tidyr)
library(forcats)
library(lubridate)

# Connect to the SQLite database and load the tables used in the analysis.
# dbConnect() creates a new, empty database when the file does not exist,
# which would only surface later as a confusing "no such table" error, so
# the file's presence is checked up front.
db_path <- "EuropeanSoccer.sqlite"
if (!file.exists(db_path)) {
  stop(
    "Database file not found: ", db_path,
    " (working directory: ", getwd(), ")",
    call. = FALSE
  )
}
con <- dbConnect(SQLite(), dbname = db_path)

# Table queries: read the complete Match and League tables into tibbles.
match <- tibble::as_tibble(dbGetQuery(con, "SELECT * FROM Match"))
league <- tibble::as_tibble(dbGetQuery(con, "SELECT * FROM League"))
# `con` is intentionally left open for any further queries; call
# dbDisconnect(con) once no more database access is needed.

Project Overview

The European Soccer Database contains match data from major European football leagues. The objective of this project is to explore scoring patterns, league characteristics, home advantage effects, seasonal trends, and ball possession distributions using exploratory data analysis and data visualization techniques.

The analysis focuses on identifying meaningful insights from football match data and presenting them through statistical summaries and visualizations.

Analysis 1: Goal Scoring Patterns in Major European Leagues

# Add the combined score of both teams to every match
match <- mutate(match, total_goals = home_team_goal + away_team_goal)

# Attach the league record to each match (Match.league_id -> League.id);
# a left join keeps every match row
merged_data <- left_join(match, league, by = c("league_id" = "id"))

# Keep only the matches played in the four major leagues
top_leagues <- filter(
  merged_data,
  name %in% c(
    "England Premier League",
    "Germany 1. Bundesliga",
    "Italy Serie A",
    "Spain LIGA BBVA"
  )
)

# Mean total goals per match, one row per league (missing totals ignored)
avg_goals <- summarise(
  group_by(top_leagues, name),
  avg_goals_per_game = mean(total_goals, na.rm = TRUE)
)

# Display the summary table
avg_goals
## # A tibble: 4 × 2
##   name                   avg_goals_per_game
##   <chr>                               <dbl>
## 1 England Premier League               2.71
## 2 Germany 1. Bundesliga                2.90
## 3 Italy Serie A                        2.62
## 4 Spain LIGA BBVA                      2.77
# Bar chart of the league averages; reorder() sorts the bars by their
# average so they appear from lowest to highest
ggplot(
  avg_goals,
  aes(
    x = reorder(name, avg_goals_per_game),
    y = avg_goals_per_game,
    fill = name
  )
) +
  geom_col() +
  labs(
    title = "Average Goals per Game in Top European Leagues",
    x = "League",
    y = "Average Goals per Game"
  ) +
  theme_minimal() +
  # The x axis already names each league, so the fill legend is hidden
  theme(
    plot.title = element_text(face = "bold", size = 14),
    legend.position = "none"
  )

The bar chart compares the average number of goals scored per game across the four major European football leagues. Germany’s Bundesliga had the highest average (2.90 goals per game), while Italy’s Serie A had the lowest (2.62). Bundesliga matches were therefore, on average, the highest-scoring of the four leagues, which is consistent with a more attack-oriented style of play.

# Analysis 2: Comparison of Goal Statistics Between Major and Other European Leagues

# Label every match as belonging to the four major leagues or to the rest
merged_data <- mutate(
  merged_data,
  league_group = ifelse(
    name %in% c(
      "England Premier League",
      "Germany 1. Bundesliga",
      "Italy Serie A",
      "Spain LIGA BBVA"
    ),
    "Top 4 Leagues",
    "Other Leagues"
  )
)

# Mean and spread of total goals per match within each league group
league_stats <- summarise(
  group_by(merged_data, league_group),
  average_goals = mean(total_goals, na.rm = TRUE),
  standard_deviation = sd(total_goals, na.rm = TRUE)
)

# Display the summary table
print(league_stats)
## # A tibble: 2 × 3
##   league_group  average_goals standard_deviation
##   <chr>                 <dbl>              <dbl>
## 1 Other Leagues          2.68               1.65
## 2 Top 4 Leagues          2.74               1.69
# Reshape to one row per (league group, metric) pair so that both
# statistics can be mapped to a single fill aesthetic in the plot
league_stats_long <- pivot_longer(
  league_stats,
  cols = c(average_goals, standard_deviation),
  names_to = "metric",
  values_to = "value"
)

# Grouped bar chart: one bar per metric, placed side by side per league group
p <- ggplot(
  league_stats_long,
  aes(x = league_group, y = value, fill = metric)
) +
  geom_col(position = "dodge") +
  # Value labels use the same dodge so each label sits above its own bar
  geom_text(
    aes(label = round(value, 2)),
    position = position_dodge(width = 0.9),
    vjust = -0.3
  ) +
  labs(
    title = "Comparison of Goal Statistics",
    x = "League Group",
    y = "Value"
  ) +
  theme_minimal()

print(p)

The comparison shows only small differences between the four major European leagues and the remaining leagues: the top four leagues average 2.74 goals per match (standard deviation 1.69), compared with 2.68 goals per match (standard deviation 1.65) for the other leagues. The standard deviation measures how spread out the number of goals per match is; a higher standard deviation indicates greater match-to-match variability in scoring.

Analysis 3: Investigating Home Advantage in European Football

# Stack home and away goals into one long data frame for the boxplot:
# all home-team values first, followed by all away-team values

boxplot_data <- data.frame(
  goals = c(match$home_team_goal, match$away_team_goal),
  team_type = rep(
    c("Home Team", "Away Team"),
    times = c(length(match$home_team_goal), length(match$away_team_goal))
  )
)

# Side-by-side boxplots of goals scored, one box per team type
p <- ggplot(
  boxplot_data,
  aes(x = team_type, y = goals, fill = team_type)
) +
  geom_boxplot() +
  labs(
    title = "Home vs Away Goals Distribution",
    x = "Team Type",
    y = "Goals Scored"
  ) +
  theme_minimal() +
  # The x axis already labels both groups, so the fill legend is hidden
  theme(
    plot.title = element_text(face = "bold", size = 14),
    legend.position = "none"
  )

# Render the plot
print(p)

Yes: home teams have a slightly higher median number of goals than away teams, which suggests the presence of a home advantage. The boxplot compares the distribution of goals scored by home and away teams. The visualization also highlights the spread and outliers in goal scoring.

Analysis 5: Distribution Analysis of Home Team Possession

# Home-team possession values with missing observations dropped
possession <- match %>%
  pull(home_team_possession) %>%
  na.omit()

# Density plot: smoothed estimate of the home possession distribution
p1 <- ggplot(
  data.frame(possession = possession),
  aes(x = possession)
) +
  geom_density(fill = "lightblue", alpha = 0.5) +
  labs(
    title = "Density Plot of Home Team Possession",
    x = "Home Team Possession",
    y = "Density"
  ) +
  theme_minimal()

# Render the density plot
print(p1)

# QQ plot: sample quantiles of possession against the theoretical quantiles,
# with a red reference line to judge departures from it
p2 <- ggplot(
  data.frame(possession = possession),
  aes(sample = possession)
) +
  stat_qq() +
  stat_qq_line(color = "red") +
  labs(title = "QQ Plot of Home Team Possession") +
  theme_minimal()

# Render the QQ plot
print(p2)

The density plot and QQ plot are used to visually assess whether the home_team_possession variable follows an approximately normal distribution. The density curve shows the shape of the distribution, while the QQ plot compares the observed data with a theoretical normal distribution. Small deviations from the straight line indicate that the distribution is only approximately normal.


Conclusion

This project explored various aspects of European football match data through exploratory data analysis and visualization techniques.

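Key findings include:

- Among the four major leagues, Germany’s Bundesliga had the highest average number of goals per game (2.90) and Italy’s Serie A the lowest (2.62).
- The four major leagues and the remaining leagues were very similar in both average goals per match (2.74 vs. 2.68) and standard deviation (1.69 vs. 1.65).
- Home teams had a slightly higher median number of goals than away teams, which suggests a home advantage.
- Home team possession appeared to be approximately normally distributed, based on the density plot and QQ plot.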

The project demonstrates how data visualization can be used to identify patterns, compare groups, and support data-driven conclusions in sports analytics.