# Load libraries (they need to be installed on the first run via install.packages)
# You do not need to use these libraries, though
library(RSQLite)
library(stringr)
library(ggplot2)
library(dplyr)
library(tidyr)
library(forcats)
library(lubridate)
# Open the SQLite database, copy the two tables used by the analyses into
# memory as tibbles, and release the connection again.
db_path <- "EuropeanSoccer.sqlite"
# With its default flags, dbConnect() creates an empty database when the file
# is missing; that would only surface later as a "no such table" error.
if (!file.exists(db_path)) {
  stop("Database file not found: ", db_path, call. = FALSE)
}
con <- dbConnect(SQLite(), dbname = db_path)
# Full copies of the Match and League tables
match <- tibble::as_tibble(dbGetQuery(con, "SELECT * FROM Match"))
league <- tibble::as_tibble(dbGetQuery(con, "SELECT * FROM League"))
# Everything needed is now in memory, so close the connection instead of
# leaving it open for the rest of the session.
dbDisconnect(con)
The European Soccer Database contains match data from major European football leagues. The objective of this project is to explore scoring patterns, league characteristics, home advantage effects, seasonal trends, and ball possession distributions using exploratory data analysis and data visualization techniques.
The analysis focuses on identifying meaningful insights from football match data and presenting them through statistical summaries and visualizations.
# Analysis 1 data: goals per match, joined with league records, summarised
# for four selected leagues.
top_four_names <- c(
  "Spain LIGA BBVA",
  "England Premier League",
  "Germany 1. Bundesliga",
  "Italy Serie A"
)

# Goals scored by both sides in each match
match <- match %>%
  mutate(total_goals = home_team_goal + away_team_goal)

# Attach the league record to every match (Match.league_id -> League.id)
merged_data <- left_join(match, league, by = c("league_id" = "id"))

# Restrict to matches whose league name is one of the four listed above
top_leagues <- filter(merged_data, name %in% top_four_names)

# Mean total goals per match within each league
avg_goals <- summarise(
  group_by(top_leagues, name),
  avg_goals_per_game = mean(total_goals, na.rm = TRUE)
)

# Display the summary table
avg_goals
## # A tibble: 4 × 2
## name avg_goals_per_game
## <chr> <dbl>
## 1 England Premier League 2.71
## 2 Germany 1. Bundesliga 2.90
## 3 Italy Serie A 2.62
## 4 Spain LIGA BBVA 2.77
# Bar chart of mean goals per game; leagues are ordered along the x axis from
# the lowest to the highest average, and each league gets its own fill colour.
ggplot(avg_goals) +
  geom_col(
    aes(
      x = fct_reorder(name, avg_goals_per_game),
      y = avg_goals_per_game,
      fill = name
    )
  ) +
  labs(
    title = "Average Goals per Game in Top European Leagues",
    x = "League",
    y = "Average Goals per Game"
  ) +
  theme_minimal() +
  # The x axis already names each league, so the fill legend is hidden
  theme(
    plot.title = element_text(face = "bold", size = 14),
    legend.position = "none"
  )
The bar chart compares the average number of goals scored per game
across the four major European football leagues. Germany’s Bundesliga
had the highest average goals scored per game, while Italy’s Serie A had
the lowest average. This indicates that Bundesliga matches were
generally more attack-oriented and higher-scoring.
# Analysis 2: Comparison of Goal Statistics Between Major and Other European Leagues
``` r
# Label every match as belonging to one of the four listed leagues or to any
# other league, then summarise total goals for the two groups.
top_four_names <- c(
  "Spain LIGA BBVA",
  "England Premier League",
  "Germany 1. Bundesliga",
  "Italy Serie A"
)
is_top_four <- merged_data$name %in% top_four_names
merged_data$league_group <- ifelse(
  is_top_four,
  "Top 4 Leagues",
  "Other Leagues"
)

# Mean and standard deviation of total goals per match in each group
league_stats <- summarise(
  group_by(merged_data, league_group),
  average_goals = mean(total_goals, na.rm = TRUE),
  standard_deviation = sd(total_goals, na.rm = TRUE)
)

# Display the summary table
print(league_stats)
## # A tibble: 2 × 3
## league_group average_goals standard_deviation
## <chr> <dbl> <dbl>
## 1 Other Leagues 2.68 1.65
## 2 Top 4 Leagues 2.74 1.69
# Reshape the summary so each row is one (league group, metric) pair; every
# column except the group label is a metric.
league_stats_long <- pivot_longer(
  league_stats,
  cols = -league_group,
  names_to = "metric",
  values_to = "value"
)

# Side-by-side bars per league group, one bar per metric, each labelled with
# its value rounded to two decimals.
p <- ggplot(
  league_stats_long,
  aes(x = league_group, y = value, fill = metric)
)
p <- p +
  geom_col(position = "dodge") +
  geom_text(
    aes(label = round(value, 2)),
    # Same dodge width as the bars so each label sits over its own bar
    position = position_dodge(width = 0.9),
    vjust = -0.3
  )
p <- p +
  labs(
    title = "Comparison of Goal Statistics",
    x = "League Group",
    y = "Value"
  ) +
  theme_minimal()
print(p)
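The comparison shows how the four major European leagues differ from the remaining leagues in average goals per match and in variability. The standard deviation measures how spread out the number of goals per match is: a higher standard deviation means that goal totals vary more from match to match. In this data the two groups are very similar, with the top four leagues averaging 2.74 goals per match (standard deviation 1.69) against 2.68 (standard deviation 1.65) for the other leagues.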
# Stack home and away goals into one long table: all home-team values first,
# then all away-team values, with a label saying which side scored them.
n_matches <- nrow(match)
boxplot_data <- data.frame(
  goals = c(match$home_team_goal, match$away_team_goal),
  team_type = rep(c("Home Team", "Away Team"), each = n_matches)
)

# One box per side; the x axis already identifies the side, so no legend
p <- ggplot(boxplot_data, aes(x = team_type, y = goals, fill = team_type)) +
  geom_boxplot() +
  labs(
    title = "Home vs Away Goals Distribution",
    x = "Team Type",
    y = "Goals Scored"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(face = "bold", size = 14),
    legend.position = "none"
  )

# Render the boxplot
print(p)
Yes, home teams have a slightly higher median number of goals, which
suggests the presence of a home advantage. The boxplot compares the
distribution of goals scored by home and away teams. The visualization
also highlights the spread and outliers in goal scoring.
# Derive the calendar month of every match and (re)compute total goals so the
# monthly summary does not depend on earlier steps.
match <- match %>%
  mutate(
    date = as.Date(date),
    # label = TRUE gives month names as an ordered factor (Jan, Feb, ...)
    month = month(date, label = TRUE),
    total_goals = home_team_goal + away_team_goal
  )

# Mean total goals per match for each calendar month
monthly_goals <- summarise(
  group_by(match, month),
  average_goals = mean(total_goals, na.rm = TRUE)
)

# Line chart with a marker and a rounded value label at every month
p <- ggplot(monthly_goals, aes(x = month, y = average_goals, group = 1)) +
  geom_line(linewidth = 1.2, color = "blue") +
  geom_point(color = "red", size = 3) +
  geom_text(
    aes(label = round(average_goals, 2)),
    size = 3,
    vjust = -0.5
  ) +
  labs(
    title = "Average Goals per Match by Month",
    x = "Month",
    y = "Average Goals"
  ) +
  theme_minimal()

# Render the line chart
print(p)
The line chart shows the variation in average goals scored per match
across different months of the year. Some summer months show slightly
higher average goals, which may support the idea that weather conditions
can influence gameplay and scoring patterns.
# Home-team possession values with missing entries dropped, wrapped in a
# one-column data frame that both plots share.
possession <- na.omit(match$home_team_possession)
possession_df <- data.frame(possession = possession)

# Kernel density estimate of the possession values
p1 <- ggplot(possession_df, aes(x = possession)) +
  geom_density(alpha = 0.5, fill = "lightblue") +
  labs(
    title = "Density Plot of Home Team Possession",
    x = "Home Team Possession",
    y = "Density"
  ) +
  theme_minimal()
print(p1)

# Sample quantiles against theoretical normal quantiles, with a reference line
p2 <- ggplot(possession_df, aes(sample = possession)) +
  stat_qq() +
  stat_qq_line(color = "red") +
  labs(title = "QQ Plot of Home Team Possession") +
  theme_minimal()
print(p2)
The density plot and QQ plot are used to visually assess whether the
home_team_possession variable follows an approximately normal
distribution. The density curve shows the shape of the distribution,
while the QQ plot compares the observed data with a theoretical normal
distribution. Small deviations from the straight line indicate that the
distribution is only approximately normal.
This project explored various aspects of European football match data through exploratory data analysis and visualization techniques.
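Key findings include:
- Among the four major leagues, Germany's Bundesliga had the highest average goals per game (2.90) and Italy's Serie A the lowest (2.62).
- The top four leagues and the other leagues are very similar in both average goals per match (2.74 vs 2.68) and standard deviation (1.69 vs 1.65).
- Home teams have a slightly higher median number of goals than away teams, which suggests a home advantage.
- Average goals per match vary by month, with some summer months showing slightly higher averages.
- Home team possession is approximately normally distributed, with small deviations visible in the QQ plot.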
The project demonstrates how data visualization can be used to identify patterns, compare groups, and support data-driven conclusions in sports analytics.