con <- dbConnect(SQLite(), "f1_data.sqlite")
query1 <- "
SELECT ra.year, ra.round, d.driverRef, r.points
FROM results r
JOIN races ra ON r.raceId = ra.raceId
JOIN drivers d ON r.driverId = d.driverId
WHERE ra.year >= 2018;
"
fig_dat1 <- dbGetQuery(con, query1) %>% as_tibble()
seasons_to_plot <- c(2021, 2022)
driver_points <- fig_dat1 %>%
filter(year %in% seasons_to_plot) %>%
group_by(year, driverRef) %>%
summarise(total_points = sum(points), .groups = "drop") %>%
group_by(year) %>%
slice_max(total_points, n = 5) %>%
inner_join(fig_dat1, by = c("year", "driverRef")) %>%
group_by(year, driverRef) %>%
arrange(round) %>%
mutate(cum_points = cumsum(points))
ggplot(driver_points, aes(x = round, y = cum_points, color = driverRef, group = driverRef)) +
geom_line(size = 1.1) +
facet_wrap(~year, ncol = 2, scales = "free_x") +
labs(title = "Top 5 Drivers – Two Seasons Championship Battle",
x = "Race Round", y = "Cumulative Points", color = "Driver") +
theme_minimal(base_size = 14) +
theme(legend.position = "bottom")
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

query2 <- "
SELECT c.name AS constructor, ra.year
FROM results r
JOIN races ra ON r.raceId = ra.raceId
JOIN constructors c ON r.constructorId = c.constructorId
WHERE r.position = 1;
"
fig_dat2 <- dbGetQuery(con, query2) %>%
as_tibble() %>%
mutate(decade = floor(year/10)*10) %>%
group_by(constructor, decade) %>%
summarise(total_wins = n(), .groups = 'drop') %>%
group_by(constructor) %>%
summarise(all_time_wins = sum(total_wins), .groups = "drop") %>%
slice_max(all_time_wins, n = 5) %>% # pick top 5 all-time constructors
inner_join(
dbGetQuery(con, query2) %>%
as_tibble() %>%
mutate(decade = floor(year/10)*10) %>%
group_by(constructor, decade) %>%
summarise(total_wins = n(), .groups = 'drop'),
by = "constructor"
)
ggplot(fig_dat2, aes(x = factor(decade), y = total_wins, fill = constructor)) +
geom_col(position = "dodge") +
labs(title = "Top 5 Constructors by Decade",
x = "Decade", y = "Total Wins", fill = "Constructor") +
theme_minimal(base_size = 14) +
theme(legend.position = "bottom")

query3 <- "
SELECT d.driverRef, lt.lap, lt.milliseconds/60000.0 AS lap_time_min
FROM lap_times lt
JOIN drivers d ON lt.driverId = d.driverId
JOIN races r ON lt.raceId = r.raceId
WHERE r.year = 2021 AND r.name = 'Italian Grand Prix';
"
fig_dat3 <- dbGetQuery(con, query3) %>%
as_tibble()
# Get top 3 drivers by number of laps completed
top3_drivers <- fig_dat3 %>%
count(driverRef, sort = TRUE) %>%
slice_head(n = 3) %>%
pull(driverRef)
fig_dat3_top3 <- fig_dat3 %>%
filter(driverRef %in% top3_drivers)
# Violin + boxplot plot
ggplot(fig_dat3_top3, aes(x = driverRef, y = lap_time_min, fill = driverRef)) +
geom_violin(alpha = 0.6, trim = FALSE) +
geom_boxplot(width = 0.1, fill = "white", outlier.shape = NA) +
labs(
title = "Lap Time Distribution (minutes) – Top 3 Drivers, Italian GP 2021",
x = "Driver",
y = "Lap Time (minutes)",
fill = "Driver"
) +
theme_minimal(base_size = 14) +
theme(legend.position = "none")

query4 <- "
SELECT d.driverRef, ps.lap, ps.milliseconds/1000.0 AS pit_sec
FROM pit_stops ps
JOIN drivers d ON ps.driverId = d.driverId
JOIN races r ON ps.raceId = r.raceId
WHERE r.year = 2021 AND r.name = 'Italian Grand Prix';
"
fig_dat4 <- dbGetQuery(con, query4) %>% as_tibble()
ggplot(fig_dat4, aes(x = lap, y = driverRef, fill = pit_sec)) +
geom_tile(color = "white") +
scale_fill_viridis_c(option = "C") +
labs(title = "Pit Stop Times Heatmap – Italian GP 2021",
x = "Lap", y = "Driver", fill = "Pit Stop (s)",
caption = "Source: Ergast F1 Database") +
theme_minimal(base_size = 14)

query5 <- "
SELECT r.position AS finish_pos, q.position AS quali_pos, d.surname, ra.year, ra.name AS race
FROM results r
JOIN qualifying q ON r.raceId = q.raceId AND r.driverId = q.driverId
JOIN drivers d ON r.driverId = d.driverId
JOIN races ra ON r.raceId = ra.raceId
WHERE ra.year = 2021;
"
fig_dat5 <- dbGetQuery(con, query5) %>% as_tibble()
fig_dat5 <- dbGetQuery(con, query5) %>% as_tibble()
plot_ly(
data = fig_dat5,
x = ~quali_pos,
y = ~finish_pos,
text = ~paste("Driver:", surname, "<br>Race:", race),
color = ~surname,
type = "scatter",
mode = "markers"
) %>%
plotly::layout(
title = list(text = "Qualifying vs Finishing Position (2021)"),
xaxis = list(title = "Qualifying Position"),
yaxis = list(title = "Finishing Position", autorange = "reversed") # invert so P1 is top
)
## Warning in RColorBrewer::brewer.pal(max(N, 3L), "Set2"): n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors
## Warning in RColorBrewer::brewer.pal(max(N, 3L), "Set2"): n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors
query6 <- "
SELECT d.driverRef, lt.milliseconds/1000.0 AS lap_time_sec, r.year
FROM lap_times lt
JOIN drivers d ON lt.driverId = d.driverId
JOIN races r ON lt.raceId = r.raceId
WHERE r.year = 2021;
"
fig_dat6 <- dbGetQuery(con, query6) %>% as_tibble()
ggplot(fig_dat6, aes(x = lap_time_sec, y = driverRef, fill = driverRef)) +
geom_density_ridges(alpha = 0.6, scale = 1.5, rel_min_height = 0.01) +
labs(title = "Lap Time Density – Drivers (2021 Season)",
x = "Lap Time (seconds)", y = "Driver",
caption = "Source: Ergast F1 Database") +
theme_minimal(base_size = 14) +
theme(legend.position = "none")
## Picking joint bandwidth of 3.19

champ_points <- fig_dat1 %>%
filter(year == 2021) %>%
group_by(driverRef) %>%
arrange(round) %>%
mutate(cum_points = cumsum(points))
ggplot(champ_points, aes(x = round, y = cum_points, color = driverRef, group = driverRef)) +
geom_line(size = 1.1) +
labs(title = "Championship Points Race – 2021",
subtitle = "Round: {frame_along}",
x = "Round", y = "Cumulative Points", color = "Driver",
caption = "Source: Ergast F1 Database") +
theme_minimal(base_size = 14) +
transition_reveal(round)
## `geom_line()`: Each group consists of only one observation.
## ℹ Do you need to adjust the group aesthetic?
## `geom_line()`: Each group consists of only one observation.
## ℹ Do you need to adjust the group aesthetic?

query8 <- "
SELECT ra.name AS race, ra.year, SUM(r.time) AS race_duration
FROM results r
JOIN races ra ON r.raceId = ra.raceId
WHERE ra.year BETWEEN 2019 AND 2021
GROUP BY ra.raceId;
"
fig_dat8 <- dbGetQuery(con, query8) %>% as_tibble()
ggplot(fig_dat8, aes(x = reorder(race, race_duration), y = race_duration/60000)) +
geom_boxplot(fill = "skyblue", alpha = 0.7) +
coord_flip() +
labs(title = "Race Duration Distribution by Circuit (2019–2021)",
x = "Circuit", y = "Duration (minutes)",
caption = "Source: Ergast F1 Database") +
theme_minimal(base_size = 14)
