longest_racing_lap <- lap_times %>%
left_join(races, by = c("raceId" = "raceId")) %>%
left_join(drivers, by = c("driverId" = "driverId")) %>%
mutate(Longest_Lap = lap_times$time) %>%
select(forename,surname,nationality,name,date,Longest_Lap)%>%
arrange(desc(Longest_Lap)) %>%
head(10)
all.equal(
longest_racing_lap,
dbGetQuery(
conn = con,
statement = "SELECT forename, surname, nationality, name, date, lap_times.time AS Longest_Lap
FROM lap_times
LEFT JOIN races
ON lap_times.raceId = races.raceId
LEFT JOIN drivers
ON lap_times.driverId = drivers.driverId
ORDER BY Longest_Lap DESC
LIMIT 10"
),
check.attributes=F
)
[1] TRUE
As the query illustrates, there have been 0 separate occasions where single laps have taken over \(9\) minutes. Similarly, 0 laps in F1 history have taken between \(8\) and \(9\) minutes. There was no doubt a crash or some sort of reliability issue that precipitated each of these lengthy lap times.
format.dt.f(longest_racing_lap)
The query below should use the results table instead of the lapTimes table in order to avoid including accidents/retirements (contained in the lapTimes table). Accidents/retirements are not complete laps and lead to artifically short laptimes.
shortest_racing_lap <- results %>%
left_join(races, by = c("raceId" = "raceId")) %>%
left_join(drivers, by = c("driverId" = "driverId")) %>%
mutate(Shortest_Lap=results$fastestLapTime) %>%
filter(!is.null(Shortest_Lap)) %>%
select(forename, surname, nationality, name, date, Shortest_Lap) %>%
arrange(Shortest_Lap) %>%
head(10)
all.equal(
shortest_racing_lap,
dbGetQuery(
conn = con,
statement = "SELECT forename, surname, nationality, name, date, fastestLapTime as Shortest_Lap
FROM results
LEFT JOIN races
ON results.raceId = races.raceId
LEFT JOIN drivers
ON results.driverId = drivers.driverId
WHERE fastestLapTime IS NOT NULL
ORDER BY Shortest_Lap
LIMIT 10;"
),
check.attributes=F
)
[1] TRUE
format.dt.f(shortest_racing_lap)
From the above, it is clear that the Sakhir Grand Prix circuit is the shortest F1 circuit in F1 history.
It is worth mentioning here that the clause ‘WHERE fastestLapTime IS NOT NULL’ should be included to remove all of the race results before lap times were officially recorded in the ergast database (which only started in \(2004\)).
Note that laptime data is technically only available for \(30\) of the \(72\) circuits included in the circuits
table. This is due to the fact that many of these circuits only hosted
Grands Prix in the \(1950s\), \(1960s\), \(1970s\), etc. - before lap times were
recorded. The query below joins \(4\)
tables (results
, race
, circuits
,
and drivers
) in order to return all of the information of
interest.
circuit_fastest_lap <- results %>%
left_join(races, by = c("raceId" = "raceId")) %>%
left_join(circuits, by = c("circuitId" = "circuitId")) %>%
left_join(drivers, by = c("driverId" = "driverId")) %>%
filter(!is.na(fastestLapTime)) %>%
mutate(Fastest_Lap=fastestLapTime, Circuit=name.y,Race=name.x) %>%
select(forename, surname, nationality, Fastest_Lap, Circuit, Race, year) %>%
group_by(Circuit) %>%
arrange(Fastest_Lap, by_group=TRUE) %>%
head(10)
all.equal(
circuit_fastest_lap,
dbGetQuery(
conn = con,
statement = "SELECT forename, surname, nationality, MIN(fastestLapTime) AS Fastest_Lap, circuits.name AS Circuit,
races.name AS Race, year
FROM results
LEFT JOIN races
ON results.raceId = races.raceId
LEFT JOIN circuits
ON races.circuitId = circuits.circuitId
LEFT JOIN drivers
ON results.driverId = drivers.driverId
WHERE fastestLapTime IS NOT NULL
GROUP BY forename, surname, nationality, circuits.name, races.name, year
ORDER BY Fastest_Lap
LIMIT 10;"
),
check.attributes=F
)
[1] TRUE
format.dt.f(circuit_fastest_lap)
These are the most prolific drivers in F1. They did not necessarily win the most, but they did win enough in order to have an F1 seat for a long time.
greatest_num_race_entries <- results %>%
left_join(drivers, by="driverId") %>%
group_by(driverId,forename,surname,nationality) %>%
summarise(Total_Entries=n_distinct(raceId), .groups='drop') %>%
#.groups='drop': All levels of grouping are dropped
# summarise() returns the grouping var as first column
ungroup() %>% #thus, have to ungroup() then deselect() the 'driverId' column
select(forename,surname,nationality, Total_Entries) %>%
slice_max(Total_Entries,n=10,with_ties = F) #replaces arrange(desc(Total_Entries)) %>% head(10)
all.equal(
greatest_num_race_entries,
dbGetQuery(
conn = con,
statement = "SELECT forename, surname, nationality, COUNT(DISTINCT results.raceID) AS Total_Entries
FROM results
LEFT JOIN drivers
ON results.driverId = drivers.driverId
GROUP BY forename, surname, nationality
ORDER BY Total_Entries DESC
LIMIT 10;"
),
check.attributes=F
)
[1] TRUE
format.dt.f(greatest_num_race_entries)
Calculating Age of first race by year, ignoring whether the driver’s birthday has passed.
# THANKS TO SHARMAINE
youngest_driver_to_start1 <- results %>%
left_join(drivers, by = c("driverId" = "driverId")) %>%
left_join(races, by = c("raceId" = "raceId")) %>%
group_by(driverId) %>%
slice_min(year(as.POSIXct(date,format="%Y-%m-%d")), n=1, with_ties = F) %>%
mutate(Age_At_First_Race=year(as.POSIXct(date,format="%Y-%m-%d"))-year(as.POSIXct(dob,format="%Y-%m-%d"))) %>%
ungroup() %>%
select(forename,surname,nationality, Age_At_First_Race, name,year,constructorId) %>%
slice_min(Age_At_First_Race, n=10, with_ties = F)
all.equal(
youngest_driver_to_start1,
dbGetQuery(
conn = con,
statement = "SELECT forename, surname, nationality,
MIN((date) - (dob)) AS age_at_first_race, name, year, constructorId
FROM results
JOIN drivers
ON results.driverId = drivers.driverId
AND dob IS NOT NULL
LEFT JOIN races
ON results.raceId = races.raceId
GROUP BY drivers.driverId
ORDER BY age_at_first_race
LIMIT 10;"
),
check.attributes=F
)
[1] TRUE
MIN((date) - (dob))
is certainly not the only way to
perform mathematical operations on dates - and having
age_at_first_race
in terms of days is probably not ideal -
but it is syntactically simple and allows one to avoid
manually-specifying a date/time format.We can also use the SQLite
function MIN(julianday(date) - julianday(dob))
to parse the time strings date
(from races table) and dob (from drivers table).
julianday
allows mathematical operations (in this case,
subtraction) to be performed on dates. For example, calculating Age of
first race by days, dividing it by \(365\) then rounding it to nearest int
youngest_driver_to_start2 <- results %>%
left_join(drivers, by = c("driverId" = "driverId")) %>%
left_join(races, by = c("raceId" = "raceId")) %>%
mutate(
date_d = ymd(date),
dob_d = ymd(dob),
# age_at_first_race = round(as.numeric((date_d - dob_d), units = "days")/365, 0)
# age_at_first_race = round((date_d - dob_d)/365, 0)
age_at_first_race = round(as.numeric(date_d - dob_d)/365, 0)
# age_at_first_race = as.numeric(date_d - dob_d)/365
) %>%
# slice_min(age_at_first_race, n = 10, with_ties = F) %>%
group_by(driverId) %>%
slice(which.min(age_at_first_race)) %>%
ungroup() %>%
select(forename, surname, nationality, age_at_first_race, name, year, constructorId) %>%
# arrange(age_at_first_race) %>% head(10) %>%
slice_min(age_at_first_race, n = 10, with_ties = F)
all.equal(
youngest_driver_to_start2,
dbGetQuery(
conn = con,
statement = "SELECT forename, surname, nationality,
MIN(ROUND((julianday(date) - julianday(dob))/365)) AS age_at_first_race, name, year, constructorId
FROM results
JOIN drivers
ON results.driverId = drivers.driverId
AND dob IS NOT NULL
LEFT JOIN races
ON results.raceId = races.raceId
GROUP BY drivers.driverId
ORDER BY age_at_first_race
LIMIT 10;"
),
check.attributes=F
)
[1] TRUE
format.dt.f(youngest_driver_to_start1)
For those keeping score, Thomas Monarch was 18 years when he made his first start for de_tomaso-osca at the 1963 Mexican Grand Prix. He unfortunately retired from the race on lap 32 with engine issues.
A similar query to the one above, only with a slight modification to the ORDER BY clause and the aggregation function (MIN becomes MAX):
Calculating Age of first race by year, ignoring whether the driver’s birthday has passed.
# THANKS TO SHARMAINE
oldest_driver_to_start1 <- results %>%
left_join(drivers, by = c("driverId" = "driverId")) %>%
left_join(races, by = c("raceId" = "raceId")) %>%
group_by(driverId) %>%
slice_min(year(as.POSIXct(date,format="%Y-%m-%d")), n=1, with_ties = F) %>% #starting year
mutate(Age_At_First_Race=year(as.POSIXct(date,format="%Y-%m-%d"))-year(as.POSIXct(dob,format="%Y-%m-%d"))) %>% #calculate age at first race
ungroup() %>%
select(forename,surname,nationality, Age_At_First_Race, name,year) %>%
slice_max(Age_At_First_Race, n=10, with_ties = F) #top 10 oldest drivers at first race
all.equal(
oldest_driver_to_start1,
dbGetQuery(
conn = con,
statement = "SELECT forename, surname, nationality,
MIN((date) - (dob)) AS Age_At_First_Race, name, year
FROM results
JOIN drivers
ON results.driverId = drivers.driverId
AND dob IS NOT NULL
LEFT JOIN races
ON results.raceId = races.raceId
GROUP BY drivers.driverId
ORDER BY Age_At_First_Race DESC
LIMIT 10;"
),
check.attributes=F
)
[1] TRUE
Calculating Age of first race by days, dividing it by \(365\) then rounding it to nearest int
oldest_driver_to_start2 <- results %>%
left_join(drivers, by = c("driverId" = "driverId")) %>%
left_join(races, by = c("raceId" = "raceId")) %>%
mutate(
date_d = ymd(date),
dob_d = ymd(dob),
# age_at_first_race = round(as.numeric((date_d - dob_d), units = "days")/365, 0)
# age_at_first_race = round((date_d - dob_d)/365, 0)
age_at_first_race = round(as.numeric(date_d - dob_d)/365, 0)
# age_at_first_race = as.numeric(date_d - dob_d)/365
) %>%
# slice_min(age_at_first_race, n = 10, with_ties = F) %>%
group_by(driverId) %>%
slice(which.min(age_at_first_race)) %>%
ungroup() %>%
select(forename, surname, nationality, age_at_first_race, name, year) %>%
# arrange(age_at_first_race) %>% head(10) %>%
slice_max(age_at_first_race, n = 10, with_ties = F)
#edits based on prof's original SQL query & suggestions on using julianday()
all.equal(
oldest_driver_to_start2,
dbGetQuery(
conn = con,
statement = "SELECT forename, surname, nationality,
MIN(ROUND((julianday(date) - julianday(dob))/365)) AS Age_At_First_Race, name, year
FROM results
JOIN drivers
ON results.driverId = drivers.driverId
AND dob IS NOT NULL
LEFT JOIN races
ON results.raceId = races.raceId
GROUP BY drivers.driverId
ORDER BY Age_At_First_Race DESC
LIMIT 10;"
),
check.attributes=F
)
[1] TRUE
format.dt.f(oldest_driver_to_start2)
Arthur Legat was 54 years when he last entered an F1 race - the 1952 Belgian Grand Prix.
The following query filters only winning results (WHERE position = 1) for each driver (GROUP BY drivers.driverId, forename, surname, nationality).
This makes it easy to count the number of winning races - simply take SUM(position) - since a single win is coded as a \(1\).
most_wins_all_time=results %>%
# filter(!is.na(position)) %>%
mutate(position=as.integer(position)) %>%
filter(position == 1) %>%
group_by(driverId) %>%
summarise(race_wins = sum(position)) %>%
left_join(drivers, "driverId") %>%
select(forename, surname, nationality, race_wins) %>%
slice_max(race_wins, n = 10, with_ties = F)
all.equal(
most_wins_all_time,
dbGetQuery(
conn = con,
statement = "SELECT forename, surname, nationality, SUM(position) AS Wins
FROM results
JOIN drivers
ON results.driverId = drivers.driverId
WHERE position = 1
GROUP BY drivers.driverId, forename, surname, nationality
ORDER BY Wins DESC
LIMIT 10;"
),
check.attributes=F
)
[1] TRUE
format.dt.f(most_wins_all_time)
We can provide bit more context to these results by also including the number of races a driver entered over their career. We do this be removing the WHERE clause and adding a CASE statement and another aggregating function (COUNT(Distinct)).
race_wins_entries=results %>% # race_wins_entries
mutate(position=as.integer(position)) %>%
group_by(driverId) %>%
summarise(race_wins = sum(position == 1, na.rm = TRUE),
entries = n_distinct(raceId)) %>%
left_join(drivers, "driverId") %>%
select(forename, surname, nationality, race_wins, entries) %>%
slice_max(race_wins, n = 10, with_ties = F)
all.equal(
race_wins_entries,
dbGetQuery(
conn = con,
statement = "SELECT forename, surname, nationality,
SUM(CASE position WHEN 1 THEN 1 ELSE 0 END) AS Wins,
COUNT(DISTINCT raceId) AS Entries
FROM results
JOIN drivers
ON results.driverId = drivers.driverId
GROUP BY drivers.driverId, forename, surname, nationality
ORDER BY Wins DESC
LIMIT 10;"
),
check.attributes=F
)
[1] TRUE
format.dt.f(race_wins_entries)
Lastly, it can be helpful to contextualize the results in terms of the highest percentage of races won. The subsequent query does just that:
win_percentage=results %>% # win_percentage
mutate(position=as.integer(position)) %>%
group_by(driverId) %>%
summarise(wins = sum(position == 1, na.rm = TRUE),
entries = n_distinct(raceId),
percentage=wins * 100.0/entries) %>%
left_join(drivers, "driverId") %>%
select(forename, surname, nationality, wins, entries, percentage) %>%
slice_max(wins, n = 10, with_ties = F)
all.equal(
win_percentage,
dbGetQuery(
conn = con,
statement = "SELECT forename, surname, nationality,
SUM(CASE position WHEN 1 THEN 1 ELSE 0 END) AS Wins,
COUNT(DISTINCT raceId) AS Entries,
(SUM(CASE position WHEN 1 THEN 1 ELSE 0 END))*100.0/ (SELECT COUNT(DISTINCT raceId)) AS Percentage
FROM results
JOIN drivers
ON results.driverId = drivers.driverId
GROUP BY drivers.driverId, forename, surname, nationality
ORDER BY Wins DESC
LIMIT 10;"
),
check.attributes=F
)
[1] TRUE
format.dt.f(win_percentage %>% mutate(percentage=percentage/100), perc_vars="percentage")
The above query shows that the highest hit-rate (33.99%) belongs to Lewis Hamilton.
This query is conceptually similar to the previous one, it just uses a different aggregation function (COUNT rather than SUM) and an additional grouping condition (GROUP BY drivers.driverId, name).
most_win_same_gp=results %>% # most_win_same_gp
mutate(position=as.integer(position)) %>%
filter(position==1) %>%
left_join(races, by = c("raceId" = "raceId")) %>%
group_by(driverId,name) %>%
summarize(wins = n(), .groups = 'keep') %>%
left_join(drivers, by = c("driverId" = "driverId")) %>%
ungroup() %>%
select(forename, surname, nationality, name, wins) %>%
slice_max(wins, n=10, with_ties = F)
all.equal(
most_win_same_gp,
dbGetQuery(
conn = con,
statement = "SELECT forename, surname, nationality, name, COUNT(*) AS Wins
FROM results
LEFT JOIN races
ON results.raceId = races.raceId
LEFT JOIN drivers
ON results.driverId = drivers.driverId
WHERE position = 1
GROUP BY drivers.driverId, name
ORDER BY Wins DESC
LIMIT 10;"
),
check.attributes=F
)
[1] TRUE
format.dt.f(most_win_same_gp)
This query makes use of a new table - driver_standings - which contains a very convenient field ‘wins’:
SELECT *
FROM driver_standings
LIMIT 10;
This field keeps a running total of the number of wins a driver (driverId = ‘…’) earned during a race season. This query, then, simply groups the data first by driver and then by year (GROUP BY drivers.driverId, year) and returns the greatest number of wins (MAX(wins)) that occured in that year.
most_wins_single_season=driver_standings %>% # most_wins_single_season
left_join(races, by = c("raceId" = "raceId")) %>%
left_join(drivers, by = c("driverId" = "driverId")) %>%
group_by(driverId, year) %>%
slice(which.max(wins)) %>%
ungroup() %>%
select(forename, surname, nationality, year, wins) %>%
# with_ties = F gives first n rows in desc order.
# T will return more rows than requested.
slice_max(wins,n=10, with_ties = F) #%>%
#arrange(desc(wins), surname)
all.equal(
most_wins_single_season,
dbGetQuery(
conn = con,
statement = "SELECT forename, surname, nationality, year, MAX(wins) AS Wins
FROM driver_standings
LEFT JOIN drivers
ON driver_standings.driverId = drivers.driverId
LEFT JOIN races
ON driver_standings.raceId = races.raceId
GROUP BY drivers.driverId, year
ORDER BY Wins DESC
LIMIT 10;"
),
check.attributes=F
)
[1] TRUE
format.dt.f(most_wins_single_season)
In addition to examining wins, it is also interesting to derive podium (i.e. top \(3\)) finishes:
most_podiums_all_time=results %>% # most_podiums_all_time
group_by(driverId) %>%
mutate(position = case_when((position == 1) ~ 1,
(position == 2) ~ 1,
(position == 3) ~ 1,
TRUE ~ 0)
) %>%
summarise(podiums = sum(position),
entries = n_distinct(raceId),
percentage = podiums * 100.0/entries) %>%
left_join(drivers, by = c("driverId" = "driverId")) %>%
select(forename, surname, nationality, podiums, entries, percentage) %>%
slice_max(podiums, n = 10, with_ties = F)
all.equal(
most_podiums_all_time,
dbGetQuery(
conn = con,
statement = "SELECT forename, surname, nationality,
SUM(CASE position
WHEN 1 THEN 1
WHEN 2 THEN 1
WHEN 3 THEN 1
ELSE 0
END) AS Podiums,
COUNT(DISTINCT raceId) AS Entries,
SUM(CASE position WHEN 1 THEN 1 WHEN 2 THEN 1 WHEN 3 THEN 1 ELSE 0 END)*100.0/
COUNT(DISTINCT raceId) AS Percentage
FROM results
LEFT JOIN drivers
ON results.driverId = drivers.driverId
GROUP BY drivers.driverId
ORDER BY Podiums DESC
LIMIT 10;"
),
check.attributes=F
)
[1] TRUE
format.dt.f(most_podiums_all_time %>% mutate(percentage=percentage/100), perc_vars="percentage")
The query above uses a CASE statement
to record any \(1\), \(2\), or \(3\) position as 1
. The SUM of this then becomes Podiums
.
As in previous queries, total entries and percentage of podium finishes
are calculated as well.
These records will make use of \(2\) new tables: constructors and constructor_standings:
SELECT *
FROM constructors
LIMIT 10;
SELECT *
FROM constructor_standings
LIMIT 10;
most_winning_constructor=results %>% # most_winning_constructor
mutate(position=as.integer(position)) %>%
filter(position==1) %>%
left_join(races, by = c("raceId" = "raceId")) %>%
group_by(constructorId) %>%
summarise(first_win = min(year),
last_win = max(year),
wins = sum(position)) %>%
left_join(constructors, by = c("constructorId" = "constructorId")) %>%
select(name, first_win, last_win, wins) %>%
slice_max(wins, n=10, with_ties = F)
all.equal(
most_winning_constructor,
dbGetQuery(
conn = con,
statement = "SELECT constructors.name, MIN(year) AS First_Win, MAX(year) AS Last_Win,
SUM(position) AS Wins
FROM results
LEFT JOIN constructors
ON results.constructorId = constructors.constructorId
LEFT JOIN races
ON results.raceId = races.raceId
WHERE position = 1
GROUP BY results.constructorId
ORDER BY Wins DESC
LIMIT 10;"
),
check.attributes=F
)
[1] TRUE
format.dt.f(most_winning_constructor)
Ferrari, perhaps unsurprisingly, has the most wins in F1 history by a wide margin.
To avoid results skewed slightly towards teams with relatively few entries (Brawn, Kurtis Kraft, Epperly, Watson, etc.), we should probably filter out constructors with fewer than say, \(100\), entries. This will be the first query requiring a HAVING clause.
highest_pct_winning_constructor_all_time=results %>% # highest_pct_winning_constructor_all_time
mutate(position=as.integer(position)) %>%
group_by(constructorId) %>%
summarise(wins = sum(position == 1, na.rm = TRUE),
entries = n_distinct(raceId),
percentage = wins * 100.0/entries) %>%
left_join(constructors, by = c("constructorId" = "constructorId")) %>%
select(name, wins, entries, percentage) %>%
filter(entries >= 100) %>%
arrange(desc(percentage)) %>%
head(10)
all.equal(
highest_pct_winning_constructor_all_time,
dbGetQuery(
conn = con,
statement = "SELECT name, SUM(CASE position WHEN 1 THEN 1 ELSE 0 END) AS Wins,
COUNT(DISTINCT raceId) AS Entries,
(SUM(CASE position WHEN 1 THEN 1 ELSE 0 END))*100.0 /
(COUNT(DISTINCT raceId)) AS Percentage
FROM results
LEFT JOIN constructors
ON results.constructorId = constructors.constructorId
GROUP BY results.constructorId
HAVING Entries > 100
ORDER BY Percentage DESC
LIMIT 10;"
),
check.attributes=F
)
[1] TRUE
format.dt.f(highest_pct_winning_constructor_all_time %>% mutate(percentage=percentage/100), perc_vars="percentage")
Overall, it seems that Mercedes, followed by Red Bull and Ferrari have the highest win-percentages. But what about in a single season?
highest_pct_constructor_win_per_season=constructor_standings %>% # highest_pct_constructor_win_per_season
left_join(races, by = c("raceId" = "raceId")) %>%
left_join(constructors, by = c("constructorId" = "constructorId")) %>%
group_by(year, name.y) %>%
summarise(Wins = max(wins),
Races = n_distinct(raceId),
Percentage = Wins * 100.0/Races,
) %>%
select(name=name.y, year, Wins, Races, Percentage) %>%
arrange(desc(Percentage), year)%>%
head(10)
all.equal(
highest_pct_constructor_win_per_season,
dbGetQuery(
conn = con,
statement = "SELECT constructors.name, year, MAX(wins) AS Wins,
COUNT(DISTINCT constructor_standings.raceId) AS Races,
MAX(wins)*100.0/COUNT(DISTINCT constructor_standings.raceId) AS Percentage
FROM constructor_standings
LEFT JOIN races
ON constructor_standings.raceId = races.raceID
LEFT JOIN constructors
ON constructor_standings.constructorId=constructors.constructorId
GROUP BY year, constructors.name
ORDER BY Percentage DESC
LIMIT 10;"
),
check.attributes=F
)
[1] TRUE
format.dt.f(highest_pct_constructor_win_per_season %>% mutate(Percentage=Percentage/100), perc_vars="Percentage")
Here we see that McLaren’s 1988 campaign was (arguably) the most dominant single-season peformance by a constructor, with the team winning 93.75% of the races that year.