knitr::opts_chunk$set(echo = TRUE)
#loading libraries
library(tidyr)
library(dplyr)
##
## Присоединяю пакет: 'dplyr'
## Следующие объекты скрыты от 'package:stats':
##
## filter, lag
## Следующие объекты скрыты от 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
set.seed(42)
#step 1 loading datasets
results_raw <- read.csv("https://raw.githubusercontent.com/Chingiz1492/607-project-2/refs/heads/main/results.csv", na.strings = "\\N")
drivers_raw <- read.csv("https://raw.githubusercontent.com/Chingiz1492/607-project-2/refs/heads/main/drivers.csv", na.strings = "\\N")
constructors_raw <- read.csv("https://raw.githubusercontent.com/Chingiz1492/607-project-2/refs/heads/main/constructors.csv", na.strings = "\\N")
#step 2 Datasets before clean
cat("=== results.csv ===\n")
## === results.csv ===
cat("Rows:", nrow(results_raw), "| Columns:", ncol(results_raw), "\n")
## Rows: 26759 | Columns: 18
cat("Columns:", paste(names(results_raw), collapse = ", "), "\n")
## Columns: resultId, raceId, driverId, constructorId, number, grid, position, positionText, positionOrder, points, laps, time, milliseconds, fastestLap, rank, fastestLapTime, fastestLapSpeed, statusId
head(results_raw, 3)
## resultId raceId driverId constructorId number grid position positionText
## 1 1 18 1 1 22 1 1 1
## 2 2 18 2 2 3 5 2 2
## 3 3 18 3 3 7 7 3 3
## positionOrder points laps time milliseconds fastestLap rank
## 1 1 10 58 1:34:50.616 5690616 39 2
## 2 2 8 58 +5.478 5696094 41 3
## 3 3 6 58 +8.163 5698779 41 5
## fastestLapTime fastestLapSpeed statusId
## 1 1:27.452 218.300 1
## 2 1:27.739 217.586 1
## 3 1:28.090 216.719 1
cat("\n=== drivers.csv ===\n")
##
## === drivers.csv ===
cat("Rows:", nrow(drivers_raw), "| Columns:", ncol(drivers_raw), "\n")
## Rows: 861 | Columns: 9
cat("Columns:", paste(names(drivers_raw), collapse = ", "), "\n")
## Columns: driverId, driverRef, number, code, forename, surname, dob, nationality, url
head(drivers_raw, 3)
## driverId driverRef number code forename surname dob nationality
## 1 1 hamilton 44 HAM Lewis Hamilton 1985-01-07 British
## 2 2 heidfeld NA HEI Nick Heidfeld 1977-05-10 German
## 3 3 rosberg 6 ROS Nico Rosberg 1985-06-27 German
## url
## 1 http://en.wikipedia.org/wiki/Lewis_Hamilton
## 2 http://en.wikipedia.org/wiki/Nick_Heidfeld
## 3 http://en.wikipedia.org/wiki/Nico_Rosberg
cat("\n=== constructors.csv ===\n")
##
## === constructors.csv ===
cat("Rows:", nrow(constructors_raw), "| Columns:", ncol(constructors_raw), "\n")
## Rows: 212 | Columns: 5
cat("Columns:", paste(names(constructors_raw), collapse = ", "), "\n")
## Columns: constructorId, constructorRef, name, nationality, url
head(constructors_raw, 3)
## constructorId constructorRef name nationality
## 1 1 mclaren McLaren British
## 2 2 bmw_sauber BMW Sauber German
## 3 3 williams Williams British
## url
## 1 http://en.wikipedia.org/wiki/McLaren
## 2 http://en.wikipedia.org/wiki/BMW_Sauber
## 3 http://en.wikipedia.org/wiki/Williams_Grand_Prix_Engineering
#step 3 Cleaning Dataset "drivers"
# Taking out url и driverRef — technical noise without analytical value
# Creating driver_name = name + surname instead driverId
drivers_clean <- drivers_raw |>
select(driverId, forename, surname, nationality) |>
mutate(driver_name = paste(forename, surname)) |>
select(driverId, driver_name, nationality)
head(drivers_clean, 5)
## driverId driver_name nationality
## 1 1 Lewis Hamilton British
## 2 2 Nick Heidfeld German
## 3 3 Nico Rosberg German
## 4 4 Fernando Alonso Spanish
## 5 5 Heikki Kovalainen Finnish
#step 4 Cleaning Dataset "constructors"
# delete url и constructorRef — Placeholder Noise
# rename name → team_name
constructors_clean <- constructors_raw |>
select(constructorId, name) |>
rename(team_name = name)
head(constructors_clean, 5)
## constructorId team_name
## 1 1 McLaren
## 2 2 BMW Sauber
## 3 3 Williams
## 4 4 Renault
## 5 5 Toro Rosso
#step 5 cleaning dataset "Results"
# Remove time — Inconsistent Formatting:
# Winner: "1:34:50.616", others: "+5.478" — cannot be compared
# Remove statusId — no lookup table
# Remove fastestLapTime, fastestLapSpeed — missing for most
# Leave positionText — needed for reliability analysis (R = Retired)
results_clean <- results_raw |>
select(resultId, raceId, driverId, constructorId,
grid, positionText, positionOrder, points, laps)
head(results_clean, 5)
## resultId raceId driverId constructorId grid positionText positionOrder points
## 1 1 18 1 1 1 1 1 10
## 2 2 18 2 2 5 2 2 8
## 3 3 18 3 3 7 3 3 6
## 4 4 18 4 4 11 4 4 5
## 5 5 18 5 1 3 5 5 4
## laps
## 1 58
## 2 58
## 3 58
## 4 58
## 5 58
# step 5 merging datasets
# Replace IDs with real names of drivers and teams
merged <- results_clean |>
left_join(drivers_clean, by = "driverId") |>
left_join(constructors_clean, by = "constructorId")
cat("Merged:", nrow(merged), "rows x", ncol(merged), "columns\n")
## Merged: 26759 rows x 12 columns
head(merged |> select(raceId, driver_name, team_name, positionText, points), 8)
## raceId driver_name team_name positionText points
## 1 18 Lewis Hamilton McLaren 1 10
## 2 18 Nick Heidfeld BMW Sauber 2 8
## 3 18 Nico Rosberg Williams 3 6
## 4 18 Fernando Alonso Renault 4 5
## 5 18 Heikki Kovalainen McLaren 5 4
## 6 18 Kazuki Nakajima Williams 6 3
## 7 18 Sébastien Bourdais Toro Rosso 7 2
## 8 18 Kimi Räikkönen Ferrari 8 1
# step 6 making dataset from wide to tidy
# The grid, positionOrder, points, and laps columns are "wide" metrics.
# We convert them into a single metric column and a single value column.
merged_tidy <- merged |>
pivot_longer(
cols = c(grid, positionOrder, points, laps),
names_to = "metric",
values_to = "value"
) |>
drop_na(value) #we remove NA - structurally absent values
cat("Tidy:", nrow(merged_tidy), "rows x", ncol(merged_tidy), "columns\n")
## Tidy: 107036 rows x 10 columns
head(merged_tidy, 10)
## # A tibble: 10 × 10
## resultId raceId driverId constructorId positionText driver_name nationality
## <int> <int> <int> <int> <chr> <chr> <chr>
## 1 1 18 1 1 1 Lewis Hamilt… British
## 2 1 18 1 1 1 Lewis Hamilt… British
## 3 1 18 1 1 1 Lewis Hamilt… British
## 4 1 18 1 1 1 Lewis Hamilt… British
## 5 2 18 2 2 2 Nick Heidfeld German
## 6 2 18 2 2 2 Nick Heidfeld German
## 7 2 18 2 2 2 Nick Heidfeld German
## 8 2 18 2 2 2 Nick Heidfeld German
## 9 3 18 3 3 3 Nico Rosberg German
## 10 3 18 3 3 3 Nico Rosberg German
## # ℹ 3 more variables: team_name <chr>, metric <chr>, value <dbl>
# step 7 analysis drivers
# Top 6 drivers by points
top6 <- merged_tidy |>
filter(metric == "points") |>
group_by(driver_name) |>
summarise(total_points = sum(value, na.rm = TRUE)) |>
arrange(desc(total_points)) |>
slice_head(n = 6) |>
pull(driver_name)
cat("Top 6:", paste(top6, collapse = ", "), "\n\n")
## Top 6: Lewis Hamilton, Sebastian Vettel, Max Verstappen, Fernando Alonso, Kimi Räikkönen, Valtteri Bottas
# Wide pivot table: rows = races, columns = drivers
pivot_wide <- merged_tidy |>
filter(metric == "points", driver_name %in% top6) |>
select(raceId, driver_name, value) |>
pivot_wider(
names_from = driver_name,
values_from = value,
values_fill = 0
) |>
arrange(raceId)
print(pivot_wide)
## # A tibble: 462 × 7
## raceId `Lewis Hamilton` `Fernando Alonso` `Kimi Räikkönen` `Sebastian Vettel`
## <int> <dbl> <dbl> <dbl> <dbl>
## 1 1 0 4 0 0
## 2 2 1 0 0 0
## 3 3 3 0 0 10
## 4 4 5 1 3 8
## 5 5 0 4 0 5
## 6 6 0 2 6 0
## 7 7 0 0 0 6
## 8 8 0 0 1 10
## 9 9 0 2 0 8
## 10 10 10 0 8 0
## # ℹ 452 more rows
## # ℹ 2 more variables: `Valtteri Bottas` <dbl>, `Max Verstappen` <dbl>
# for the graphic
pivot_long <- pivot_wide |>
pivot_longer(
cols = -raceId,
names_to = "driver_name",
values_to = "points"
)
#step 8 visualization
pivot_cumulative <- pivot_long |>
arrange(driver_name, raceId) |>
group_by(driver_name) |>
mutate(cumulative_points = cumsum(points))
ggplot(pivot_cumulative,
aes(x = raceId, y = cumulative_points,
color = driver_name, group = driver_name)) +
geom_line(linewidth = 1.2) +
geom_point(size = 2) +
labs(
title = "Driver Performance — Cumulative Points per Race",
subtitle = "Top 6 drivers, Pivot Analysis",
x = "Race ID",
y = "Cumulative Points",
color = "Driver",
caption = "Source: Ergast F1 API"
) +
theme_minimal(base_size = 13) +
theme(legend.position = "bottom")

#step 9 Reliable teams
# "R" в positionText = The car broke down, we turn it into 1, otherwise 0
reliability <- merged |>
mutate(retired = ifelse(positionText == "R", 1, 0)) |>
group_by(team_name) |>
summarise(
total_races = n(),
total_retired = sum(retired, na.rm = TRUE),
failure_rate = round(total_retired / total_races * 100, 1)
) |>
filter(total_races >= 20) |> # только команды с достаточной выборкой
arrange(desc(failure_rate)) |>
slice_head(n = 15)
print(reliability)
## # A tibble: 15 × 4
## team_name total_races total_retired failure_rate
## <chr> <int> <dbl> <dbl>
## 1 Eagle-Weslake 21 18 85.7
## 2 Simca 29 20 69
## 3 Brabham-Alfa Romeo 59 38 64.4
## 4 Gordini 102 61 59.8
## 5 Vanwall 71 42 59.2
## 6 Leyton House 64 36 56.2
## 7 Stewart 98 55 56.1
## 8 Connaught 54 30 55.6
## 9 RAM 71 39 54.9
## 10 Shadow-Ford 26 14 53.8
## 11 Cooper-BRM 21 11 52.4
## 12 Larrousse 216 108 50
## 13 Simtek 40 20 50
## 14 Lola 165 82 49.7
## 15 Toleman 131 65 49.6
#step 9 visualization Reliable teams
ggplot(reliability,
aes(x = reorder(team_name, failure_rate),
y = failure_rate,
fill = failure_rate)) +
geom_col(show.legend = FALSE) +
geom_text(aes(label = paste0(failure_rate, "%")),
hjust = -0.1, size = 3.5) +
coord_flip() +
scale_fill_gradient(low = "#fff3cd", high = "#dc3545") +
scale_y_continuous(limits = c(0, max(reliability$failure_rate) * 1.15)) +
labs(
title = "Team Reliability — % of Races Retired",
subtitle = "positionText = 'R' means car broke down",
x = "Team",
y = "Failure Rate (%)",
caption = "Source: Ergast F1 API | Min. 20 entries"
) +
theme_minimal(base_size = 13)

#step 10 worst and best teams
worst <- reliability |> slice(1)
best <- reliability |> slice(n())
cat("Most fragile:", worst$team_name,
"—", worst$failure_rate, "% failure rate (",
worst$total_retired, "of", worst$total_races, "races)\n")
## Most fragile: Eagle-Weslake — 85.7 % failure rate ( 18 of 21 races)
cat("Most reliable:", best$team_name,
"—", best$failure_rate, "% failure rate (",
best$total_retired, "of", best$total_races, "races)\n")
## Most reliable: Toleman — 49.6 % failure rate ( 65 of 131 races)