project2

knitr::opts_chunk$set(echo = TRUE)


#loading libraries

library(tidyr)
library(dplyr)

## 
## Присоединяю пакет: 'dplyr'

## Следующие объекты скрыты от 'package:stats':
## 
##     filter, lag

## Следующие объекты скрыты от 'package:base':
## 
##     intersect, setdiff, setequal, union

library(ggplot2)

set.seed(42)

#step 1 loading datasets

results_raw <- read.csv("https://raw.githubusercontent.com/Chingiz1492/607-project-2/refs/heads/main/results.csv", na.strings = "\\N")

drivers_raw <- read.csv("https://raw.githubusercontent.com/Chingiz1492/607-project-2/refs/heads/main/drivers.csv", na.strings = "\\N")

constructors_raw <- read.csv("https://raw.githubusercontent.com/Chingiz1492/607-project-2/refs/heads/main/constructors.csv", na.strings = "\\N")

#step 2 Datasets before clean

cat("=== results.csv ===\n")

## === results.csv ===

cat("Rows:", nrow(results_raw), "| Columns:", ncol(results_raw), "\n")

## Rows: 26759 | Columns: 18

cat("Columns:", paste(names(results_raw), collapse = ", "), "\n")

## Columns: resultId, raceId, driverId, constructorId, number, grid, position, positionText, positionOrder, points, laps, time, milliseconds, fastestLap, rank, fastestLapTime, fastestLapSpeed, statusId

head(results_raw, 3)

##   resultId raceId driverId constructorId number grid position positionText
## 1        1     18        1             1     22    1        1            1
## 2        2     18        2             2      3    5        2            2
## 3        3     18        3             3      7    7        3            3
##   positionOrder points laps        time milliseconds fastestLap rank
## 1             1     10   58 1:34:50.616      5690616         39    2
## 2             2      8   58      +5.478      5696094         41    3
## 3             3      6   58      +8.163      5698779         41    5
##   fastestLapTime fastestLapSpeed statusId
## 1       1:27.452         218.300        1
## 2       1:27.739         217.586        1
## 3       1:28.090         216.719        1

cat("\n=== drivers.csv ===\n")

## 
## === drivers.csv ===

cat("Rows:", nrow(drivers_raw), "| Columns:", ncol(drivers_raw), "\n")

## Rows: 861 | Columns: 9

cat("Columns:", paste(names(drivers_raw), collapse = ", "), "\n")

## Columns: driverId, driverRef, number, code, forename, surname, dob, nationality, url

head(drivers_raw, 3)

##   driverId driverRef number code forename  surname        dob nationality
## 1        1  hamilton     44  HAM    Lewis Hamilton 1985-01-07     British
## 2        2  heidfeld     NA  HEI     Nick Heidfeld 1977-05-10      German
## 3        3   rosberg      6  ROS     Nico  Rosberg 1985-06-27      German
##                                           url
## 1 http://en.wikipedia.org/wiki/Lewis_Hamilton
## 2  http://en.wikipedia.org/wiki/Nick_Heidfeld
## 3   http://en.wikipedia.org/wiki/Nico_Rosberg

cat("\n=== constructors.csv ===\n")

## 
## === constructors.csv ===

cat("Rows:", nrow(constructors_raw), "| Columns:", ncol(constructors_raw), "\n")

## Rows: 212 | Columns: 5

cat("Columns:", paste(names(constructors_raw), collapse = ", "), "\n")

## Columns: constructorId, constructorRef, name, nationality, url

head(constructors_raw, 3)

##   constructorId constructorRef       name nationality
## 1             1        mclaren    McLaren     British
## 2             2     bmw_sauber BMW Sauber      German
## 3             3       williams   Williams     British
##                                                            url
## 1                         http://en.wikipedia.org/wiki/McLaren
## 2                      http://en.wikipedia.org/wiki/BMW_Sauber
## 3 http://en.wikipedia.org/wiki/Williams_Grand_Prix_Engineering

#step 3 Cleaning Dataset "drivers"

# Taking out url и driverRef — technical noise without analytical value

# Creating driver_name = name + surname instead driverId

drivers_clean <- drivers_raw |>
  select(driverId, forename, surname, nationality) |>
  mutate(driver_name = paste(forename, surname)) |>
  select(driverId, driver_name, nationality)

head(drivers_clean, 5)

##   driverId       driver_name nationality
## 1        1    Lewis Hamilton     British
## 2        2     Nick Heidfeld      German
## 3        3      Nico Rosberg      German
## 4        4   Fernando Alonso     Spanish
## 5        5 Heikki Kovalainen     Finnish

#step 4 Cleaning Dataset "constructors"

# delete url и constructorRef — Placeholder Noise

# rename name → team_name

constructors_clean <- constructors_raw |>
  select(constructorId, name) |>
  rename(team_name = name)

head(constructors_clean, 5)

##   constructorId  team_name
## 1             1    McLaren
## 2             2 BMW Sauber
## 3             3   Williams
## 4             4    Renault
## 5             5 Toro Rosso

#step 5 cleaning dataset "Results"

# Remove time — Inconsistent Formatting:

# Winner: "1:34:50.616", others: "+5.478" — cannot be compared

# Remove statusId — no lookup table

# Remove fastestLapTime, fastestLapSpeed — missing for most

# Leave positionText — needed for reliability analysis (R = Retired)

results_clean <- results_raw |>
  select(resultId, raceId, driverId, constructorId,
         grid, positionText, positionOrder, points, laps)

head(results_clean, 5)

##   resultId raceId driverId constructorId grid positionText positionOrder points
## 1        1     18        1             1    1            1             1     10
## 2        2     18        2             2    5            2             2      8
## 3        3     18        3             3    7            3             3      6
## 4        4     18        4             4   11            4             4      5
## 5        5     18        5             1    3            5             5      4
##   laps
## 1   58
## 2   58
## 3   58
## 4   58
## 5   58

# step 5 merging datasets

# Replace IDs with real names of drivers and teams

merged <- results_clean |>
left_join(drivers_clean,      by = "driverId") |>
left_join(constructors_clean, by = "constructorId")

cat("Merged:", nrow(merged), "rows x", ncol(merged), "columns\n")

## Merged: 26759 rows x 12 columns

head(merged |> select(raceId, driver_name, team_name, positionText, points), 8)

##   raceId        driver_name  team_name positionText points
## 1     18     Lewis Hamilton    McLaren            1     10
## 2     18      Nick Heidfeld BMW Sauber            2      8
## 3     18       Nico Rosberg   Williams            3      6
## 4     18    Fernando Alonso    Renault            4      5
## 5     18  Heikki Kovalainen    McLaren            5      4
## 6     18    Kazuki Nakajima   Williams            6      3
## 7     18 Sébastien Bourdais Toro Rosso            7      2
## 8     18     Kimi Räikkönen    Ferrari            8      1

# step 6 making dataset from wide to tidy

# The grid, positionOrder, points, and laps columns are "wide" metrics.

# We convert them into a single metric column and a single value column.

merged_tidy <- merged |>
  pivot_longer(
    cols      = c(grid, positionOrder, points, laps),
    names_to  = "metric",
    values_to = "value"
  ) |>
  drop_na(value)  #we remove NA - structurally absent values

cat("Tidy:", nrow(merged_tidy), "rows x", ncol(merged_tidy), "columns\n")

## Tidy: 107036 rows x 10 columns

head(merged_tidy, 10)

## # A tibble: 10 × 10
##    resultId raceId driverId constructorId positionText driver_name   nationality
##       <int>  <int>    <int>         <int> <chr>        <chr>         <chr>      
##  1        1     18        1             1 1            Lewis Hamilt… British    
##  2        1     18        1             1 1            Lewis Hamilt… British    
##  3        1     18        1             1 1            Lewis Hamilt… British    
##  4        1     18        1             1 1            Lewis Hamilt… British    
##  5        2     18        2             2 2            Nick Heidfeld German     
##  6        2     18        2             2 2            Nick Heidfeld German     
##  7        2     18        2             2 2            Nick Heidfeld German     
##  8        2     18        2             2 2            Nick Heidfeld German     
##  9        3     18        3             3 3            Nico Rosberg  German     
## 10        3     18        3             3 3            Nico Rosberg  German     
## # ℹ 3 more variables: team_name <chr>, metric <chr>, value <dbl>

# step 7 analysis drivers

# Top 6 drivers by points 

top6 <- merged_tidy |>
  filter(metric == "points") |>
  group_by(driver_name) |>
  summarise(total_points = sum(value, na.rm = TRUE)) |>
  arrange(desc(total_points)) |>
  slice_head(n = 6) |>
  pull(driver_name)

cat("Top 6:", paste(top6, collapse = ", "), "\n\n")

## Top 6: Lewis Hamilton, Sebastian Vettel, Max Verstappen, Fernando Alonso, Kimi Räikkönen, Valtteri Bottas

# Wide pivot table: rows = races, columns = drivers
pivot_wide <- merged_tidy |>
  filter(metric == "points", driver_name %in% top6) |>
  select(raceId, driver_name, value) |>
  pivot_wider(
    names_from  = driver_name,
    values_from = value,
    values_fill = 0
  ) |>
  arrange(raceId)

print(pivot_wide)

## # A tibble: 462 × 7
##    raceId `Lewis Hamilton` `Fernando Alonso` `Kimi Räikkönen` `Sebastian Vettel`
##     <int>            <dbl>             <dbl>            <dbl>              <dbl>
##  1      1                0                 4                0                  0
##  2      2                1                 0                0                  0
##  3      3                3                 0                0                 10
##  4      4                5                 1                3                  8
##  5      5                0                 4                0                  5
##  6      6                0                 2                6                  0
##  7      7                0                 0                0                  6
##  8      8                0                 0                1                 10
##  9      9                0                 2                0                  8
## 10     10               10                 0                8                  0
## # ℹ 452 more rows
## # ℹ 2 more variables: `Valtteri Bottas` <dbl>, `Max Verstappen` <dbl>

# for the graphic 

pivot_long <- pivot_wide |>
  pivot_longer(
    cols      = -raceId,
    names_to  = "driver_name",
    values_to = "points"
  )
  
#step 8 visualization

pivot_cumulative <- pivot_long |>
  arrange(driver_name, raceId) |>
  group_by(driver_name) |>
  mutate(cumulative_points = cumsum(points))

ggplot(pivot_cumulative,
       aes(x = raceId, y = cumulative_points,
           color = driver_name, group = driver_name)) +
  geom_line(linewidth = 1.2) +
  geom_point(size = 2) +
  labs(
    title    = "Driver Performance — Cumulative Points per Race",
    subtitle = "Top 6 drivers, Pivot Analysis",
    x        = "Race ID",
    y        = "Cumulative Points",
    color    = "Driver",
    caption  = "Source: Ergast F1 API"
  ) +
  theme_minimal(base_size = 13) +
  theme(legend.position = "bottom")

#step 9 Reliable teams

# "R" в positionText = The car broke down, we turn it into 1, otherwise 0

reliability <- merged |>
  mutate(retired = ifelse(positionText == "R", 1, 0)) |>
  group_by(team_name) |>
  summarise(
    total_races   = n(),
    total_retired = sum(retired, na.rm = TRUE),
    failure_rate  = round(total_retired / total_races * 100, 1)
  ) |>
  filter(total_races >= 20) |>   # только команды с достаточной выборкой
  arrange(desc(failure_rate)) |>
  slice_head(n = 15)

print(reliability)

## # A tibble: 15 × 4
##    team_name          total_races total_retired failure_rate
##    <chr>                    <int>         <dbl>        <dbl>
##  1 Eagle-Weslake               21            18         85.7
##  2 Simca                       29            20         69  
##  3 Brabham-Alfa Romeo          59            38         64.4
##  4 Gordini                    102            61         59.8
##  5 Vanwall                     71            42         59.2
##  6 Leyton House                64            36         56.2
##  7 Stewart                     98            55         56.1
##  8 Connaught                   54            30         55.6
##  9 RAM                         71            39         54.9
## 10 Shadow-Ford                 26            14         53.8
## 11 Cooper-BRM                  21            11         52.4
## 12 Larrousse                  216           108         50  
## 13 Simtek                      40            20         50  
## 14 Lola                       165            82         49.7
## 15 Toleman                    131            65         49.6

#step 9 visualization Reliable teams

ggplot(reliability,
       aes(x = reorder(team_name, failure_rate),
           y = failure_rate,
           fill = failure_rate)) +
  geom_col(show.legend = FALSE) +
  geom_text(aes(label = paste0(failure_rate, "%")),
            hjust = -0.1, size = 3.5) +
  coord_flip() +
  scale_fill_gradient(low = "#fff3cd", high = "#dc3545") +
  scale_y_continuous(limits = c(0, max(reliability$failure_rate) * 1.15)) +
  labs(
    title    = "Team Reliability — % of Races Retired",
    subtitle = "positionText = 'R' means car broke down",
    x        = "Team",
    y        = "Failure Rate (%)",
    caption  = "Source: Ergast F1 API | Min. 20 entries"
  ) +
  theme_minimal(base_size = 13)

#step 10 worst and best teams
  worst <- reliability |> slice(1)
best  <- reliability |> slice(n())

cat("Most fragile:", worst$team_name,
    "—", worst$failure_rate, "% failure rate (",
    worst$total_retired, "of", worst$total_races, "races)\n")

## Most fragile: Eagle-Weslake — 85.7 % failure rate ( 18 of 21 races)

cat("Most reliable:", best$team_name,
    "—", best$failure_rate, "% failure rate (",
    best$total_retired, "of", best$total_races, "races)\n")

## Most reliable: Toleman — 49.6 % failure rate ( 65 of 131 races)

project2

chingiz rabdanov

2026-03-08