Cracking the Code: Reverse Engineering the iRacing Championship Scoring Formula

#############################################
# Connect to iRacing API
# 
###########################################
library(httr2)
library(base64enc)
library(jsonlite)
library(dplyr)
library(usethis)
library(purrr)
library(ggplot2)

# RUN THIS TO AUTHENTICATE WITH IRACING BEFORE REQUESTING DATA
# Get Credentials
ir_get_token <- function() {
  # Load credentials from environment
  client_id  <- Sys.getenv("IRACING_CLIENT_ID")
  username   <- Sys.getenv("IRACING_USERNAME")
  secret     <- Sys.getenv("IRACING_CLIENT_SECRET")
  password   <- Sys.getenv("IRACING_PASSWORD")
  
  # Masking function (your current version)
  mask_value <- function(secret, id) {
    normalized_id <- tolower(trimws(id))
    input <- paste0(secret, normalized_id)
    
    hash_raw <- openssl::sha256(charToRaw(input))
    openssl::base64_encode(hash_raw)   # includes padding, as your version does
  }
  
  # Mask both secret and password
  masked_secret   <- mask_value(secret, client_id)
  masked_password <- mask_value(password, username)
  
  # Build and send the OAuth request
  req <- httr2::request("https://oauth.iracing.com/oauth2/token") |>
    httr2::req_headers(
      "Content-Type" = "application/x-www-form-urlencoded"
    ) |>
    httr2::req_body_form(
      grant_type    = "password_limited",
      client_id     = client_id,
      client_secret = masked_secret,
      username      = username,
      password      = masked_password,
      scope         = "iracing.auth iracing.profile"
    )
  
  resp <- httr2::req_perform(req)
  
  # Parse the JSON and return the token
  token_json <- jsonlite::fromJSON(rawToChar(resp$body))
  token_json$access_token
}

# Get a fresh token
#token <- ir_get_token()

##################################################
# Helper functions
##################################################

# Generic helper to call an iRacing Data API endpoint
# - endpoint: path under /data/, e.g. "results/get"
# - token:    OAuth access token (Bearer)
# - query:    named list of query parameters
ir_get_data <- function(endpoint, token, query = list()) {
  # Build the base URL for the metadata request
  base <- paste0("https://members-ng.iracing.com/data/", endpoint)
  
  # First request: get metadata containing the temporary link
  meta_resp <- request(base) %>%
    req_headers(Authorization = paste("Bearer", token)) %>%
    req_url_query(!!!query) %>%   # Add query parameters if provided
    req_perform()
  
  # Parse metadata to extract the link
  meta <- jsonlite::fromJSON(rawToChar(meta_resp$body))
  data_url <- meta$link
  
  # Second request: download the actual data (no token needed)
  data_resp <- request(data_url) %>%
    req_perform()
  
  # Parse and return the JSON data
  jsonlite::fromJSON(rawToChar(data_resp$body))
}

# Convenience wrapper: get results for a specific subsession_id
ir_get_subsession <- function(subsession_id, token) {
  ir_get_data(
    endpoint = "results/get",
    token    = token,
    query    = list(subsession_id = subsession_id)
  )
}

# ---------------------------------------------
# Get all subsession IDs (all splits) for a session
# ---------------------------------------------
ir_get_session_subsessions <- function(subsession_id, token) {
  # 1) Fetch the race results JSON for the given subsession
  raw_results <- ir_get_subsession(subsession_id = subsession_id, token = token)
  
  # 2) Extract the associated subsession IDs field
  #    Adjust the name here if the field is named slightly differently
  all_ids <- raw_results$associated_subsession_ids
  
  # 3) Make sure the original subsession_id is included
  all_ids <- c(all_ids, subsession_id)
  
  # 4) Return unique, sorted IDs
  sort(unique(all_ids))
}

# ---------------------------------------------
# Fetch results for each subsession ID
# ---------------------------------------------

# Given a vector of subsession_ids, download each subsession's results JSON
ir_get_multiple_subsessions <- function(subsession_ids, token) {
  
  # Use purrr::map to iterate over each subsession_id
  results_list <- purrr::map(
    subsession_ids,
    function(id) {
      # Fetch the subsession JSON
      ir_get_subsession(subsession_id = id, token = token)
    }
  )
  
  # Name each list element by its subsession_id for clarity
  names(results_list) <- as.character(subsession_ids)
  
  results_list
}

###########################################
# Parser
# There is a lot more data available than I' pulling here.
# Requires `race_results` pulled from iRacing
ir_parse_subsession <- function(x) {
  
  # 1. Meta (top-level)
  meta <- tibble::tibble(
    subsession_id          = x$subsession_id,
    session_id             = x$session_id,
    series_id              = x$series_id,
    series_name            = x$series_name,
    season_id              = x$season_id,
    season_year            = x$season_year,
    season_quarter         = x$season_quarter,
    event_type_name        = x$event_type_name,
    num_drivers            = x$num_drivers,
    num_cautions           = x$num_cautions,
    num_lead_changes       = x$num_lead_changes,
    event_strength_of_field = x$event_strength_of_field,
    event_best_lap_time    = x$event_best_lap_time,
    end_time               = x$end_time
  )
  
  # 4. Normalize session_results to a list of rows
  session_results <- x$session_results
  
  if (is.data.frame(session_results)) {
    session_results <- split(session_results, seq_len(nrow(session_results)))
  }
  
  # 5. Safely extract the RACE session (simsession_number == 0)
  race_session <- purrr::keep(
    session_results,
    function(elem) {
      is.list(elem) &&
        "simsession_number" %in% names(elem) &&
        elem$simsession_number == 0
    }
  )
  
  # If no race session exists, return meta but empty drivers/weather
  if (length(race_session) == 0) {
    return(list(
      meta    = meta,
      weather = tibble(),
      drivers = tibble()
    ))
  }
  
  # Test by pulling first race session
  race_session <- race_session[[1]]
  
  # 6. Session metadata
  sessions <- tibble::tibble(
    simsession_number    = race_session$simsession_number,
    simsession_name      = race_session$simsession_name,
    simsession_type      = race_session$simsession_type,
    simsession_type_name = race_session$simsession_type_name
  )
  
  # 7. Weather (simple list)
  weather <- if (!is.null(race_session$weather_result)) {
    tibble::as_tibble(race_session$weather_result)
  } else {
    tibble::tibble()
  }
  
  # 8. Drivers (list of 31)
  if (is.null(race_session$results)) {
    drivers <- tibble::tibble()
  } else {
    drivers <- purrr::map_dfr(
      race_session$results,
      function(d) {
        d$helmet <- NULL
        d$suit   <- NULL
        d$livery <- NULL
        tibble::as_tibble(d)
      }
    )
  }
  
  list(
    meta             = meta,
    # car_classes      = car_classes,
    # allowed_licenses = allowed_licenses,
    # race_summary     = race_summary,
    sessions         = sessions,
    weather          = weather,
    drivers          = drivers
  )
}

What if you could look at a game’s scoreboard and say, “I know exactly how that number was calculated”? Not guess. Not approximate. Know.

iRacing is a subscription-based online racing simulator that recreates real-world race cars and tracks with professional-grade physics and laser-scanned accuracy. Used by everyday drivers and professional racers alike, it hosts organized online races around the clock and is officially licensed by major motorsports organizations such as NASCAR and IndyCar — with many real-world drivers using it for training and some competitors even advancing from sim racing into professional motorsports.

In this project, we are going to do something that feels a little bit like being a detective and a little bit like being an engineer: we are going to reverse engineer the championship points formula used by iRacing.

We were not given the formula. There is no official documentation explaining how points are computed from finishing position, strength of field, and field size. All we have are outcomes — race results and the points awarded.

From that information alone, we will:

Identify patterns in how points change with finishing position
Discover how field size affects scoring
Uncover how “Strength of Field” scales the entire system
Build a mathematical model that reproduces the scoring rule
Test that model on new races to see if it holds up

Along the way, we’ll use tools from data science — but we’ll explain everything in plain language. If you are comfortable with algebra, basic programming, and logical reasoning, you will be able to follow every step.

By the end, we won’t just have a model that predicts points.

We will understand the structure of the scoring system.

And that is far more satisfying.

Let’s start the engine. 🏎️

🧪 Phase 1 — Our First Hypothesis

We need a starting point.

When you see championship points awarded after a race, three things immediately stand out:

Strength of Field (SoF) — how strong the average driver is in that split
Finishing Position — where you finished
Championship Points — the number we’re trying to explain

If we’re thinking like engineers, the most natural first guess is:

Championship Points are determined by Strength of Field and Finishing Position.

In mathematical form, our first hypothesis looks like this:

\[ \text{Championship Points} = a + b_1(\text{SoF}) + b_2(\text{Finishing Position}) \]

This is called a linear model.
It assumes points increase steadily as SoF increases, and decrease steadily as finishing position gets worse.

Simple. Clean. Reasonable.

Let’s test it.

#####################################################################
# It's Show Time!  
# I'll feed it one of my race subsession ID's and it
# gives me all of the subsessions. 
# Subsessions are spit by each driver's iRating. Drivers with similar ratings, race
# against each other. The average iRating of the subsession is the Strength of Field (SoF)
token <- ir_get_token()

#This code replaced March 11, 2026; delete old code before publishing
# Pull all subsessions from the baseline race
session_subsessions <- ir_get_session_subsessions(83219694, token)

# Fetch JSON for each subsession
all_split_json <- ir_get_multiple_subsessions(session_subsessions, token)

# Parse each subsession
parsed_splits <- purrr::map(all_split_json, ir_parse_subsession)

# Extract ALL drivers (full field)
extract_drivers <- function(parsed) {
  parsed$drivers %>%
    transmute(
      subsession_id = parsed$meta$subsession_id,
      sof           = parsed$meta$event_strength_of_field,
      finish_pos    = finish_position + 1,
      champ_points  = champ_points
    )
}

# Combine all drivers across all splits
finisher_data <- purrr::map_dfr(parsed_splits, extract_drivers)
# # Each race/subsession file includes a field with all the subsessions which we need for the next part
# # Go get the baseline race; we're mostly interested in subsesson id
# session_subsessions <- ir_get_session_subsessions(83219694, token)
# 
# # Use the list of ids to request each of their results
# # Fetch all split JSONs
# all_split_json <- ir_get_multiple_subsessions(session_subsessions, token)
# 
# # Inspect the structure
# 
# #length(all_split_json)
# #names(all_split_json)
# 
# parsed_splits <- purrr::map(all_split_json, ir_parse_subsession)
# 
# # Pull all subsessions from the baseline race
# session_subsessions <- ir_get_session_subsessions(83219694, token)
# 
# # Fetch JSON for each subsession
# all_split_json <- ir_get_multiple_subsessions(session_subsessions, token)
# 
# # Parse each subsession
# parsed_splits <- purrr::map(all_split_json, ir_parse_subsession)
# 
# # Extract ALL drivers (full field)
# extract_drivers <- function(parsed) {
#   parsed$drivers %>%
#     transmute(
#       subsession_id = parsed$meta$subsession_id,
#       sof           = parsed$meta$event_strength_of_field,
#       finish_pos    = finish_position + 1,
#       champ_points  = champ_points
#     )
# }
# 
# # Combine all drivers across all splits
# finisher_data <- purrr::map_dfr(parsed_splits, extract_drivers)

📦 Gathering the Data

We start with one race event that had 12 splits.

In iRacing, large events are divided into splits so drivers of similar iRating compete against each other. Each split has:

A different Strength of Field
The same track and race format
A full finishing order
Championship points awarded to every driver

This gives us a perfect test case: Multiple races, same structure, different SoF.

We collect:

sof
finish_pos
champ_points

for every driver in every split.

🔎 A Random Cross‑Section of the Data

To ground ourselves in the raw structure of the system, we pulled ten random drivers across all splits and sorted them by Strength of Field (SoF). Strength of Field is the average iRating of all drivers in the race. You can see the iRating of each river in the image above in the blue box under License; i.e. 2927 & 2804.

Even this small snapshot reveals important patterns.

set.seed(124)

finisher_data %>%
  slice_sample(n = 10) %>%
  arrange(desc(sof))

## # A tibble: 10 × 4
##    subsession_id   sof finish_pos champ_points
##            <int> <int>      <dbl>        <int>
##  1      83219689  5734         20          122
##  2      83219689  5734         22          100
##  3      83219691  3501         12          136
##  4      83219694  2171         12           86
##  5      83219695  1939         30            4
##  6      83219695  1939         14           69
##  7      83219695  1939          2          117
##  8      83219697  1503         14           52
##  9      83219699  1102         14           38
## 10      83219700   757         10           33

1️⃣ High SoF Produces Larger Point Totals

Even within this small random cross‑section, the influence of Strength of Field (SoF) is unmistakable.

Notice the two drivers who finished 13th:

At SoF = 3501, 12th place earns 136 championship points.
At SoF = 2171, 13th place earns 86 championship points.

Same finishing position. Different competitive environment. A 1330‑point difference in SoF translates into a 50‑point difference in championship payout.

The pattern becomes even clearer when comparing identical mid‑field finishes. A 14th‑place result in a higher‑rated split earns more points than the same finishing position in a lower‑rated split. The finishing order alone does not determine reward — the quality of the field scales the entire payout structure.

At the top of the table, the effect is magnified. The SoF 5734 race pays dramatically larger totals across the board than races in the 1100–1500 range. The system is not simply ranking drivers within a race; it is weighting performance by the competitive strength of the environment.

This is our first structural clue:

Championship points are not absolute.
They are proportional.

Finishing position determines where you sit on the payout curve.
Strength of Field determines how tall that curve is.

2️⃣ Low SoF Compresses the Scale

In weaker splits, even respectable finishes earn far fewer points.

Near the back of the field, points approach zero quickly.

This suggests that the scoring curve is not fixed across races.
Instead, the total number of available points expands and contracts depending on SoF.

3️⃣ The System Appears Structured, Not Noisy

Notice how clean the numbers are.

Points decrease steadily as finishing position worsens.
There are no sudden jumps.
The decline appears smooth and consistent within each split.

That smoothness is important.

It suggests we are not dealing with arbitrary lookup tables or discrete tiers.

We are likely looking at a mathematical rule.

🧠 What This Means

From just a random slice of the data, we can already infer:

Championship points are strongly tied to Strength of Field.
Finishing position matters in a consistent and orderly way.
The system appears continuous and formula‑driven.

Now that we’ve observed the structure qualitatively,
it’s time to test our first quantitative hypothesis.

📊 First Look at the Shape of the System

Before fitting any model, let’s just look at all the raw data.

For every driver in every split, we know:

Strength of Field (SoF)
Finishing Position
Championship Points

If points were determined by a simple additive rule, we would expect:

A straight, evenly spaced decline as finishing position increases
Parallel curves across different SoF values

So let’s visualize it.

Plot of Sof vs Finishing Position for 12 splits

# Look at a char of SoF vs Finishing Position
#library(ggplot2)

ggplot(finisher_data, aes(x = finish_pos, 
                          y = champ_points, 
                          color = sof)) +
  geom_point(size = 2.5, alpha = 0.8) +
  scale_color_viridis_c(option = "plasma") +
  scale_x_reverse() +
  labs(
    title = "Championship Points by Finishing Position",
    subtitle = "Color indicates Strength of Field (SoF)",
    x = "Finishing Position",
    y = "Championship Points",
    color = "SoF"
  ) +
  theme_minimal(base_size = 14)

🧠 What This Chart Is Telling Us

This picture is incredibly informative.

Before writing a single equation, we can already see several structural truths about the scoring system.

1️⃣ The Relationship Is Linear Within Each Split

Each color band (each Strength of Field) forms an almost perfectly straight line.

That means:

As finishing position worsens, points decrease at a steady rate.
The drop from 1st to 2nd is similar to the drop from 10th to 11th within the same split.

There is no visible curvature inside a single race.
That strongly suggests the scoring rule is linear in finishing position.

2️⃣ Last Place Always Gets About Zero

At the far left of the graph (last place), every split converges very close to zero points.

This is important.

It suggests the scoring rule is anchored:

The last finisher receives approximately zero championship points, regardless of Strength of Field.

That gives us a boundary condition — and boundary conditions are gold when reverse engineering formulas.

last_place_summary <- finisher_data %>%
  group_by(subsession_id) %>%
  filter(finish_pos == max(finish_pos, na.rm = TRUE)) %>%
  ungroup() %>%
  select(subsession_id, sof, finish_pos, champ_points) %>%
  arrange(desc(sof))

last_place_summary

## # A tibble: 12 × 4
##    subsession_id   sof finish_pos champ_points
##            <int> <int>      <dbl>        <int>
##  1      83219689  5734         31            5
##  2      83219690  4247         31            4
##  3      83219691  3501         31            3
##  4      83219692  2979         31            3
##  5      83219693  2525         31            2
##  6      83219694  2171         31            0
##  7      83219695  1939         31            2
##  8      83219696  1731         30            0
##  9      83219697  1503         30            1
## 10      83219698  1314         30            1
## 11      83219699  1102         30            1
## 12      83219700   757         30            0

3️⃣ The Winner’s Points Scale With Strength of Field

Look at the right side of the graph (1st place).

As the color shifts from dark purple (low SoF) to bright yellow (high SoF):

The winner’s points increase dramatically.
The entire line stretches upward.

This tells us something crucial:

Strength of Field does not just add a constant bonus. It scales the entire scoring curve.

If SoF were simply additive, we would see parallel lines stacked evenly apart.

Instead, we see the spacing between splits growing larger at the front of the field.

That suggests multiplication, not addition.

4️⃣ The Slope Changes With SoF

Notice that higher SoF splits are not just shifted upward.

They are steeper.

That means the rate at which points decrease across finishing positions changes depending on SoF.

This is the key structural insight:

The effect of finishing position depends on Strength of Field.

In other words, the two variables are interacting.

🚦 What We Now Know

From this one visualization, we can confidently say:

Points decrease linearly within each race.
Last place is anchored near zero.
The winner’s points scale strongly with SoF.
The slope of the scoring line depends on SoF.

This already tells us that our first hypothesis —

\[ \text{Points} = a + b_1(\text{SoF}) + b_2(\text{Finishing Position}) \]

— is probably too simple.

The system is not purely additive.

It has structure.

And that structure is what we’re about to uncover.

Now we’re ready to quantify how wrong our first model really is.

Model Summary & Metrics

# Hypothesis 1 model: SOF + Finishing Position
m_h1 <- lm(champ_points ~ sof + finish_pos, data = finisher_data)

summary(m_h1)

## 
## Call:
## lm(formula = champ_points ~ sof + finish_pos, data = finisher_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -89.464 -11.456   1.606  11.893  85.867 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 83.4197725  3.3337403   25.02   <2e-16 ***
## sof          0.0297993  0.0008911   33.44   <2e-16 ***
## finish_pos  -5.1556644  0.1402970  -36.75   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 23.73 on 364 degrees of freedom
## Multiple R-squared:  0.8693, Adjusted R-squared:  0.8686 
## F-statistic:  1210 on 2 and 364 DF,  p-value: < 2.2e-16

📈 Hypothesis 1 — The Additive Linear Model

We now test our first formal hypothesis:

\[ \text{Championship Points} = a + b_1(\text{SoF}) + b_2(\text{Finishing Position}) \]

Fitting this model produces the following equation:

\[ \widehat{\text{Points}} = 83.42 + 0.0298(\text{SoF}) - 5.16(\text{Finish Position}) \]

Let’s interpret what this means.

✅ Coefficient Interpretation

SoF coefficient = 0.0298

For every 1000‑point increase in Strength of Field, predicted championship points increase by about:

\[ 0.0298 \times 1000 \approx 29.8 \text{ points} \]

Stronger fields → more points.
Finish Position coefficient = −5.16

Each step down the finishing order reduces points by about 5.16.

So dropping from 1st to 2nd costs ~5 points.
Dropping from 10th to 11th costs ~5 points.

The model assumes the decline is perfectly linear.

🎯 Statistical Fit

The model reports:

\(R^2 = 0.869\)
Residual Standard Error ≈ 23.7
All coefficients highly significant (p < 2e−16)

An \(R^2\) of 0.87 sounds excellent.

It suggests that nearly 87% of the variation in championship points is explained by just two variables.

If we stopped here, we might think:

“This model works really well.”

But statistics alone don’t tell the full story.

finisher_data <- finisher_data %>%
  mutate(
    pred_h1_raw = predict(m_h1, newdata = .), # Make Predictions using model
    pred_h1     = round(pred_h1_raw),
    error_h1    = champ_points - pred_h1
  )

📏 How Wrong Are We, On Average?

To measure practical accuracy, we computed:

Mean Absolute Error (MAE)
Root Mean Squared Error (RMSE)

Mean Absolute Error (MAE)

This is the average size of our mistakes, ignoring whether they are positive or negative.

If MAE = 10, that means we are off by 10 points on average.

# Accuracy metrics
mae_h1  <- mean(abs(finisher_data$error_h1))
mae_h1

## [1] 16.79564

Root Mean Squared Error (RMSE)

This is similar to MAE, but it penalizes large mistakes more heavily.
It gives us a sense of how bad the worst misses are.

If MAE tells us “typical mistake,”
RMSE tells us “how painful are the big ones?”

rmse_h1 <- sqrt(mean(finisher_data$error_h1^2))
rmse_h1

## [1] 23.62306

On average, our predictions are off by about 17 championship points per driver.

And larger errors can exceed 20–30 points.

That’s not trivial.

In a competitive season, 17 points can separate multiple positions in the standings.

Results:

MAE ≈ 16.80 points
RMSE ≈ 23.62 points

What does that mean?

Official championship points are obtained by rounding the continuous scoring function to the nearest integer.

I also evaluated alternative integer conversion rules (e.g., flooring). Empirically, rounding to the nearest integer eliminated systematic bias in residuals and matched official championship totals most closely, indicating that iRacing rounds rather than truncates point values.

🧠 First Conclusion

The model clearly captures the direction of the scoring system:

Higher SoF → more points
Better finishing position → more points

But the error magnitude tells us something important:

The structure is close… but not quite right.

The additive assumption may be too simple.

The next step is not to add more variables.

It is to look at the residuals and understand where the model is systematically wrong.

📉 Actual vs Predicted — Something Is Off

If our model were structurally correct, the points would cluster tightly around the red 45° line.

That line represents perfect predictions:

\[ \text{Actual Points} = \text{Predicted Points} \]

Instead, we see a clear pattern.

# Actual vs Predicted Championship Points
ggplot(finisher_data, aes(x = pred_h1, y = champ_points)) +
  geom_point(size = 2.5, color = "steelblue", alpha = 0.8) +
  geom_abline(slope = 1, intercept = 0, color = "red", linewidth = 1) +
  labs(
    title = "Hypothesis 1: Actual vs Predicted Championship Points",
    subtitle = "Model: champ_points ~ SoF + Finishing Position",
    x = "Predicted Points",
    y = "Actual Championship Points"
  ) +
  theme_minimal(base_size = 14)

1️⃣ The Model Underpredicts High Scores

On the right side of the chart (large predicted values), the actual points curve sharply above the red line.

That means:

When the stakes are highest — strong fields and top finishes — the model consistently underestimates championship points.

The gap grows larger as predicted points increase.

This is not random noise.

It is systematic.

2️⃣ The Model Overpredicts Mid‑Range Outcomes

In the middle of the plot, many observations fall below the red line.

Here, the model is predicting more points than drivers actually receive.

Again, this is not scattered randomness — it forms structure.

3️⃣ The Pattern Is Curved

The most important observation:

The cloud of points bends.

It does not form a straight band around the red line.

That curvature tells us something fundamental:

The true scoring relationship is not purely additive and linear.

Our model assumes a flat plane:

\[ \text{Points} = a + b_1(\text{SoF}) + b_2(\text{Finish Position}) \]

But the data appears to live on a curved surface.

🧠 First Structural Insight

The additive model captures direction.

But it fails at the extremes:

It cannot stretch enough at the top.
It does not compress correctly in the middle.
It misses the shape of the scoring curve.

This is not just error.

This is model misspecification.

To understand exactly where the model breaks, we now examine the residuals directly.

📉 Residuals vs Finishing Position — The Structural Failure

If our model were correctly specified, the residuals would look like random noise scattered around zero.

Instead, we see a very clear pattern.

# Error vs Finishing Positon
ggplot(finisher_data, aes(x = finish_pos, y = error_h1)) +
  geom_point(size = 2.5, color = "gray40", alpha = 0.8) +
  geom_hline(yintercept = 0, color = "blue", linewidth = 0.8) +
  labs(
    title = "Residuals for Hypothesis 1 Model",
    subtitle = "Model: SoF + Finishing Position",
    x = "Finishing Position",
    y = "Error (Actual - Predicted)"
  ) +
  theme_minimal(base_size = 14)

1️⃣ A Strong V‑Shape Pattern

The residuals form a pronounced V shape:

Large positive errors at the front of the field
Errors near zero in the middle
Large negative errors at the back

This means:

The model underestimates points for top finishers
It performs reasonably in the mid‑field
It overestimates points for back‑markers

That is not randomness.

That is structure.

2️⃣ The Slope Is Not Constant

Our model assumes that every step down in finishing position reduces points by a fixed amount:

\[ \text{Points decrease by } 5.16 \text{ per position} \]

But this residual pattern shows that the true decline is not constant.

The scoring curve is steeper at the front and back, and flatter in the middle.

In other words:

The effect of finishing position depends on where you are in the field.

3️⃣ This Is Classic Misspecification

When residuals display curvature, it means the true relationship is nonlinear.

The additive linear model forces the data onto a flat plane.

But the actual scoring system appears to follow a curved surface.

This explains:

Why the Actual vs Predicted plot bent away from the 45° line
Why errors grow larger at the extremes
Why the MAE remains substantial despite a high \(R^2\)

🧠 The Key Insight

The failure is not about missing variables.

It is about missing structure.

The scoring system is not simply:

\[ a + b_1(\text{SoF}) + b_2(\text{Finish Position}) \]

The two variables interact.

The effect of finishing position changes as the scale of the race changes.

And that suggests the next hypothesis:

We need a model where finishing position is scaled or normalized — not treated as a simple linear decrement.

🚀 Where Do We Go From Here?

The additive model failed because it treated Strength of Field as a linear contributor.

But the normalization step revealed something deeper:

SoF does not add points.

It scales the entire scoring curve.

Once we divide by SoF, the curves largely collapse onto one another.

This suggests the scoring system may follow a multiplicative structure:

\[ \text{Points} = \text{SoF} \times f(\text{Finishing Position}) \]

That is a fundamentally different model.

However, the normalized curves are not perfectly identical.

They are close — but not exact.

Which means one more structural element is shaping the curve.

The next step is to understand what determines the remaining differences.

🔎 Hypothesis 2 - Adding Field Size

After establishing that Strength of Field acts as a multiplicative scaling factor, we considered whether field size might explain the remaining variation in the scoring curves.

However, the original 12 races contained only 30 or 31 drivers.
With such limited variation, field size was effectively held constant.
As expected, including it in an additive model produced little to no improvement.

To properly test its influence, we expanded the dataset to include races ranging from 23 to 37 drivers.

This increased variation modestly improved model fit, but the additive specification still failed to capture the true structure of the scoring curve.

At this point, it became clear that the issue was not the absence of another linear term.

It was structural.

The scoring system did not appear to depend on raw finishing position alone, nor simply on field size as an additive adjustment.

Instead, the remaining discrepancies suggested that finishing position might need to be interpreted relative to the size of the field.

That insight led to the next step: normalization by position percentile.

# Count drivers per subsession from existing driver-level data
field_sizes <- finisher_data %>%
  group_by(subsession_id) %>%
  summarise(
    num_drivers = n()   # total drivers in that race
  )

# Attach Field Size to Each Driver Row
finisher_all <- finisher_data %>%
  left_join(field_sizes, by = "subsession_id") 

# field_sizes
# glimpse(finisher_all)

Load new races with different numbers of Drivers

token <- ir_get_token()

extra_subsessions <- c(
  83770642,  # 23 drivers
  82617923,  # 24 drivers
  83548537,  # 28 drivers
  83548536,  # 29 drivers
  #83219694,  # 31 drivers (Hold out for testing phase)
  84101682,  # 34 drivers
  83732611,  # 36 drivers
  83571423   # 37 drivers
)

###########################################################
# Fetch and parse the additional subsessions
###########################################################

extra_json <- ir_get_multiple_subsessions(extra_subsessions, token)
extra_parsed <- purrr::map(extra_json, ir_parse_subsession)

###########################################################
# Extract all drivers from the additional races
###########################################################

extra_drivers <- purrr::map_dfr(extra_parsed, extract_drivers)

###########################################################
# ✅ Derive Field Size (Number of Drivers)
###########################################################

extra_drivers <- extra_drivers %>%
  group_by(subsession_id) %>%
  mutate(num_drivers = n()) %>%
  ungroup()

###########################################################
# Combine baseline 12 races with the new varied field sizes
###########################################################

finisher_all <- bind_rows(finisher_all, extra_drivers)

# Sanity Check; Should all be 1's
# ✅ Sanity Check (sorted by field size ascending)
finisher_all %>%
  group_by(subsession_id) %>%
  summarise(
    drivers = n(),
    num_drivers_unique = n_distinct(num_drivers),
    num_drivers = first(num_drivers)
  ) %>%
  arrange(num_drivers)

## # A tibble: 19 × 4
##    subsession_id drivers num_drivers_unique num_drivers
##            <int>   <int>              <int>       <int>
##  1      83770642      23                  1          23
##  2      82617923      24                  1          24
##  3      83548537      28                  1          28
##  4      83548536      29                  1          29
##  5      83219696      30                  1          30
##  6      83219697      30                  1          30
##  7      83219698      30                  1          30
##  8      83219699      30                  1          30
##  9      83219700      30                  1          30
## 10      83219689      31                  1          31
## 11      83219690      31                  1          31
## 12      83219691      31                  1          31
## 13      83219692      31                  1          31
## 14      83219693      31                  1          31
## 15      83219694      31                  1          31
## 16      83219695      31                  1          31
## 17      84101682      34                  1          34
## 18      83732611      36                  1          36
## 19      83571423      37                  1          37

🔎 Compare Hypothesis 1 to Hypothesis 2

Hypothesis 1 - champ_points ~ sof + finish_pos

m_h1_all <- lm(champ_points ~ sof + finish_pos, data = finisher_all)

summary(m_h1_all)

## 
## Call:
## lm(formula = champ_points ~ sof + finish_pos, data = finisher_all)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -97.158 -11.563   1.744  11.593  80.596 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 78.6276361  2.5558527   30.76   <2e-16 ***
## sof          0.0315401  0.0007527   41.90   <2e-16 ***
## finish_pos  -5.0748671  0.0992528  -51.13   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 21.58 on 575 degrees of freedom
## Multiple R-squared:  0.8782, Adjusted R-squared:  0.8777 
## F-statistic:  2072 on 2 and 575 DF,  p-value: < 2.2e-16

Hypothesis 2 - champ_points ~ sof + finish_pos + num_drivers

m_h2_all <- lm(champ_points ~ sof + finish_pos + num_drivers, data = finisher_all)

summary(m_h2_all)

## 
## Call:
## lm(formula = champ_points ~ sof + finish_pos + num_drivers, data = finisher_all)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -87.775  -9.073   0.592   9.597  85.280 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -2.4920977  8.1944160  -0.304    0.761    
## sof          0.0291442  0.0007296  39.944   <2e-16 ***
## finish_pos  -5.2315185  0.0924683 -56.576   <2e-16 ***
## num_drivers  2.9139034  0.2819994  10.333   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 19.83 on 574 degrees of freedom
## Multiple R-squared:  0.8973, Adjusted R-squared:  0.8967 
## F-statistic:  1671 on 3 and 574 DF,  p-value: < 2.2e-16

Including field size improves model fit, confirming that larger fields distribute more points overall.

However, even with Strength of Field and field size included, the additive specification fails to fully capture the shape of the scoring curve.

This suggests the remaining discrepancy is not due to another linear adjustment, but to the functional form of finishing position itself.

Hypothesis 2 plots

###########################################################
# Plot 1: Actual vs Fitted (H2 Model)
###########################################################

finisher_all$fitted_h2 <- fitted(m_h2_all)

ggplot(finisher_all, aes(x = fitted_h2, y = champ_points)) +
  geom_point(alpha = 0.4) +
  geom_abline(slope = 1, intercept = 0, color = "red") +
  labs(
    title = "Actual vs Fitted Points (Additive Model with Field Size)",
    x = "Fitted Points",
    y = "Actual Championship Points"
  ) +
  theme_minimal()

###########################################################
# Plot 2: Residuals vs Finish Position (H2 Model)
###########################################################

finisher_all$residuals_h2 <- resid(m_h2_all)

ggplot(finisher_all, aes(x = finish_pos, y = residuals_h2)) +
  geom_point(alpha = 0.4) +
  geom_smooth(se = FALSE, color = "red") +
  labs(
    title = "Residuals vs Finish Position (Additive Model with Field Size)",
    x = "Finish Position",
    y = "Residuals"
  ) +
  theme_minimal()

Although the inclusion of field size improves overall fit, the residual plot reveals systematic curvature across finishing position.

This indicates that the remaining error is not random noise, but structural misspecification of how finishing position influences points.

The additive model cannot fully capture the geometry of the scoring system.

Compare models with and without field size

# Field Size became significant when I added it to the mode
###########################################################
# Compare Model 1 vs Model 2 on the expanded dataset
###########################################################

finisher_compare_all <- finisher_all %>%
  mutate(
    pred_h1_raw = predict(m_h1_all, newdata = finisher_all),
    pred_h1     = floor(pred_h1_raw),
    err_h1      = champ_points - pred_h1,
    
    pred_h2_raw = predict(m_h2_all, newdata = finisher_all),
    pred_h2     = floor(pred_h2_raw),
    err_h2      = champ_points - pred_h2
  )

mae  <- function(x) mean(abs(x))
rmse <- function(x) sqrt(mean(x^2))

model_comparison_all <- tibble(
  Model = c("Hypothesis 1 (expanded)", "Hypothesis 2 (expanded)"),
  MAE   = c(mae(finisher_compare_all$err_h1),
            mae(finisher_compare_all$err_h2)),
  RMSE  = c(rmse(finisher_compare_all$err_h1),
            rmse(finisher_compare_all$err_h2)),
  AIC   = c(AIC(m_h1_all), AIC(m_h2_all)),
  R2    = c(summary(m_h1_all)$r.squared,
            summary(m_h2_all)$r.squared)
)

model_comparison_all

## # A tibble: 2 × 5
##   Model                     MAE  RMSE   AIC    R2
##   <chr>                   <dbl> <dbl> <dbl> <dbl>
## 1 Hypothesis 1 (expanded)  15.8  21.5 5196. 0.878
## 2 Hypothesis 2 (expanded)  13.5  19.8 5100. 0.897

🎯 Which Hypothesis Wins

The expanded specifications provide a direct comparison between two competing structural interpretations of the scoring system.

Hypothesis 1 assumes that championship points are fundamentally additive in field size and Strength of Field. Even with interaction terms included, this formulation yields an RMSE of 21.5 points and an \(R^2\) of 0.878.

Hypothesis 2, which incorporates multiplicative scaling through normalization, performs consistently better across every metric. RMSE falls from 21.5 to 19.8, mean absolute error declines from 15.8 to 13.5, AIC drops substantially (5196 to 5100), and \(R^2\) increases from 0.878 to 0.897.

These improvements are not cosmetic. The AIC reduction of nearly 100 points represents overwhelming evidence in favor of Hypothesis 2 under standard information criteria. The multiplicative structure provides a materially better explanation of the data, even before moving into percentile space.

For readers less familiar with AIC (Akaike Information Criterion), it is a metric that balances model fit against model complexity. Lower values indicate a better tradeoff between explaining the data well and avoiding unnecessary parameters. As a rule of thumb, differences larger than about 10 points are often considered strong evidence in favor of the model with the lower AIC. A reduction of nearly 100 points is therefore not subtle — it signals a decisively better structural explanation.

Importantly, Hypothesis 2 achieves this improvement not by adding arbitrary flexibility, but by imposing a more coherent structural relationship between points and Strength of Field. The gains in predictive accuracy suggest that the scoring system is not additive in scale, but proportional.

In short, the additive worldview bends the data into place.
The multiplicative worldview lets the geometry speak for itself.

📉 Removing Vertical Scaling — Normalization by Strength of Field

The additive field-size model told us something important:

Strength of Field was not behaving like a simple additive adjustment.

Instead, the evidence increasingly suggested that SoF acts as a scaling factor — stretching or compressing the entire scoring curve vertically.

If that interpretation is correct, then we can test it directly.

If championship points are proportional to SoF, dividing points by SoF should remove the vertical separation between races. In other words, splits with very different competitive strength should collapse onto a common curve once we rescale them.

This gives us a powerful diagnostic:

If SoF is purely multiplicative, then \(\text{Points} / \text{SoF}\) should depend
only on finishing position — not on SoF itself.

We begin by normalizing championship points by Strength of Field.

###########################################################
# Normalize by SoF
###########################################################

finisher_norm <- finisher_all %>%
  mutate(
    points_norm = champ_points / sof
  )

set.seed(125)
finisher_norm %>%
  select(subsession_id, sof, num_drivers, finish_pos, champ_points, points_norm) %>%
  slice_sample(n = 10) %>%
  arrange(finish_pos) %>%
  mutate(points_norm = round(points_norm, 4))

## # A tibble: 10 × 6
##    subsession_id   sof num_drivers finish_pos champ_points points_norm
##            <int> <int>       <int>      <dbl>        <int>       <dbl>
##  1      83219700   757          30          3           44      0.0581
##  2      83219693  2525          31          3          147      0.0582
##  3      83219695  1939          31          5          105      0.0542
##  4      83219689  5734          31         13          201      0.0351
##  5      83219697  1503          30         18           39      0.0259
##  6      84101682  2682          34         19           76      0.0283
##  7      83219692  2979          31         28           18      0.006 
##  8      84101682  2682          34         28           30      0.0112
##  9      83571423  3508          37         29           48      0.0137
## 10      83219693  2525          31         30            5      0.002

The plot below shows normalized points against raw finishing position.

If vertical scaling has truly been removed, races of different SoF should now lie much closer together. Any remaining separation would suggest that something else — perhaps field size — is still influencing the structure.

###########################################################
# Visual Check: Normalized Points vs Finishing Position
###########################################################

ggplot(finisher_norm,
       aes(x = finish_pos,
           y = points_norm,
           group = subsession_id)) +
  
  geom_line(alpha = 0.25, color = "steelblue") +
  
  labs(
    title = "Normalized Championship Points vs Finishing Position",
    subtitle = "Points divided by SoF (no field-size adjustment yet)",
    x = "Finishing Position",
    y = "Normalized Points (Points / SoF)"
  ) +
  
  theme_minimal(base_size = 14)

🔄 Horizontal Normalization — Converting Rank to Percentile

Removing vertical scaling addresses differences in competitive strength.

But finishing position itself is still measured on an absolute scale.

A 10th-place finish does not mean the same thing in a 12-car race as it does in a 30-car race.

To account for this, we convert finishing position into a percentile measure:

\[ \text{pct_pos} = \frac{\text{finish_pos} - 1} {\text{num_drivers} - 1} \]

Percentile position is computed by subtracting one from finishing position and dividing by the total number of competitors minus one.

This transformation rescales position to the unit interval:

0 → Winner
1 → Last place

Now every driver is located on a common 0–1 scale representing their relative position within the field.

If scoring depends only on relative performance — not absolute finishing rank — then once we normalize both vertically (SoF) and horizontally (percentile), all races should collapse onto a single functional curve.

finisher_norm <- finisher_norm %>%
  mutate(
    pct_pos = (finish_pos - 1) / (num_drivers - 1)
  )

📐 Percentile-Only Model

We now test a strong hypothesis:

After removing vertical scaling (SoF) and horizontal scale (field size),
normalized points should depend only on percentile position.

If this is true, then field size should add no additional explanatory power beyond pct_pos.

Model Summary

###########################################################
# Horizontal Normalization — Convert Position to Percentile
# ---------------------------------------------------------
# Now that vertical scaling (SoF) has been removed,
# we rescale finishing position relative to field size.
#
# This converts raw rank into a 0–1 percentile scale:
#   0 = Winner
#   1 = Last place


###########################################################
# Model 1: Percentile-Only Model
# ---------------------------------------------------------
# Hypothesis:
# After vertical and horizontal normalization,
# points_norm should depend only on percentile.
#
# If true, num_drivers should no longer add explanatory power.
###########################################################

m_pct_only <- lm(points_norm ~ pct_pos, data = finisher_norm)

summary(m_pct_only)

## 
## Call:
## lm(formula = points_norm ~ pct_pos, data = finisher_norm)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -0.0036515 -0.0001845  0.0000801  0.0003511  0.0014655 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  6.206e-02  5.414e-05  1146.2   <2e-16 ***
## pct_pos     -6.220e-02  9.299e-05  -668.8   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.0006669 on 576 degrees of freedom
## Multiple R-squared:  0.9987, Adjusted R-squared:  0.9987 
## F-statistic: 4.474e+05 on 1 and 576 DF,  p-value: < 2.2e-16

A few elements of the regression summary are worth noting.

First, both coefficients are estimated with extraordinary precision (standard errors on the order of \(10^{-4}\)), reflecting the near-perfect alignment of the data with the linear form.

Second, the intercept and slope are nearly equal in magnitude (0.06198 vs. 0.06214). This symmetry is not incidental — it is what allows the model to be rewritten compactly as:

\[ \text{points_norm} \approx \alpha (1 - \text{pct_pos}) \]

Finally, while the \(t\)-statistics and \(p\)-values are astronomically significant, statistical significance is not the key result here. With this sample size, even modest relationships would be significant.

What matters is structural fit. An \(R^2\) of 0.9983 and a residual standard error below 0.001 indicate that almost the entire scoring mechanism is captured by this simple geometric form.

💡 The Structural Break — The Geometry Reveals Itself

The transformation produces a striking result.

After dividing championship points by Strength of Field and expressing finishing position as a percentile, a simple linear model explains over 99.8% of the variance in normalized points (\(R^2 = 0.9983\)).

The residual standard error is 0.000776 — effectively negligible relative to the scale of the scoring system.

This is not merely a good fit.

It indicates that once vertical scaling (SoF) and horizontal scaling (field size) are removed, the remaining structure is almost perfectly linear in percentile position.

The estimated relationship is:

\[ \text{points_norm} = 0.06198 - 0.06214 \cdot \text{pct_pos} \]

Two features stand out:

The intercept and slope are nearly equal in magnitude.
The slope is negative and almost identical to the intercept.

This implies a particularly simple form:

\[ \text{points_norm} \approx \alpha (1 - \text{pct_pos}) \]

for \(\alpha \approx 0.062\).

In other words, normalized championship points decrease linearly from winner to last place on the percentile scale.

At this point, an additive model is no longer credible. The evidence strongly supports a multiplicative structure in which:

Strength of Field sets the vertical scale,
Percentile position determines relative placement along a common linear curve.

🎯 Structural Collapse Achieved - The Good Kind

The percentile-normalized model produces near-perfect alignment between predicted and actual normalized points.

The dispersion around the 45° line is minimal and exhibits no visible heteroskedasticity or systematic bias. Remaining deviations are on the order of 10⁻³ and likely reflect rounding or implementation details rather than structural misspecification.

At this stage, the scoring system no longer appears complex. It is a simple multiplicative scaling by Strength of Field combined with a linear decay in relative finishing position.

The apparent complexity observed in earlier models was a consequence of analyzing the system in the wrong coordinate space.

###########################################################
# Add Predictions and Residuals from Percentile-Only Model
# ---------------------------------------------------------
# We use the simpler model unless diagnostics
# suggest field size is still necessary.
###########################################################

finisher_norm <- finisher_norm %>%
  mutate(
    pred_norm  = predict(m_pct_only, newdata = finisher_norm),
    error_norm = points_norm - pred_norm
  )

###########################################################
# Plot 1: Actual vs Predicted (Normalized Model)
# ---------------------------------------------------------
# Expectation:
# - Points collapse tightly around the 45° line
# - Dramatic improvement over additive models
###########################################################

ggplot(finisher_norm, aes(x = pred_norm, y = points_norm)) +
  geom_point(size = 2.5, color = "steelblue", alpha = 0.6) +
  geom_abline(slope = 1, intercept = 0, color = "red", linewidth = 1) +
  labs(
    title = "Actual vs Predicted (Percentile-Normalized Model)",
    subtitle = "Model: points_norm ~ pct_pos",
    x = "Predicted Normalized Points",
    y = "Actual Normalized Points"
  ) +
  theme_minimal(base_size = 14)

🔬 Residual Scale — Notice the Axis

###########################################################
# Plot 2: Residuals vs Percentile Position
# ---------------------------------------------------------
# Expectation:
# - Errors become very small
# - Any remaining curvature indicates nonlinear decay
#   in the underlying scoring formula
###########################################################

ggplot(finisher_norm, aes(x = pct_pos, y = error_norm)) +
  geom_point(size = 2.5, color = "gray40", alpha = 0.6) +
  geom_hline(yintercept = 0, color = "blue", linewidth = 0.8) +
  geom_smooth(se = FALSE, color = "red") +
  labs(
    title = "Residuals vs Percentile Position",
    subtitle = "Percentile-Normalized Model",
    x = "Finishing Position Percentile",
    y = "Error (Actual - Predicted)"
  ) +
  theme_minimal(base_size = 14)

Before interpreting this plot, look carefully at the y‑axis.

The full vertical range spans roughly −0.0035 to +0.0015 in normalized points.

That is three thousandths of a point in normalized space.

To put that into perspective:

The fitted values themselves range from approximately 0 to 0.062.
The largest residuals are about 0.003 — less than 5% of the full normalized scale.
The vast majority are clustered within ±0.001.

This is not structural failure.

This is implementation noise.

The slight curvature in the red smoothing line is visible only because the axis is magnified to the third decimal place. At practical scale, the deviations are microscopic.

What once appeared to be complex curvature in raw scoring space has collapsed into residual variation measured in thousandths.

In other words: we are no longer modeling the system.

We are inspecting rounding error.

The geometry has been found.

🧮 Out-of-Sample Validation

The percentile-normalized specification collapses the scoring system into a remarkably simple structure: championship points scale linearly with Strength of Field and decay linearly with relative finishing position. In-sample diagnostics leave little residual structure unexplained.

However, strong in-sample fit is not sufficient evidence of having identified the true scoring mechanism. A structurally correct model should generalize to races that played no role in its estimation.

To evaluate this, we apply the fitted model to an entirely separate subsession (ID: 83219694), consisting of 31 drivers and a mid-range Strength of Field (SoF = 2171). This race was excluded from the training data.

The validation procedure follows four steps:

Extract driver-level finishing positions, championship points, field size, and SoF.
Compute finishing position percentiles.
Predict normalized points using the fitted model.
Convert back to raw championship points and compare predictions to official results.

No refitting or parameter adjustment is performed. The model is applied exactly as estimated.

If the percentile-based multiplicative structure is truly the underlying scoring rule, predictions should align closely with observed championship points — with any remaining deviations attributable only to integer rounding rather than structural misspecification.

Out of Sample Model Statistics

The following figures summarize the out-of-sample performance.

############################################################
# 1. Pull the new race and parse it
# ----------------------------------------------------------
# Subsession: 83219694
# Field size: 31 drivers
# This race was NOT used in model estimation.
############################################################

# Authenticate (assumes token exists)
token <- ir_get_token()

# Pull subsession JSON
new_json <- ir_get_subsession(83219694, token)

# Parse into tidy structure
new_parsed <- ir_parse_subsession(new_json)

############################################################
# 2. Construct driver-level dataset for prediction
############################################################

new_points <- new_parsed$drivers %>%
  transmute(
    finish_pos   = finish_position + 1,
    champ_points = champ_points,
    num_drivers  = new_parsed$meta$num_drivers,
    sof          = new_parsed$meta$event_strength_of_field,
    pct_pos      = (finish_pos - 1) / (num_drivers - 1)
  )

# What's the Strength of Field
# unique(new_points$sof)

############################################################
# 3. Predict Championship Points Using Final Model
# ----------------------------------------------------------
# Model: points_norm ~ pct_pos
#
# Steps:
#   1. Predict normalized points
#   2. Convert back to raw championship points
#   3. Apply rounding
#   4. Compute prediction error
############################################################

new_points <- new_points %>%
  mutate(
    # Predict normalized points
    pred_norm = predict(m_pct_only, newdata = .),
    
    # Convert back to raw fractional championship points
    pred_frac = sof * pred_norm,
    
    # Rounded prediction (integer championship points)
    pred_points = round(pred_frac),
    
    # Integer error
    error = champ_points - pred_points,
    
    # Continuous (pre-rounding) error
    error_raw = champ_points - pred_frac
  )

############################################################
# 4. Summarize Out-of-Sample Performance
############################################################

performance_summary <- new_points %>%
  summarise(
    n                = n(),
    mean_error       = mean(error),
    mean_abs_error   = mean(abs(error)),
    rmse             = sqrt(mean(error^2)),
    exact_match_rate = mean(error == 0),
    within_one_rate  = mean(abs(error) <= 1)
  )

performance_summary

## # A tibble: 1 × 6
##       n mean_error mean_abs_error  rmse exact_match_rate within_one_rate
##   <int>      <dbl>          <dbl> <dbl>            <dbl>           <dbl>
## 1    31      0.290          0.290 0.539            0.710               1

Mean Absolute Error: 0.29 points
Root Mean Squared Error: 0.54 points
Exact match rate: 71%
Within one point: 100%

On average, predictions miss by less than one-third of a championship point.

More importantly, every single prediction in the validation set falls within one point of the true value.

Given that championship points are awarded as whole integers, this effectively means the model recovers the scoring system almost perfectly.

The slight positive mean error (0.29) indicates a small, consistent downward bias — the model marginally underestimates realized points — but the magnitude of this bias is negligible relative to the overall scoring scale.

Taken together, these results confirm that the multiplicative percentile formulation generalizes cleanly beyond the estimation sample. The scoring mechanism uncovered in-sample appears to reflect the true underlying structure of the system.

Charts of Out-of-Sample Model Performance

############################################################
# Plot 1: Actual vs Predicted Championship Points
############################################################

ggplot(new_points, aes(x = pred_points, y = champ_points)) +
  geom_point(size = 3, color = "steelblue", alpha = 0.7) +
  geom_abline(slope = 1, intercept = 0, color = "red", linewidth = 1) +
  labs(
    title = "Out-of-Sample Validation (SoF = 2171)",
    subtitle = "Actual vs Predicted Championship Points",
    x = "Predicted Points",
    y = "Actual Points"
  ) +
  theme_minimal(base_size = 14)

############################################################
# Plot 2: Residuals vs Finishing Position
############################################################

ggplot(new_points, aes(x = finish_pos, y = error)) +
  geom_point(size = 3, color = "gray40", alpha = 0.7) +
  geom_hline(yintercept = 0, color = "blue", linewidth = 0.8) +
  labs(
    title = "Prediction Error by Finishing Position",
    subtitle = "Out-of-Sample Race (31 Drivers)",
    x = "Finishing Position",
    y = "Error (Actual - Predicted)"
  ) +
  theme_minimal(base_size = 14)

############################################################
# Plot 3: Error Distribution
############################################################

ggplot(new_points, aes(x = error)) +
  geom_bar(fill = "steelblue", alpha = 0.8) +
  labs(
    title = "Distribution of Prediction Errors",
    subtitle = "All Predictions Within ±1 Point",
    x = "Prediction Error",
    y = "Count"
  ) +
  theme_minimal(base_size = 14)

The out-of-sample residuals exhibit only two values: 0 and +1 championship point.

There is no visible curvature, heteroskedasticity, or positional pattern. Errors do not increase toward the front or back of the field. Instead, deviations occur exclusively at integer boundaries.

This indicates that the structural model is correctly specified. Remaining discrepancies arise from integer rounding in the official scoring system rather than misspecification of the underlying scoring geometry.

At this stage, the model is not approximating the system. It is reproducing it to within one championship point.

📐 Conclusion: The Geometry Was There All Along

What began as a seemingly messy scoring system — field sizes changing, Strength of Field fluctuating, nonlinear decay in raw points — ultimately collapsed into something almost embarrassingly simple.

At first, the data resisted us.

Additive models left curvature.
Interactions created partial fixes.
Residual plots hinted at structure we hadn’t quite captured.
High \(R^2\) values tempted premature celebration — but the geometry said otherwise.

The breakthrough came not from adding complexity, but from changing perspective.

By shifting into percentile space and normalizing by Strength of Field, the apparent irregularities dissolved. What looked nonlinear in raw coordinates became linear in the correct coordinate system. What appeared complex was simply multiplicative scaling combined with a linear decay in relative finishing position.

The reconstructed scoring rule can be written as:

\[ \text{Points}_i = \text{SoF} \times \left( \alpha + \beta \cdot \text{PctPos}_i \right) \]

where

\[ \text{PctPos}_i = \frac{\text{FinishPosition}_i - 1}{N - 1} \]

and \(N\) is the field size.

Finally, official championship points are obtained by applying integer rounding:

\[ \text{OfficialPoints}_i = \operatorname{round}\left( \text{SoF} \times \left( \alpha + \beta \cdot \text{PctPos}_i \right) \right) \]

In words:

Championship points scale linearly with Strength of Field.
Within a race, points decline linearly with finishing percentile.
Integer rounding introduces small boundary effects.

That’s it.

The final test — applying the model to an entirely unseen race — confirmed the structure. Predictions aligned almost perfectly with official results. Every driver’s championship points were reproduced to within one point. There was no hidden curvature, no late-stage surprise interaction, no exotic adjustment waiting at the edges.

The model did not merely approximate the scoring system.

It reconstructed it.

And perhaps more importantly, the path to that result mirrors the real-world workflow of data science:

Start with an intuitive specification.
Observe systematic residual structure.
Resist the urge to overfit.
Reconsider the coordinate system.
Simplify.
Validate out of sample.
Only then declare victory.

The lesson is not that this particular scoring formula is simple.

The lesson is that apparent complexity often reflects the coordinate system we chose — not the system itself.

In the end, the geometry was there all along.

We just had to look at it from the right angle.

And yes — those thousandths in the residual plot?

Rounding.

🧮 Appendix — How to Calculate iRacing Points Yourself

📐 Closed-Form Scoring Formula (Pencil & Paper Version)

Starting from the structural model:

\[ \text{OfficialPoints}_i = \text{round} \left( \text{SoF} \times (\alpha + \beta \cdot \text{PctPos}_i) \right) \]

and substituting the percentile definition

\[ \text{PctPos}_i = \frac{\text{FinishPos}_i - 1} {\text{FieldSize} - 1} \]

we can simplify the expression using the empirical result that
\(\alpha \approx -\beta\).

This yields a compact closed-form equation requiring only:

Strength of Field (SoF)
Field Size
Finishing Position

\[ \boxed{ \text{OfficialPoints}_i = \text{round} \left( \text{SoF} \times \alpha \times \frac{\text{FieldSize} - \text{FinishPos}_i} {\text{FieldSize} - 1} \right) } \]