About

TL;DR: you can’t just convert centiles at a 1:1 ratio. Assume a true correlation of .80, free from measurement error and range problems, so we are looking at latent scores here.

I did not bother with the scales, since these don’t matter for the point being made. I also did not bother with scale ranges, but these just make my point stronger since they cause problems near the ceiling.

library(tidyverse)

## ── Attaching packages ──────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──

## ✓ ggplot2 3.3.2     ✓ purrr   0.3.4
## ✓ tibble  3.0.3     ✓ dplyr   1.0.0
## ✓ tidyr   1.1.0     ✓ stringr 1.4.0
## ✓ readr   1.3.1     ✓ forcats 0.5.0

## ── Conflicts ─────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

library(magrittr)

## 
## Attaching package: 'magrittr'

## The following object is masked from 'package:purrr':
## 
##     set_names

## The following object is masked from 'package:tidyr':
## 
##     extract

library(rms)

## Loading required package: Hmisc

## Loading required package: lattice

## Loading required package: survival

## Loading required package: Formula

## 
## Attaching package: 'Hmisc'

## The following objects are masked from 'package:dplyr':
## 
##     src, summarize

## The following objects are masked from 'package:base':
## 
##     format.pval, units

## Loading required package: SparseM

## 
## Attaching package: 'SparseM'

## The following object is masked from 'package:base':
## 
##     backsolve

theme_set(theme_bw())

# Simulate the data: 10,000 draws from a bivariate normal with zero means,
# unit variances and a correlation of .80 between X and Y.
# The seed is fixed so the draws are the same on every run.
set.seed(1)
d = MASS::mvrnorm(n = 1e4, mu = c(0, 0), Sigma = rbind(c(1, .8), c(.8, 1)))
colnames(d) = c("X", "Y")

# Centiles are the empirical CDF of each variable evaluated at its own values,
# i.e. the proportion of simulated cases at or below each score.
d = as_tibble(d) %>%
  mutate(
    X_centile = ecdf(X)(X),
    Y_centile = ecdf(Y)(Y)
  )

#plot of the raw scores (z-score scale), with a smoothed fit through the points
#red line: slope = 1
ggplot(d, aes(X, Y)) + 
  geom_point() + 
  geom_smooth() + 
  geom_abline(intercept = 0, slope = 1, color = "red") +
  # title names the z scores: this panel shows X and Y themselves, not centiles
  ggtitle("Z scores of X and Y, which are r = .80\nRed line: slope = 1")

## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

# Same scatterplot on the centile scale, with both axes labelled as percentages.
# The red reference line (intercept 0, slope 1) marks a 1:1 centile conversion.
d %>%
  ggplot(mapping = aes(x = X_centile, y = Y_centile)) +
  geom_point() +
  geom_smooth() +
  geom_abline(slope = 1, intercept = 0, color = "red") +
  scale_x_continuous(labels = scales::percent) +
  scale_y_continuous(labels = scales::percent) +
  ggtitle("Centiles of X and Y, which are r = .80\nRed line: slope = 1")

## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

# Grid of X values (1 to 3 in steps of .1) to predict from, each paired with
# its centile according to the empirical CDF of the simulated X.
preds = tibble(X = seq(from = 1, to = 3, by = .1)) %>%
  mutate(X_centile = ecdf(d$X)(X))

# Models used to add predictions to `preds`.
# z-score model: regress Y on X on their original scale, then show the fit.
mod_Y = ols(formula = Y ~ X, data = d)
print(mod_Y)

## Linear Regression Model
##  
##  ols(formula = Y ~ X, data = d)
##  
##                    Model Likelihood    Discrimination    
##                          Ratio Test           Indexes    
##  Obs   10000    LR chi2    10563.05    R2       0.652    
##  sigma0.5966    d.f.              1    R2 adj   0.652    
##  d.f.   9998    Pr(> chi2)   0.0000    g        0.924    
##  
##  Residuals
##  
##        Min        1Q    Median        3Q       Max 
##  -2.636473 -0.401585 -0.003439  0.396455  2.352340 
##  
##  
##            Coef    S.E.   t      Pr(>|t|)
##  Intercept -0.0036 0.0060  -0.60 0.5488  
##  X          0.8099 0.0059 136.94 <0.0001 
##

# Centile model: regress the Y centile on the X centile, then show the fit.
# DO NOT USE IRL -- fitted only to show what goes wrong.
mod_Y_centile = ols(formula = Y_centile ~ X_centile, data = d)
print(mod_Y_centile)

## Linear Regression Model
##  
##  ols(formula = Y_centile ~ X_centile, data = d)
##  
##                   Model Likelihood    Discrimination    
##                         Ratio Test           Indexes    
##  Obs   10000    LR chi2    9867.72    R2       0.627    
##  sigma0.1763    d.f.             1    R2 adj   0.627    
##  d.f.   9998    Pr(> chi2)  0.0000    g        0.264    
##  
##  Residuals
##  
##         Min         1Q     Median         3Q        Max 
##  -0.6684154 -0.1181346 -0.0004279  0.1151591  0.6701526 
##  
##  
##            Coef   S.E.   t      Pr(>|t|)
##  Intercept 0.1040 0.0035  29.50 <0.0001 
##  X_centile 0.7920 0.0061 129.70 <0.0001 
##

# Predictions for the X grid:
#   pred_Y                  Y predicted by the z-score model
#   pred_Y_implied_centile  centile of that predicted Y in the simulated Y
#   pred_Y_centile          Y centile predicted directly by the centile model
preds[["pred_Y"]] = predict(mod_Y, newdata = preds)
preds[["pred_Y_implied_centile"]] = ecdf(d$Y)(preds[["pred_Y"]])
preds[["pred_Y_centile"]] = predict(mod_Y_centile, newdata = preds)

# Show every row of the comparison table
print(preds, n = Inf)

## # A tibble: 21 x 5
##        X X_centile pred_Y pred_Y_implied_centile pred_Y_centile
##    <dbl>     <dbl>  <dbl>                  <dbl>          <dbl>
##  1   1       0.838  0.806                  0.790          0.767
##  2   1.1     0.860  0.887                  0.812          0.786
##  3   1.2     0.882  0.968                  0.833          0.802
##  4   1.3     0.900  1.05                   0.850          0.817
##  5   1.4     0.915  1.13                   0.868          0.829
##  6   1.5     0.929  1.21                   0.885          0.840
##  7   1.6     0.942  1.29                   0.901          0.850
##  8   1.7     0.955  1.37                   0.916          0.860
##  9   1.8     0.963  1.45                   0.928          0.867
## 10   1.9     0.971  1.54                   0.937          0.873
## 11   2       0.978  1.62                   0.946          0.878
## 12   2.1     0.983  1.70                   0.956          0.882
## 13   2.2     0.988  1.78                   0.962          0.886
## 14   2.3     0.991  1.86                   0.967          0.889
## 15   2.4     0.994  1.94                   0.972          0.891
## 16   2.5     0.996  2.02                   0.976          0.893
## 17   2.6     0.996  2.10                   0.980          0.893
## 18   2.7     0.997  2.18                   0.983          0.894
## 19   2.8     0.998  2.26                   0.986          0.894
## 20   2.9     0.999  2.35                   0.990          0.895
## 21   3       0.999  2.43                   0.992          0.895

Conclusions:

Never use centiles themselves to predict. They are a nonlinear transformation of the underlying linear variable. Convert back to z scores before prediction.
The slopes are less than 1, so a SAT-like score will not predict an IQ score at the same centile or z score.

IQ score from SAT-like scores: simulation results

About