About

Sometimes people do statistics with centiles instead of IQ scores. This is not a good idea because centiles are a nonlinear transformation of IQ scores. They are NOT interval-scale scores, and this results in biases when doing operations that assume interval scaling, such as… pretty much anything. Here we look at taking the mean score of some selected group, e.g. PhD students, after recruitment based on some test. Using the centiles results in a slight downward bias. This is probably too small to care about in most applications.

library(kirkegaard)
## Loading required package: tidyverse
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.3     ✓ purrr   0.3.4
## ✓ tibble  3.1.2     ✓ dplyr   1.0.6
## ✓ tidyr   1.1.3     ✓ stringr 1.4.0
## ✓ readr   1.4.0     ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
## Loading required package: weights
## Loading required package: Hmisc
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
## 
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:dplyr':
## 
##     src, summarize
## The following objects are masked from 'package:base':
## 
##     format.pval, units
## Loading required package: gdata
## gdata: read.xls support for 'XLS' (Excel 97-2004) files ENABLED.
## 
## gdata: read.xls support for 'XLSX' (Excel 2007+) files ENABLED.
## 
## Attaching package: 'gdata'
## The following objects are masked from 'package:dplyr':
## 
##     combine, first, last
## The following object is masked from 'package:purrr':
## 
##     keep
## The following object is masked from 'package:stats':
## 
##     nobs
## The following object is masked from 'package:utils':
## 
##     object.size
## The following object is masked from 'package:base':
## 
##     startsWith
## Loading required package: mice
## 
## Attaching package: 'mice'
## The following object is masked from 'package:stats':
## 
##     filter
## The following objects are masked from 'package:base':
## 
##     cbind, rbind
## Loading required package: assertthat
## 
## Attaching package: 'assertthat'
## The following object is masked from 'package:tibble':
## 
##     has_name
## Loading required package: magrittr
## 
## Attaching package: 'magrittr'
## The following object is masked from 'package:purrr':
## 
##     set_names
## The following object is masked from 'package:tidyr':
## 
##     extract
## Loading required package: psych
## 
## Attaching package: 'psych'
## The following object is masked from 'package:Hmisc':
## 
##     describe
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
## Loading required package: metafor
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
## Loading 'metafor' package (version 2.4-0). For an overview 
## and introduction to the package please type: help(metafor).
## 
## Attaching package: 'kirkegaard'
## The following object is masked from 'package:psych':
## 
##     rescale
## The following object is masked from 'package:assertthat':
## 
##     are_equal
## The following objects are masked from 'package:purrr':
## 
##     is_logical, is_numeric
## The following object is masked from 'package:base':
## 
##     +
library(patchwork)

# Simulate the population ----
# Fix the RNG seed so the simulated sample is reproducible.
set.seed(1)

# Draw 10,000 IQ scores from a normal distribution (mean 100, SD 15), then
# attach each score's centile under that same distribution. pnorm() returns
# the centile as a proportion between 0 and 1.
d <- tibble(IQ = rnorm(n = 10000, mean = 100, sd = 15)) %>%
  mutate(IQ_centile = pnorm(IQ, mean = 100, sd = 15))

# Means among those at or above each IQ threshold ----
# For every cutoff from 70 to 130 (in steps of 1), keep the rows of `d` whose
# IQ is at or above the cutoff, summarise them, and convert each mean onto the
# other scale so the direct and derived values can be compared.
means <- map_df(seq(from = 70, to = 130, by = 1), function(threshold) {
  selected <- filter(d, IQ >= threshold)
  selected_stats <- describe2(selected)

  # The means are read from describe2()'s output by position; presumably it
  # returns one row per column of `d`, in column order (IQ, then IQ_centile)
  # -- TODO confirm against the kirkegaard::describe2() documentation.
  tibble(
    IQ_threshold = threshold,
    IQ_mean = selected_stats$mean[1],
    IQ_centile_mean = selected_stats$mean[2]
  ) %>%
    mutate(
      # Centile corresponding to the mean IQ.
      derived_centile = pnorm(IQ_mean, mean = 100, sd = 15),
      # IQ corresponding to the mean centile.
      derived_mean = qnorm(IQ_centile_mean, mean = 100, sd = 15)
    )
})

# Plot direct vs. derived values against the selection threshold ----

# IQ scale: the mean IQ next to the IQ derived from the mean centile.
p1 <- means %>%
  pivot_longer(
    cols = c(IQ_mean, derived_mean),
    names_to = "name",
    values_to = "value"
  ) %>%
  ggplot(mapping = aes(x = IQ_threshold, y = value, color = name)) +
  geom_line() +
  labs(y = "Mean after IQ threshold selection") +
  theme_bw()

# Centile scale: the mean centile next to the centile derived from the mean IQ.
p2 <- means %>%
  pivot_longer(
    cols = c(IQ_centile_mean, derived_centile),
    names_to = "name",
    values_to = "value"
  ) %>%
  ggplot(mapping = aes(x = IQ_threshold, y = value, color = name)) +
  geom_line() +
  labs(y = "Mean after IQ threshold selection") +
  theme_bw()

# Stack the two panels vertically using patchwork's `/` operator.
p1 / p2