About

This is a notebook to accomplany this blogpost about using OLS with bounded outcome variables.

The data are from Table 3.6 in Lynn and Vanhanen’s 2012 book Intelligence: A unifying construct for the social sciences.

Init

options(digits = 2)
library(pacman)
p_load(kirkegaard, readxl, rms, betareg)
theme_set(theme_bw())

#logits to probability
#https://sebastiansauer.github.io/convert_logit2prob/
logit2prob <- function(logit){
  odds <- exp(logit)
  prob <- odds / (1 + odds)
  return(prob)
}

Data

LV = read_excel("data/LV12 data.xlsx") %>% 
  df_legalize_names()

OLS as in the book

Get a similar plot to the one in LV book.

#plot
GG_scatter(LV, "national_IQ", "tertiary_2009", case_names = "country")

## `geom_smooth()` using formula 'y ~ x'

#OLS fit
(ols_fit = ols(tertiary_2009 ~ national_IQ, data = LV))

## Frequencies of Missing Values Due to Each Variable
## tertiary_2009   national_IQ 
##             7             0 
## 
## Linear Regression Model
##  
##  ols(formula = tertiary_2009 ~ national_IQ, data = LV)
##  
##  
##                  Model Likelihood     Discrimination    
##                     Ratio Test           Indexes        
##  Obs      192    LR chi2    169.89    R2       0.587    
##  sigma16.3755    d.f.            1    R2 adj   0.585    
##  d.f.     190    Pr(> chi2) 0.0000    g       22.391    
##  
##  Residuals
##  
##      Min      1Q  Median      3Q     Max 
##  -45.683 -11.049  -1.525   9.150  49.859 
##  
##  
##              Coef      S.E.   t      Pr(>|t|)
##  Intercept   -119.9170 9.1677 -13.08 <0.0001 
##  national_IQ    1.7732 0.1079  16.44 <0.0001 
##

#impossible values in original
LV %>% filter(LV_fitted < 0 | LV_fitted > 100) %>% arrange(-national_IQ)

#replication values
ols_fit_values = tibble(
  national_IQ = LV$national_IQ,
  fitted = predict(ols_fit)
)

#print again
ols_fit_values %>% filter(fitted < 0 | fitted > 100) %>% arrange(-national_IQ)

Nonlinear fits

We try simplest stuff, and custom chosen.

#when predicting, we need these indexes
na_cases = (is.na(LV$national_IQ) | is.na(LV$tertiary_2009))

#scale to 0-1
LV$tertiary_2009_unit = LV$tertiary_2009 / 100
LV$tertiary_2009_unit_winsorise = LV$tertiary_2009_unit %>% winsorise(upper = .99, lower = .01)

#winsorize to 1 and 99
LV$tertiary_2009_winsorise = LV$tertiary_2009 %>% winsorise(upper = 99, lower = 1)

#log ols
(log_ols_fit = ols(log(tertiary_2009_winsorise) ~ national_IQ, data = LV))

## Frequencies of Missing Values Due to Each Variable
## log(tertiary_2009_winsorise)                  national_IQ 
##                            7                            0 
## 
## Linear Regression Model
##  
##  ols(formula = log(tertiary_2009_winsorise) ~ national_IQ, data = LV)
##  
##  
##                 Model Likelihood     Discrimination    
##                    Ratio Test           Indexes        
##  Obs     192    LR chi2    167.78    R2       0.583    
##  sigma0.7590    d.f.            1    R2 adj   0.580    
##  d.f.    190    Pr(> chi2) 0.0000    g        1.028    
##  
##  Residuals
##  
##       Min       1Q   Median       3Q      Max 
##  -3.46743 -0.45395  0.05274  0.46620  1.85971 
##  
##  
##              Coef    S.E.   t     Pr(>|t|)
##  Intercept   -3.9983 0.4249 -9.41 <0.0001 
##  national_IQ  0.0814 0.0050 16.29 <0.0001 
##

#fit spline
(spline_fit = ols(tertiary_2009 ~ rcs(national_IQ), data = LV))

## Frequencies of Missing Values Due to Each Variable
## tertiary_2009   national_IQ 
##             7             0 
## 
## Linear Regression Model
##  
##  ols(formula = tertiary_2009 ~ rcs(national_IQ), data = LV)
##  
##  
##                  Model Likelihood     Discrimination    
##                     Ratio Test           Indexes        
##  Obs      192    LR chi2    196.16    R2       0.640    
##  sigma15.4148    d.f.            4    R2 adj   0.632    
##  d.f.     187    Pr(> chi2) 0.0000    g       22.801    
##  
##  Residuals
##  
##      Min      1Q  Median      3Q     Max 
##  -48.683  -7.332  -1.980   7.854  57.705 
##  
##  
##                 Coef     S.E.    t     Pr(>|t|)
##  Intercept      -37.6976 40.6879 -0.93 0.3554  
##  national_IQ      0.6516  0.5800  1.12 0.2627  
##  national_IQ'     0.1330  2.6801  0.05 0.9605  
##  national_IQ''   13.6901 10.9076  1.26 0.2110  
##  national_IQ''' -44.4484 17.7928 -2.50 0.0133  
##

#loess fit
(loess_fit = loess(tertiary_2009 ~ national_IQ, data = LV))

## Call:
## loess(formula = tertiary_2009 ~ national_IQ, data = LV)
## 
## Number of Observations: 192 
## Equivalent Number of Parameters: 4.7 
## Residual Standard Error: 15

#beta fit
#this assumes scaled data, 0-1
#gives misleading error when some data are 0 or 1
(beta_fit = betareg(tertiary_2009_unit_winsorise ~ national_IQ, data = LV))

## 
## Call:
## betareg(formula = tertiary_2009_unit_winsorise ~ national_IQ, data = LV)
## 
## Coefficients (mean model with logit link):
## (Intercept)  national_IQ  
##     -8.4568       0.0886  
## 
## Phi coefficients (precision model with identity link):
## (phi)  
##  6.57

#quasibinomial
#produces nonsense results
(quasibinomial_fit = glm(tertiary_2009_unit ~ national_IQ, data = LV, family = quasibinomial(logit)))

## 
## Call:  glm(formula = tertiary_2009_unit ~ national_IQ, family = quasibinomial(logit), 
##     data = LV)
## 
## Coefficients:
## (Intercept)  national_IQ  
##      -9.761        0.103  
## 
## Degrees of Freedom: 191 Total (i.e. Null);  190 Residual
##   (7 observations deleted due to missingness)
## Null Deviance:       63 
## Residual Deviance: 25    AIC: NA

#Weibull
#gave up
#these nls functions are straight out retarded
#the GUI software I used in high school 10 years ago are better
#put in data, click fit to [any variety of function types], get results
#in R, have to waste time guessing at start params
#and since these are not intuitive, it's a PINA
#???
#why has no one fixed this
#weibull_fit <- nls(tertiary_2009 ~ SSweibull(LV$national_IQ, Asym = 100, Drop = 100,lrc =  -5,pwr = 2), data = LV)

#SSasymp
#  singular gradient
#gave up
#these functions suck ass
#fit <- nls(tertiary_2009 ~ SSasymp(national_IQ, yf, y0, log_alpha), data = LV)

#fit comparison
IQ_range = seq(min(LV$national_IQ, na.rm = T), max(LV$national_IQ, na.rm = T))
new_fits = tibble(
  national_IQ = IQ_range
)

#insert predictions
new_fits$ols = predict(ols_fit, newdata = new_fits)
new_fits$log_ols = predict(log_ols_fit, newdata = new_fits) %>% exp() #convert back using exp()
new_fits$spline = predict(spline_fit, newdata = new_fits)
new_fits$loess = predict(loess_fit, newdata = new_fits)
new_fits$beta = predict(beta_fit, newdata = new_fits) * 100 #get back to 0-100 format
new_fits$quasibinomial = predict(quasibinomial_fit, newdata = new_fits) %>% logit2prob() %>% multiply_by(100) #convert logits to 0-1, then to 0-100

#plot combined
new_fits %>% 
  gather(key = fit, value = value, ols:quasibinomial) %>% 
  ggplot() +
  geom_line(aes(national_IQ, value, color = fit)) +
  geom_point(data = LV, aes(national_IQ, tertiary_2009)) +
  scale_y_continuous("Tertiary enrollment in 2009, %", breaks = seq(0, 100, by = 10)) +
  scale_x_continuous("National IQ") +
  ggtitle("Model fit comparison")

## Warning: Removed 7 rows containing missing values (geom_point).

GG_save("figs/model_fits.png")

## Warning: Removed 7 rows containing missing values (geom_point).

#values
new_fits

Meta

write_sessioninfo()