About

This is a notebook to accomplany this blogpost about using OLS with bounded outcome variables.

The data are from Table 3.6 in Lynn and Vanhanen’s 2012 book Intelligence: A unifying construct for the social sciences.

Init

options(digits = 2)
library(pacman)
p_load(kirkegaard, readxl, rms, betareg)
theme_set(theme_bw())

#logits to probability
#https://sebastiansauer.github.io/convert_logit2prob/
logit2prob <- function(logit){
  odds <- exp(logit)
  prob <- odds / (1 + odds)
  return(prob)
}

Data

LV = read_excel("data/LV12 data.xlsx") %>% 
  df_legalize_names()

OLS as in the book

Get a similar plot to the one in LV book.

#plot
GG_scatter(LV, "national_IQ", "tertiary_2009", case_names = "country")
## `geom_smooth()` using formula 'y ~ x'

#OLS fit
(ols_fit = ols(tertiary_2009 ~ national_IQ, data = LV))
## Frequencies of Missing Values Due to Each Variable
## tertiary_2009   national_IQ 
##             7             0 
## 
## Linear Regression Model
##  
##  ols(formula = tertiary_2009 ~ national_IQ, data = LV)
##  
##  
##                  Model Likelihood     Discrimination    
##                     Ratio Test           Indexes        
##  Obs      192    LR chi2    169.89    R2       0.587    
##  sigma16.3755    d.f.            1    R2 adj   0.585    
##  d.f.     190    Pr(> chi2) 0.0000    g       22.391    
##  
##  Residuals
##  
##      Min      1Q  Median      3Q     Max 
##  -45.683 -11.049  -1.525   9.150  49.859 
##  
##  
##              Coef      S.E.   t      Pr(>|t|)
##  Intercept   -119.9170 9.1677 -13.08 <0.0001 
##  national_IQ    1.7732 0.1079  16.44 <0.0001 
## 
#impossible values in original
LV %>% filter(LV_fitted < 0 | LV_fitted > 100) %>% arrange(-national_IQ)
#replication values
ols_fit_values = tibble(
  national_IQ = LV$national_IQ,
  fitted = predict(ols_fit)
)

#print again
ols_fit_values %>% filter(fitted < 0 | fitted > 100) %>% arrange(-national_IQ)

Nonlinear fits

We try simplest stuff, and custom chosen.

#when predicting, we need these indexes
na_cases = (is.na(LV$national_IQ) | is.na(LV$tertiary_2009))

#scale to 0-1
LV$tertiary_2009_unit = LV$tertiary_2009 / 100
LV$tertiary_2009_unit_winsorise = LV$tertiary_2009_unit %>% winsorise(upper = .99, lower = .01)

#winsorize to 1 and 99
LV$tertiary_2009_winsorise = LV$tertiary_2009 %>% winsorise(upper = 99, lower = 1)

#log ols
(log_ols_fit = ols(log(tertiary_2009_winsorise) ~ national_IQ, data = LV))
## Frequencies of Missing Values Due to Each Variable
## log(tertiary_2009_winsorise)                  national_IQ 
##                            7                            0 
## 
## Linear Regression Model
##  
##  ols(formula = log(tertiary_2009_winsorise) ~ national_IQ, data = LV)
##  
##  
##                 Model Likelihood     Discrimination    
##                    Ratio Test           Indexes        
##  Obs     192    LR chi2    167.78    R2       0.583    
##  sigma0.7590    d.f.            1    R2 adj   0.580    
##  d.f.    190    Pr(> chi2) 0.0000    g        1.028    
##  
##  Residuals
##  
##       Min       1Q   Median       3Q      Max 
##  -3.46743 -0.45395  0.05274  0.46620  1.85971 
##  
##  
##              Coef    S.E.   t     Pr(>|t|)
##  Intercept   -3.9983 0.4249 -9.41 <0.0001 
##  national_IQ  0.0814 0.0050 16.29 <0.0001 
## 
#fit spline
(spline_fit = ols(tertiary_2009 ~ rcs(national_IQ), data = LV))
## Frequencies of Missing Values Due to Each Variable
## tertiary_2009   national_IQ 
##             7             0 
## 
## Linear Regression Model
##  
##  ols(formula = tertiary_2009 ~ rcs(national_IQ), data = LV)
##  
##  
##                  Model Likelihood     Discrimination    
##                     Ratio Test           Indexes        
##  Obs      192    LR chi2    196.16    R2       0.640    
##  sigma15.4148    d.f.            4    R2 adj   0.632    
##  d.f.     187    Pr(> chi2) 0.0000    g       22.801    
##  
##  Residuals
##  
##      Min      1Q  Median      3Q     Max 
##  -48.683  -7.332  -1.980   7.854  57.705 
##  
##  
##                 Coef     S.E.    t     Pr(>|t|)
##  Intercept      -37.6976 40.6879 -0.93 0.3554  
##  national_IQ      0.6516  0.5800  1.12 0.2627  
##  national_IQ'     0.1330  2.6801  0.05 0.9605  
##  national_IQ''   13.6901 10.9076  1.26 0.2110  
##  national_IQ''' -44.4484 17.7928 -2.50 0.0133  
## 
#loess fit
(loess_fit = loess(tertiary_2009 ~ national_IQ, data = LV))
## Call:
## loess(formula = tertiary_2009 ~ national_IQ, data = LV)
## 
## Number of Observations: 192 
## Equivalent Number of Parameters: 4.7 
## Residual Standard Error: 15
#beta fit
#this assumes scaled data, 0-1
#gives misleading error when some data are 0 or 1
(beta_fit = betareg(tertiary_2009_unit_winsorise ~ national_IQ, data = LV))
## 
## Call:
## betareg(formula = tertiary_2009_unit_winsorise ~ national_IQ, data = LV)
## 
## Coefficients (mean model with logit link):
## (Intercept)  national_IQ  
##     -8.4568       0.0886  
## 
## Phi coefficients (precision model with identity link):
## (phi)  
##  6.57
#quasibinomial
#produces nonsense results
(quasibinomial_fit = glm(tertiary_2009_unit ~ national_IQ, data = LV, family = quasibinomial(logit)))
## 
## Call:  glm(formula = tertiary_2009_unit ~ national_IQ, family = quasibinomial(logit), 
##     data = LV)
## 
## Coefficients:
## (Intercept)  national_IQ  
##      -9.761        0.103  
## 
## Degrees of Freedom: 191 Total (i.e. Null);  190 Residual
##   (7 observations deleted due to missingness)
## Null Deviance:       63 
## Residual Deviance: 25    AIC: NA
#Weibull
#gave up
#these nls functions are straight out retarded
#the GUI software I used in high school 10 years ago are better
#put in data, click fit to [any variety of function types], get results
#in R, have to waste time guessing at start params
#and since these are not intuitive, it's a PINA
#???
#why has no one fixed this
#weibull_fit <- nls(tertiary_2009 ~ SSweibull(LV$national_IQ, Asym = 100, Drop = 100,lrc =  -5,pwr = 2), data = LV)

#SSasymp
#  singular gradient
#gave up
#these functions suck ass
#fit <- nls(tertiary_2009 ~ SSasymp(national_IQ, yf, y0, log_alpha), data = LV)

#fit comparison
IQ_range = seq(min(LV$national_IQ, na.rm = T), max(LV$national_IQ, na.rm = T))
new_fits = tibble(
  national_IQ = IQ_range
)

#insert predictions
new_fits$ols = predict(ols_fit, newdata = new_fits)
new_fits$log_ols = predict(log_ols_fit, newdata = new_fits) %>% exp() #convert back using exp()
new_fits$spline = predict(spline_fit, newdata = new_fits)
new_fits$loess = predict(loess_fit, newdata = new_fits)
new_fits$beta = predict(beta_fit, newdata = new_fits) * 100 #get back to 0-100 format
new_fits$quasibinomial = predict(quasibinomial_fit, newdata = new_fits) %>% logit2prob() %>% multiply_by(100) #convert logits to 0-1, then to 0-100

#plot combined
new_fits %>% 
  gather(key = fit, value = value, ols:quasibinomial) %>% 
  ggplot() +
  geom_line(aes(national_IQ, value, color = fit)) +
  geom_point(data = LV, aes(national_IQ, tertiary_2009)) +
  scale_y_continuous("Tertiary enrollment in 2009, %", breaks = seq(0, 100, by = 10)) +
  scale_x_continuous("National IQ") +
  ggtitle("Model fit comparison")
## Warning: Removed 7 rows containing missing values (geom_point).

GG_save("figs/model_fits.png")
## Warning: Removed 7 rows containing missing values (geom_point).
#values
new_fits

Meta

write_sessioninfo()
## R version 3.6.3 (2020-02-29)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Linux Mint 19.3
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.7.1
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.7.1
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=de_DE.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=de_DE.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=de_DE.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] betareg_3.1-3      rms_5.1-4          SparseM_1.78       readxl_1.3.1      
##  [5] kirkegaard_2018.05 metafor_2.4-0      Matrix_1.2-18      psych_1.9.12.31   
##  [9] magrittr_1.5       assertthat_0.2.1   weights_1.0.1      mice_3.8.0        
## [13] gdata_2.18.0       Hmisc_4.4-0        Formula_1.2-3      survival_3.1-12   
## [17] lattice_0.20-41    forcats_0.5.0      stringr_1.4.0      dplyr_0.8.5       
## [21] purrr_0.3.4        readr_1.3.1        tidyr_1.0.3        tibble_3.0.1      
## [25] ggplot2_3.3.0      tidyverse_1.3.0    pacman_0.5.1      
## 
## loaded via a namespace (and not attached):
##  [1] TH.data_1.0-10      colorspace_1.4-1    ellipsis_0.3.0     
##  [4] modeltools_0.2-23   htmlTable_1.13.3    base64enc_0.1-3    
##  [7] fs_1.4.1            rstudioapi_0.11     farver_2.0.3       
## [10] MatrixModels_0.4-1  flexmix_2.3-15      fansi_0.4.1        
## [13] mvtnorm_1.1-0       lubridate_1.7.8     xml2_1.3.2         
## [16] codetools_0.2-16    splines_3.6.3       mnormt_1.5-7       
## [19] knitr_1.28          jsonlite_1.6.1      broom_0.5.6        
## [22] cluster_2.1.0       dbplyr_1.4.3        png_0.1-7          
## [25] compiler_3.6.3      httr_1.4.1          backports_1.1.6    
## [28] cli_2.0.2           acepack_1.4.1       htmltools_0.4.0    
## [31] quantreg_5.55       tools_3.6.3         gtable_0.3.0       
## [34] glue_1.4.0          Rcpp_1.0.4.6        cellranger_1.1.0   
## [37] vctrs_0.2.4         nlme_3.1-147        multilevel_2.6     
## [40] lmtest_0.9-37       xfun_0.13           rvest_0.3.5        
## [43] lifecycle_0.2.0     gtools_3.8.2        polspline_1.1.17   
## [46] MASS_7.3-51.6       zoo_1.8-8           scales_1.1.0       
## [49] hms_0.5.3           parallel_3.6.3      sandwich_2.5-1     
## [52] RColorBrewer_1.1-2  psychometric_2.2    yaml_2.2.1         
## [55] gridExtra_2.3       rpart_4.1-15        latticeExtra_0.6-29
## [58] stringi_1.4.6       checkmate_2.0.0     rlang_0.4.6        
## [61] pkgconfig_2.0.3     evaluate_0.14       htmlwidgets_1.5.1  
## [64] labeling_0.3        tidyselect_1.0.0    R6_2.4.1           
## [67] generics_0.0.2      multcomp_1.4-13     DBI_1.1.0          
## [70] pillar_1.4.4        haven_2.2.0         foreign_0.8-76     
## [73] withr_2.2.0         mgcv_1.8-31         nnet_7.3-14        
## [76] modelr_0.1.7        crayon_1.3.4        rmarkdown_2.1      
## [79] jpeg_0.1-8.1        grid_3.6.3          data.table_1.12.8  
## [82] reprex_0.3.0        digest_0.6.25       stats4_3.6.3       
## [85] munsell_0.5.0