This is a notebook to accomplany this blogpost about using OLS with bounded outcome variables.
The data are from Table 3.6 in Lynn and Vanhanen’s 2012 book Intelligence: A unifying construct for the social sciences.
options(digits = 2)
library(pacman)
p_load(kirkegaard, readxl, rms, betareg)
theme_set(theme_bw())
#logits to probability
#https://sebastiansauer.github.io/convert_logit2prob/
logit2prob <- function(logit){
odds <- exp(logit)
prob <- odds / (1 + odds)
return(prob)
}
LV = read_excel("data/LV12 data.xlsx") %>%
df_legalize_names()
Get a similar plot to the one in LV book.
#plot
GG_scatter(LV, "national_IQ", "tertiary_2009", case_names = "country")
## `geom_smooth()` using formula 'y ~ x'
#OLS fit
(ols_fit = ols(tertiary_2009 ~ national_IQ, data = LV))
## Frequencies of Missing Values Due to Each Variable
## tertiary_2009 national_IQ
## 7 0
##
## Linear Regression Model
##
## ols(formula = tertiary_2009 ~ national_IQ, data = LV)
##
##
## Model Likelihood Discrimination
## Ratio Test Indexes
## Obs 192 LR chi2 169.89 R2 0.587
## sigma16.3755 d.f. 1 R2 adj 0.585
## d.f. 190 Pr(> chi2) 0.0000 g 22.391
##
## Residuals
##
## Min 1Q Median 3Q Max
## -45.683 -11.049 -1.525 9.150 49.859
##
##
## Coef S.E. t Pr(>|t|)
## Intercept -119.9170 9.1677 -13.08 <0.0001
## national_IQ 1.7732 0.1079 16.44 <0.0001
##
#impossible values in original
LV %>% filter(LV_fitted < 0 | LV_fitted > 100) %>% arrange(-national_IQ)
#replication values
ols_fit_values = tibble(
national_IQ = LV$national_IQ,
fitted = predict(ols_fit)
)
#print again
ols_fit_values %>% filter(fitted < 0 | fitted > 100) %>% arrange(-national_IQ)
We try simplest stuff, and custom chosen.
#when predicting, we need these indexes
na_cases = (is.na(LV$national_IQ) | is.na(LV$tertiary_2009))
#scale to 0-1
LV$tertiary_2009_unit = LV$tertiary_2009 / 100
LV$tertiary_2009_unit_winsorise = LV$tertiary_2009_unit %>% winsorise(upper = .99, lower = .01)
#winsorize to 1 and 99
LV$tertiary_2009_winsorise = LV$tertiary_2009 %>% winsorise(upper = 99, lower = 1)
#log ols
(log_ols_fit = ols(log(tertiary_2009_winsorise) ~ national_IQ, data = LV))
## Frequencies of Missing Values Due to Each Variable
## log(tertiary_2009_winsorise) national_IQ
## 7 0
##
## Linear Regression Model
##
## ols(formula = log(tertiary_2009_winsorise) ~ national_IQ, data = LV)
##
##
## Model Likelihood Discrimination
## Ratio Test Indexes
## Obs 192 LR chi2 167.78 R2 0.583
## sigma0.7590 d.f. 1 R2 adj 0.580
## d.f. 190 Pr(> chi2) 0.0000 g 1.028
##
## Residuals
##
## Min 1Q Median 3Q Max
## -3.46743 -0.45395 0.05274 0.46620 1.85971
##
##
## Coef S.E. t Pr(>|t|)
## Intercept -3.9983 0.4249 -9.41 <0.0001
## national_IQ 0.0814 0.0050 16.29 <0.0001
##
#fit spline
(spline_fit = ols(tertiary_2009 ~ rcs(national_IQ), data = LV))
## Frequencies of Missing Values Due to Each Variable
## tertiary_2009 national_IQ
## 7 0
##
## Linear Regression Model
##
## ols(formula = tertiary_2009 ~ rcs(national_IQ), data = LV)
##
##
## Model Likelihood Discrimination
## Ratio Test Indexes
## Obs 192 LR chi2 196.16 R2 0.640
## sigma15.4148 d.f. 4 R2 adj 0.632
## d.f. 187 Pr(> chi2) 0.0000 g 22.801
##
## Residuals
##
## Min 1Q Median 3Q Max
## -48.683 -7.332 -1.980 7.854 57.705
##
##
## Coef S.E. t Pr(>|t|)
## Intercept -37.6976 40.6879 -0.93 0.3554
## national_IQ 0.6516 0.5800 1.12 0.2627
## national_IQ' 0.1330 2.6801 0.05 0.9605
## national_IQ'' 13.6901 10.9076 1.26 0.2110
## national_IQ''' -44.4484 17.7928 -2.50 0.0133
##
#loess fit
(loess_fit = loess(tertiary_2009 ~ national_IQ, data = LV))
## Call:
## loess(formula = tertiary_2009 ~ national_IQ, data = LV)
##
## Number of Observations: 192
## Equivalent Number of Parameters: 4.7
## Residual Standard Error: 15
#beta fit
#this assumes scaled data, 0-1
#gives misleading error when some data are 0 or 1
(beta_fit = betareg(tertiary_2009_unit_winsorise ~ national_IQ, data = LV))
##
## Call:
## betareg(formula = tertiary_2009_unit_winsorise ~ national_IQ, data = LV)
##
## Coefficients (mean model with logit link):
## (Intercept) national_IQ
## -8.4568 0.0886
##
## Phi coefficients (precision model with identity link):
## (phi)
## 6.57
#quasibinomial
#produces nonsense results
(quasibinomial_fit = glm(tertiary_2009_unit ~ national_IQ, data = LV, family = quasibinomial(logit)))
##
## Call: glm(formula = tertiary_2009_unit ~ national_IQ, family = quasibinomial(logit),
## data = LV)
##
## Coefficients:
## (Intercept) national_IQ
## -9.761 0.103
##
## Degrees of Freedom: 191 Total (i.e. Null); 190 Residual
## (7 observations deleted due to missingness)
## Null Deviance: 63
## Residual Deviance: 25 AIC: NA
#Weibull
#gave up
#these nls functions are straight out retarded
#the GUI software I used in high school 10 years ago are better
#put in data, click fit to [any variety of function types], get results
#in R, have to waste time guessing at start params
#and since these are not intuitive, it's a PINA
#???
#why has no one fixed this
#weibull_fit <- nls(tertiary_2009 ~ SSweibull(LV$national_IQ, Asym = 100, Drop = 100,lrc = -5,pwr = 2), data = LV)
#SSasymp
# singular gradient
#gave up
#these functions suck ass
#fit <- nls(tertiary_2009 ~ SSasymp(national_IQ, yf, y0, log_alpha), data = LV)
#fit comparison
IQ_range = seq(min(LV$national_IQ, na.rm = T), max(LV$national_IQ, na.rm = T))
new_fits = tibble(
national_IQ = IQ_range
)
#insert predictions
new_fits$ols = predict(ols_fit, newdata = new_fits)
new_fits$log_ols = predict(log_ols_fit, newdata = new_fits) %>% exp() #convert back using exp()
new_fits$spline = predict(spline_fit, newdata = new_fits)
new_fits$loess = predict(loess_fit, newdata = new_fits)
new_fits$beta = predict(beta_fit, newdata = new_fits) * 100 #get back to 0-100 format
new_fits$quasibinomial = predict(quasibinomial_fit, newdata = new_fits) %>% logit2prob() %>% multiply_by(100) #convert logits to 0-1, then to 0-100
#plot combined
new_fits %>%
gather(key = fit, value = value, ols:quasibinomial) %>%
ggplot() +
geom_line(aes(national_IQ, value, color = fit)) +
geom_point(data = LV, aes(national_IQ, tertiary_2009)) +
scale_y_continuous("Tertiary enrollment in 2009, %", breaks = seq(0, 100, by = 10)) +
scale_x_continuous("National IQ") +
ggtitle("Model fit comparison")
## Warning: Removed 7 rows containing missing values (geom_point).
GG_save("figs/model_fits.png")
## Warning: Removed 7 rows containing missing values (geom_point).
#values
new_fits
write_sessioninfo()
## R version 3.6.3 (2020-02-29)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Linux Mint 19.3
##
## Matrix products: default
## BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.7.1
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.7.1
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=de_DE.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=de_DE.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=de_DE.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] betareg_3.1-3 rms_5.1-4 SparseM_1.78 readxl_1.3.1
## [5] kirkegaard_2018.05 metafor_2.4-0 Matrix_1.2-18 psych_1.9.12.31
## [9] magrittr_1.5 assertthat_0.2.1 weights_1.0.1 mice_3.8.0
## [13] gdata_2.18.0 Hmisc_4.4-0 Formula_1.2-3 survival_3.1-12
## [17] lattice_0.20-41 forcats_0.5.0 stringr_1.4.0 dplyr_0.8.5
## [21] purrr_0.3.4 readr_1.3.1 tidyr_1.0.3 tibble_3.0.1
## [25] ggplot2_3.3.0 tidyverse_1.3.0 pacman_0.5.1
##
## loaded via a namespace (and not attached):
## [1] TH.data_1.0-10 colorspace_1.4-1 ellipsis_0.3.0
## [4] modeltools_0.2-23 htmlTable_1.13.3 base64enc_0.1-3
## [7] fs_1.4.1 rstudioapi_0.11 farver_2.0.3
## [10] MatrixModels_0.4-1 flexmix_2.3-15 fansi_0.4.1
## [13] mvtnorm_1.1-0 lubridate_1.7.8 xml2_1.3.2
## [16] codetools_0.2-16 splines_3.6.3 mnormt_1.5-7
## [19] knitr_1.28 jsonlite_1.6.1 broom_0.5.6
## [22] cluster_2.1.0 dbplyr_1.4.3 png_0.1-7
## [25] compiler_3.6.3 httr_1.4.1 backports_1.1.6
## [28] cli_2.0.2 acepack_1.4.1 htmltools_0.4.0
## [31] quantreg_5.55 tools_3.6.3 gtable_0.3.0
## [34] glue_1.4.0 Rcpp_1.0.4.6 cellranger_1.1.0
## [37] vctrs_0.2.4 nlme_3.1-147 multilevel_2.6
## [40] lmtest_0.9-37 xfun_0.13 rvest_0.3.5
## [43] lifecycle_0.2.0 gtools_3.8.2 polspline_1.1.17
## [46] MASS_7.3-51.6 zoo_1.8-8 scales_1.1.0
## [49] hms_0.5.3 parallel_3.6.3 sandwich_2.5-1
## [52] RColorBrewer_1.1-2 psychometric_2.2 yaml_2.2.1
## [55] gridExtra_2.3 rpart_4.1-15 latticeExtra_0.6-29
## [58] stringi_1.4.6 checkmate_2.0.0 rlang_0.4.6
## [61] pkgconfig_2.0.3 evaluate_0.14 htmlwidgets_1.5.1
## [64] labeling_0.3 tidyselect_1.0.0 R6_2.4.1
## [67] generics_0.0.2 multcomp_1.4-13 DBI_1.1.0
## [70] pillar_1.4.4 haven_2.2.0 foreign_0.8-76
## [73] withr_2.2.0 mgcv_1.8-31 nnet_7.3-14
## [76] modelr_0.1.7 crayon_1.3.4 rmarkdown_2.1
## [79] jpeg_0.1-8.1 grid_3.6.3 data.table_1.12.8
## [82] reprex_0.3.0 digest_0.6.25 stats4_3.6.3
## [85] munsell_0.5.0