Data is from:
who publicly shared their survey dataset, n = 1000, from 2012. The dataset has about 50 variables.
options(contrasts=c("contr.treatment", "contr.treatment"))
library(pacman)
p_load(kirkegaard, rms, caret)
d = read_csv("data/AYTM-Results.csv") %>% df_legalize_names()
## Parsed with column specification:
## cols(
## .default = col_character(),
## ID = col_integer(),
## age = col_double(),
## `Years Higher Ed` = col_integer(),
## Income = col_integer(),
## `expected longevity` = col_double(),
## health = col_integer(),
## `Science interest` = col_integer(),
## Religiosity = col_integer(),
## `Life Satisfaction` = col_integer(),
## `Self Esteem` = col_integer(),
## Optimism = col_integer(),
## diseases = col_integer()
## )
## See spec(...) for full column specifications.
#which variables exist?
labels(d)[[2]]
## [1] "ID"
## [2] "DATE"
## [3] "Preamble"
## [4] "Age_Range"
## [5] "age"
## [6] "Career"
## [7] "Children"
## [8] "Education"
## [9] "Years_Higher_Ed"
## [10] "Employment_Status"
## [11] "Ethnicity_Race"
## [12] "Gender"
## [13] "Income_Range"
## [14] "Income"
## [15] "Relationship_Status"
## [16] "State"
## [17] "County"
## [18] "City"
## [19] "Zip"
## [20] "Q2_If_you_could_be_physically_mentally_the_same_as_your_20s_how_long_would_you_like_to_live"
## [21] "Q3_If_you_could_be_physically_the_same_but_NOT_mentally_the_same_as_your_20s_how_long_would_you_like_to_live"
## [22] "Q4_If_you_could_be_mentally_the_same_but_NOT_physically_the_same_as_your_20s_how_long_would_you_like_to_live"
## [23] "Q5_How_long_do_you_expect_to_live"
## [24] "expected_longevity"
## [25] "Q6_How_healthy_do_you_feel_physically_and_mentally"
## [26] "health"
## [27] "Q7_How_interested_are_you_in_topics_related_to_science_and_technology"
## [28] "Science_interest"
## [29] "Q8_How_important_is_religion_in_your_life"
## [30] "Religiosity"
## [31] "Q9_All_things_considered_how_satisfied_are_you_with_your_life_as_a_whole"
## [32] "Life_Satisfaction"
## [33] "Q10_I_have_high_self_esteem"
## [34] "Self_Esteem"
## [35] "Q11_I_m_always_optimistic_about_my_future"
## [36] "Optimism"
## [37] "diseases"
## [38] "Q12_A1_Have_you_ever_been_diagnosed_with_any_of_the_following_health_issues"
## [39] "Q12_A2_Have_you_ever_been_diagnosed_with_any_of_the_following_health_issues"
## [40] "Q12_A3_Have_you_ever_been_diagnosed_with_any_of_the_following_health_issues"
## [41] "Q12_A4_Have_you_ever_been_diagnosed_with_any_of_the_following_health_issues"
## [42] "Q12_A5_Have_you_ever_been_diagnosed_with_any_of_the_following_health_issues"
## [43] "Q12_A6_Have_you_ever_been_diagnosed_with_any_of_the_following_health_issues"
## [44] "Q12_A7_Have_you_ever_been_diagnosed_with_any_of_the_following_health_issues"
## [45] "Q12_A8_Have_you_ever_been_diagnosed_with_any_of_the_following_health_issues"
## [46] "Q12_A9_Have_you_ever_been_diagnosed_with_any_of_the_following_health_issues"
## [47] "Q12_A10_Have_you_ever_been_diagnosed_with_any_of_the_following_health_issues"
#main outcome of interest is interest in living indefinitely long if no physical and mental decline
d$Q2_If_you_could_be_physically_mentally_the_same_as_your_20s_how_long_would_you_like_to_live %>% table2()
d$liveforever_ord = d$Q2_If_you_could_be_physically_mentally_the_same_as_your_20s_how_long_would_you_like_to_live %>% ordered(levels = c("85", "120", "150", "Unlimited"))
d$liveforever = d$Q2_If_you_could_be_physically_mentally_the_same_as_your_20s_how_long_would_you_like_to_live == "Unlimited"
#recode
d$Age_Range %<>% ordered()
d$race = d$Ethnicity_Race %>% factor()
d$sex = d$Gender %>% factor()
d$Education %<>% ordered(levels = c("Grad school degree", "Professional degree", "4yr degree", "2yr degree", "Some college", "No college") %>% rev())
d$Science_interest %<>% as.numeric()
d$Life_Satisfaction %<>% as.numeric()
d$Optimism %<>% as.numeric()
d$Self_Esteem %<>% as.numeric()
d$expected_longevity %<>% as.numeric()
d$Religiosity %<>% as.numeric()
#age x sex
GG_group_means(d, "liveforever", groupvar = "Age_Range", subgroupvar = "sex")
#race x sex
GG_group_means(d, "liveforever", groupvar = "race", subgroupvar = "sex") +
theme(axis.text.x = element_text(angle = -20, hjust = 0))
## Warning in qt(1 - ((1 - CI)/2), df = as.numeric(x[4]) - 1): NaNs produced
## Warning in qt(1 - ((1 - CI)/2), df = as.numeric(x[4]) - 1): NaNs produced
## Warning: Removed 2 rows containing missing values (geom_errorbar).
#education?
GG_group_means(d, "liveforever", "Education")
GG_group_means(d, "Years_Higher_Ed", "liveforever_ord")
#science interest?
GG_group_means(d, "liveforever", "Science_interest")
#life satisfaction?
GG_group_means(d, "liveforever", "Life_Satisfaction")
#religion
GG_group_means(d, "liveforever", "Religiosity")
#expected lifespan
GG_group_means(d, "liveforever", "expected_longevity")
#optimism
GG_group_means(d, "liveforever", "Optimism")
#demographics
lrm(liveforever ~ age + sex + race, data = d)
## Logistic Regression Model
##
## lrm(formula = liveforever ~ age + sex + race, data = d)
##
## Model Likelihood Discrimination Rank Discrim.
## Ratio Test Indexes Indexes
## Obs 1000 LR chi2 10.03 R2 0.013 C 0.548
## FALSE 577 d.f. 9 g 0.207 Dxy 0.096
## TRUE 423 Pr(> chi2) 0.3477 gr 1.230 gamma 0.102
## max |deriv| 0.1 gp 0.049 tau-a 0.047
## Brier 0.242
##
## Coef S.E. Wald Z Pr(>|Z|)
## Intercept -0.6017 0.2527 -2.38 0.0173
## age 0.0005 0.0047 0.11 0.9160
## sex=Male 0.2036 0.1373 1.48 0.1381
## race=Asian-American 0.3818 0.3241 1.18 0.2388
## race=Hispanic/Latino-American 0.4778 0.3162 1.51 0.1307
## race=Indian-American -5.6582 22.6367 -0.25 0.8026
## race=Multi-racial 0.7418 0.3733 1.99 0.0469
## race=Native American 0.9494 0.9344 1.02 0.3096
## race=Other 0.1227 0.5628 0.22 0.8275
## race=White American 0.1635 0.2120 0.77 0.4405
##
#occupation + education + income
lrm(liveforever ~ age + sex + race + Career + Education + Income, data = d)
## Logistic Regression Model
##
## lrm(formula = liveforever ~ age + sex + race + Career + Education +
## Income, data = d)
##
## Model Likelihood Discrimination Rank Discrim.
## Ratio Test Indexes Indexes
## Obs 1000 LR chi2 38.63 R2 0.051 C 0.613
## FALSE 577 d.f. 30 g 0.460 Dxy 0.225
## TRUE 423 Pr(> chi2) 0.1343 gr 1.584 gamma 0.226
## max |deriv| 20 gp 0.106 tau-a 0.110
## Brier 0.235
##
## Coef S.E. Wald Z Pr(>|Z|)
## Intercept -1.2003 0.4982 -2.41 0.0160
## age -0.0025 0.0053 -0.47 0.6383
## sex=Male 0.2850 0.1467 1.94 0.0521
## race=Asian-American 0.3867 0.3326 1.16 0.2450
## race=Hispanic/Latino-American 0.3815 0.3274 1.17 0.2440
## race=Indian-American -5.2816 22.6385 -0.23 0.8155
## race=Multi-racial 0.7616 0.3881 1.96 0.0497
## race=Native American 0.9777 0.9928 0.98 0.3247
## race=Other 0.1624 0.5800 0.28 0.7794
## race=White American 0.0826 0.2175 0.38 0.7040
## Career=Advertising/Graphic design 0.5113 0.9653 0.53 0.5963
## Career=Arts and entertainment 1.0241 0.6312 1.62 0.1047
## Career=Clerical 0.9766 0.4819 2.03 0.0427
## Career=Healthcare 0.6415 0.4525 1.42 0.1563
## Career=Hospitality 0.9928 0.5473 1.81 0.0697
## Career=IT 0.1607 0.4909 0.33 0.7434
## Career=Legal 0.8405 0.9965 0.84 0.3990
## Career=Management 1.3074 0.4966 2.63 0.0085
## Career=Military 0.3647 0.6989 0.52 0.6018
## Career=Other 0.9835 0.3881 2.53 0.0113
## Career=Public safety 0.3714 0.9507 0.39 0.6960
## Career=Real estate 1.8477 1.2925 1.43 0.1528
## Career=Retail 0.0127 0.5175 0.02 0.9804
## Career=Small business owner 0.9528 0.4428 2.15 0.0314
## Career=Student 0.6472 0.4189 1.54 0.1224
## Education=Some college -0.1321 0.1824 -0.72 0.4689
## Education=2yr degree 0.1075 0.2644 0.41 0.6842
## Education=4yr degree -0.2932 0.2105 -1.39 0.1637
## Education=Professional degree -0.1911 0.5308 -0.36 0.7189
## Education=Grad school degree -0.3878 0.3150 -1.23 0.2182
## Income 0.0000 0.0000 1.29 0.1954
##
#alternative: years of education
lrm(liveforever ~ age + sex + race + Career + Years_Higher_Ed + Income, data = d)
## Logistic Regression Model
##
## lrm(formula = liveforever ~ age + sex + race + Career + Years_Higher_Ed +
## Income, data = d)
##
## Model Likelihood Discrimination Rank Discrim.
## Ratio Test Indexes Indexes
## Obs 1000 LR chi2 37.07 R2 0.049 C 0.610
## FALSE 577 d.f. 26 g 0.447 Dxy 0.220
## TRUE 423 Pr(> chi2) 0.0737 gr 1.563 gamma 0.220
## max |deriv| 20 gp 0.103 tau-a 0.107
## Brier 0.235
##
## Coef S.E. Wald Z Pr(>|Z|)
## Intercept -1.2085 0.4813 -2.51 0.0120
## age -0.0024 0.0053 -0.45 0.6499
## sex=Male 0.2797 0.1462 1.91 0.0557
## race=Asian-American 0.3774 0.3319 1.14 0.2554
## race=Hispanic/Latino-American 0.3826 0.3269 1.17 0.2418
## race=Indian-American -5.3583 22.6381 -0.24 0.8129
## race=Multi-racial 0.7548 0.3875 1.95 0.0514
## race=Native American 1.0805 0.9867 1.10 0.2734
## race=Other 0.1529 0.5750 0.27 0.7903
## race=White American 0.0816 0.2171 0.38 0.7072
## Career=Advertising/Graphic design 0.5235 0.9560 0.55 0.5840
## Career=Arts and entertainment 1.0198 0.6307 1.62 0.1059
## Career=Clerical 0.9876 0.4810 2.05 0.0401
## Career=Healthcare 0.6715 0.4506 1.49 0.1362
## Career=Hospitality 1.0218 0.5454 1.87 0.0610
## Career=IT 0.1795 0.4897 0.37 0.7139
## Career=Legal 0.9337 0.9911 0.94 0.3462
## Career=Management 1.3073 0.4946 2.64 0.0082
## Career=Military 0.4647 0.6925 0.67 0.5022
## Career=Other 0.9890 0.3861 2.56 0.0104
## Career=Public safety 0.3967 0.9488 0.42 0.6759
## Career=Real estate 1.8227 1.2916 1.41 0.1582
## Career=Retail 0.0283 0.5148 0.06 0.9561
## Career=Small business owner 0.9698 0.4404 2.20 0.0277
## Career=Student 0.6575 0.4132 1.59 0.1116
## Years_Higher_Ed -0.0613 0.0397 -1.54 0.1227
## Income 0.0000 0.0000 1.42 0.1542
##
#bunch of stuff
lrm(liveforever ~ age + sex + race + Career + Years_Higher_Ed + Income + Science_interest + Life_Satisfaction + Optimism + Self_Esteem + expected_longevity, data = d)
## Logistic Regression Model
##
## lrm(formula = liveforever ~ age + sex + race + Career + Years_Higher_Ed +
## Income + Science_interest + Life_Satisfaction + Optimism +
## Self_Esteem + expected_longevity, data = d)
##
## Model Likelihood Discrimination Rank Discrim.
## Ratio Test Indexes Indexes
## Obs 1000 LR chi2 96.08 R2 0.123 C 0.659
## FALSE 577 d.f. 31 g 0.736 Dxy 0.318
## TRUE 423 Pr(> chi2) <0.0001 gr 2.088 gamma 0.318
## max |deriv| 20 gp 0.161 tau-a 0.155
## Brier 0.222
##
## Coef S.E. Wald Z Pr(>|Z|)
## Intercept -3.4437 0.6027 -5.71 <0.0001
## age -0.0025 0.0055 -0.45 0.6507
## sex=Male 0.2916 0.1550 1.88 0.0599
## race=Asian-American 0.4533 0.3466 1.31 0.1910
## race=Hispanic/Latino-American 0.4360 0.3421 1.27 0.2025
## race=Indian-American -6.1725 22.6388 -0.27 0.7851
## race=Multi-racial 0.6298 0.4047 1.56 0.1197
## race=Native American 1.7049 1.0155 1.68 0.0932
## race=Other 0.3787 0.5858 0.65 0.5179
## race=White American 0.1965 0.2280 0.86 0.3888
## Career=Advertising/Graphic design 0.1178 0.9986 0.12 0.9061
## Career=Arts and entertainment 0.9512 0.6497 1.46 0.1432
## Career=Clerical 0.9920 0.4962 2.00 0.0456
## Career=Healthcare 0.6576 0.4651 1.41 0.1573
## Career=Hospitality 1.1754 0.5568 2.11 0.0348
## Career=IT 0.1801 0.5003 0.36 0.7188
## Career=Legal 0.7994 1.0143 0.79 0.4306
## Career=Management 1.3474 0.5057 2.66 0.0077
## Career=Military 0.5766 0.7209 0.80 0.4238
## Career=Other 1.0038 0.3977 2.52 0.0116
## Career=Public safety 0.4950 0.9571 0.52 0.6050
## Career=Real estate 1.0628 1.3769 0.77 0.4402
## Career=Retail 0.0188 0.5306 0.04 0.9718
## Career=Small business owner 0.8186 0.4559 1.80 0.0726
## Career=Student 0.6173 0.4254 1.45 0.1467
## Years_Higher_Ed -0.0699 0.0423 -1.65 0.0980
## Income 0.0000 0.0000 1.14 0.2528
## Science_interest 0.1437 0.0699 2.05 0.0400
## Life_Satisfaction -0.0846 0.0988 -0.86 0.3921
## Optimism -0.1370 0.0774 -1.77 0.0767
## Self_Esteem -0.0038 0.0705 -0.05 0.9568
## expected_longevity 0.0222 0.0035 6.42 <0.0001
##
#oh magic elastic net, give us a useful model that makes sense
set.seed(1)
lasso_fit = train(
factor(liveforever) ~ age + sex + race + Career + Years_Higher_Ed + Income + Science_interest + Life_Satisfaction + Optimism + Self_Esteem + expected_longevity + Children + Employment_Status + Relationship_Status + health + Religiosity + diseases + State,
data = d,
method = "glmnet",
trControl = trainControl(method = "repeatedcv", repeats = 5),
tuneLength = 5
)
#results
lasso_fit
## glmnet
##
## 1000 samples
## 18 predictors
## 2 classes: 'FALSE', 'TRUE'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 5 times)
## Summary of sample sizes: 900, 900, 901, 899, 900, 900, ...
## Resampling results across tuning parameters:
##
## alpha lambda Accuracy Kappa
## 0.100 9.511741e-05 0.6060622 0.1693412
## 0.100 4.414959e-04 0.6060622 0.1693412
## 0.100 2.049243e-03 0.6066702 0.1696919
## 0.100 9.511742e-03 0.6118644 0.1782904
## 0.100 4.414960e-02 0.6208370 0.1852773
## 0.325 9.511741e-05 0.6062602 0.1698039
## 0.325 4.414959e-04 0.6066622 0.1702979
## 0.325 2.049243e-03 0.6074602 0.1706385
## 0.325 9.511742e-03 0.6148446 0.1809314
## 0.325 4.414960e-02 0.6292191 0.1799524
## 0.550 9.511741e-05 0.6062582 0.1696513
## 0.550 4.414959e-04 0.6066622 0.1699760
## 0.550 2.049243e-03 0.6092482 0.1731963
## 0.550 9.511742e-03 0.6182409 0.1839932
## 0.550 4.414960e-02 0.6276508 0.1569912
## 0.775 9.511741e-05 0.6062582 0.1697839
## 0.775 4.414959e-04 0.6066662 0.1700042
## 0.775 2.049243e-03 0.6112502 0.1769872
## 0.775 9.511742e-03 0.6236210 0.1912087
## 0.775 4.414960e-02 0.6266408 0.1455082
## 1.000 9.511741e-05 0.6072622 0.1717246
## 1.000 4.414959e-04 0.6064682 0.1695362
## 1.000 2.049243e-03 0.6126525 0.1796938
## 1.000 9.511742e-03 0.6248230 0.1890795
## 1.000 4.414960e-02 0.6184206 0.1219997
##
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were alpha = 0.325 and lambda
## = 0.0441496.
#which predictors
coef(lasso_fit$finalModel, lasso_fit$bestTune$lambda)
## 99 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) -1.607671e+00
## age .
## sexMale 4.933121e-02
## raceAsian-American .
## raceHispanic/Latino-American 7.394128e-03
## raceIndian-American -1.437955e-01
## raceMulti-racial 1.455951e-01
## raceNative American 2.459370e-01
## raceOther .
## raceWhite American .
## CareerAdvertising/Graphic design -3.333310e-03
## CareerArts and entertainment 6.897849e-04
## CareerClerical .
## CareerHealthcare .
## CareerHospitality .
## CareerIT -1.508079e-01
## CareerLegal .
## CareerManagement 3.912302e-01
## CareerMilitary .
## CareerOther 1.716653e-01
## CareerPublic safety .
## CareerReal estate .
## CareerRetail -2.346164e-01
## CareerSmall business owner .
## CareerStudent .
## Years_Higher_Ed -2.252624e-02
## Income 9.687431e-09
## Science_interest 7.221984e-02
## Life_Satisfaction -2.357435e-02
## Optimism -3.949394e-02
## Self_Esteem .
## expected_longevity 1.491291e-02
## Children1 -3.528126e-01
## Children2 .
## Children3 .
## Children4 -9.796371e-02
## Children5+ 9.486326e-04
## Employment_StatusPart Time -1.044687e-01
## Employment_StatusRetired .
## Employment_StatusStudent .
## Employment_StatusUnemployed .
## Relationship_StatusEngaged .
## Relationship_StatusIt's complicated .
## Relationship_StatusLiving with a significant other -2.852810e-02
## Relationship_StatusMarried .
## Relationship_StatusSingle .
## Relationship_StatusWidowed .
## health -1.066967e-02
## Religiosity -7.177290e-02
## diseases .
## StateAlaska 7.692897e-01
## StateArizona 3.359407e-01
## StateArkansas 4.154366e-02
## StateCalifornia .
## StateColorado 3.343543e-01
## StateConnecticut .
## StateDelaware .
## StateFlorida .
## StateGeorgia .
## StateHawaii .
## StateIdaho .
## StateIllinois -2.615531e-01
## StateIndiana 4.293655e-01
## StateIowa .
## StateKansas .
## StateKentucky 2.916981e-01
## StateLouisiana .
## StateMaine .
## StateMaryland .
## StateMassachusetts .
## StateMichigan 2.413585e-01
## StateMinnesota .
## StateMississippi 1.041464e+00
## StateMissouri -4.586109e-02
## StateMontana 9.861931e-01
## StateNebraska -1.657738e-01
## StateNevada .
## StateNew Hampshire .
## StateNew Jersey .
## StateNew Mexico 1.937324e-02
## StateNew York 2.886469e-01
## StateNorth Carolina .
## StateNorth Dakota .
## StateOhio .
## StateOklahoma 4.781194e-01
## StateOregon -2.199041e-01
## StatePennsylvania -2.972974e-01
## StateRhode Island 6.331001e-01
## StateSouth Carolina .
## StateSouth Dakota .
## StateTennessee 1.246740e+00
## StateTexas .
## StateUtah .
## StateVermont -3.300403e-01
## StateVirginia 8.493022e-02
## StateWashington .
## StateWest Virginia .
## StateWisconsin -4.151013e-01
## StateWyoming -1.386743e-01
Apparently, it seems fairly random who wants to live forever. Even a quite expensive model based on 18 predictors (99 when expanded to dummies) was unable to do much better than chance, with kappa about 0.19. If we were to attempt an interpretation based on the largest coefficients:
If this doesn’t strike you as the typical autistic longegivty-type stereotype, then maybe that is why the authors didn’t do any fancy models either, and why they just put out a brief paper with the open data.