knitr::opts_chunk$set(echo = TRUE)
options(scipen=999, digits=6)
source("xval.glm.R")
library(foreach)
library(doParallel)
## Loading required package: iterators
## Loading required package: parallel
library(ggplot2)
library(gridExtra)
dt = read.csv("3_Study1data.csv")
dtg = subset(dt, goodsub==1)
dtg0 = dtg
dtg$gender_z = factor(dtg$gender_z)
dtg$education_z = factor(dtg$education_z)
dtg$income_z = factor(dtg$income_z)

models = vector(mode = "list", length = 2)
models[[1]] = socialdistancing_z ~ 1 + age_z + gender_z + education_z + income_z + PHQ_z + GAD_z + PSQI_z
models[[2]] = socialdistancing_z ~ 1 + age_z + gender_z + education_z + income_z + PHQ_z + GAD_z + PSQI_z + K_z
models[[3]] = socialdistancing_z ~ 1 + age_z + gender_z + education_z + income_z + PHQ_z + GAD_z + PSQI_z + poly(K_z, 2)

without dummy

This gives almost identical results as Table 1.

fit1 = glm(models[[1]], data = dtg0)
summary(fit1)
## 
## Call:
## glm(formula = models[[1]], data = dtg0)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -4.254  -0.316   0.225   0.544   1.830  
## 
## Coefficients:
##                          Estimate            Std. Error t value  Pr(>|t|)    
## (Intercept)  0.000000000000000238  0.045750997700160737    0.00    1.0000    
## age_z        0.122723926121425769  0.048147402538218843    2.55    0.0112 *  
## gender_z    -0.151033565436406003  0.046670112272815187   -3.24    0.0013 ** 
## education_z  0.029006515206507211  0.048986728214616568    0.59    0.5541    
## income_z     0.008717184236623998  0.048161295346713880    0.18    0.8565    
## PHQ_z       -0.444020532142960056  0.094775203699089233   -4.68 0.0000039 ***
## GAD_z        0.178342826068621452  0.089405065648086410    1.99    0.0468 *  
## PSQI_z      -0.080594799761740263  0.061678946084681469   -1.31    0.1921    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 0.830982)
## 
##     Null deviance: 396.00  on 396  degrees of freedom
## Residual deviance: 323.25  on 389  degrees of freedom
## AIC: 1063
## 
## Number of Fisher Scoring iterations: 2
fit2 = glm(models[[2]], data = dtg0)
summary(fit2)
## 
## Call:
## glm(formula = models[[2]], data = dtg0)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -4.318  -0.310   0.218   0.545   1.714  
## 
## Coefficients:
##                          Estimate            Std. Error t value Pr(>|t|)    
## (Intercept)  0.000000000000000269  0.045029255345376072    0.00  1.00000    
## age_z        0.147985386398776481  0.047881467571003672    3.09  0.00214 ** 
## gender_z    -0.135579243436681651  0.046125056290229456   -2.94  0.00349 ** 
## education_z  0.037016852595417815  0.048262952492987218    0.77  0.44356    
## income_z    -0.005059552761494530  0.047548834565387027   -0.11  0.91531    
## PHQ_z       -0.352424729170637552  0.096537226921704897   -3.65  0.00030 ***
## GAD_z        0.151528952645651610  0.088295208640466236    1.72  0.08693 .  
## PSQI_z      -0.078338728977675270  0.060709022517028727   -1.29  0.19768    
## K_z          0.182098976063317874  0.049433155354782710    3.68  0.00026 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 0.804971)
## 
##     Null deviance: 396.00  on 396  degrees of freedom
## Residual deviance: 312.33  on 388  degrees of freedom
## AIC: 1051
## 
## Number of Fisher Scoring iterations: 2

CV and predictability (note: without dummy)

output = xval.glm(data = dtg0, models)
## Running Cross-validation...done [ 12.5 sec ]

## Results for (10-fold, 200 repeats)
##   Model:                                          |   Wins   |    2.5% |    mean |   97.5% |
##   [ 1] socialdistancing_z ~ 1 + age_z + gender_   |     0%   |   0.917 |   0.923 |   0.930 |
##   [ 2] socialdistancing_z ~ 1 + age_z + gender_   |   100%   |   0.903 |   0.909 |   0.916 |
##   [ 3] socialdistancing_z ~ 1 + age_z + gender_   |     0%   |   0.905 |   0.911 |   0.919 |

with dummy

For K, no big difference from the results without dummy.

fit1 = glm(models[[1]], data = dtg)
summary(fit1)
## 
## Call:
## glm(formula = models[[1]], data = dtg)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -4.191  -0.329   0.224   0.575   1.797  
## 
## Coefficients:
##                               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                     0.2362     0.1173    2.01   0.0447 *  
## age_z                           0.1173     0.0481    2.44   0.0151 *  
## gender_z0.996227203217905      -0.2843     0.0931   -3.05   0.0024 ** 
## education_z-0.104987458573891  -0.1358     0.1202   -1.13   0.2592    
## education_z1.19751319935845    -0.1036     0.1496   -0.69   0.4890    
## education_z2.50001385729079     0.3782     0.2506    1.51   0.1321    
## income_z0.365413191926448      -0.0453     0.1027   -0.44   0.6594    
## income_z1.99540237389049        0.1026     0.1731    0.59   0.5538    
## PHQ_z                          -0.4107     0.0952   -4.31  0.00002 ***
## GAD_z                           0.1568     0.0895    1.75   0.0805 .  
## PSQI_z                         -0.0973     0.0623   -1.56   0.1190    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 0.823252)
## 
##     Null deviance: 396.00  on 396  degrees of freedom
## Residual deviance: 317.78  on 386  degrees of freedom
## AIC: 1062
## 
## Number of Fisher Scoring iterations: 2
fit2 = glm(models[[2]], data = dtg)
summary(fit2)
## 
## Call:
## glm(formula = models[[2]], data = dtg)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -4.281  -0.303   0.213   0.545   1.687  
## 
## Coefficients:
##                               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                     0.2030     0.1161    1.75  0.08119 .  
## age_z                           0.1423     0.0480    2.97  0.00321 ** 
## gender_z0.996227203217905      -0.2572     0.0922   -2.79  0.00554 ** 
## education_z-0.104987458573891  -0.1083     0.1189   -0.91  0.36257    
## education_z1.19751319935845    -0.0596     0.1481   -0.40  0.68756    
## education_z2.50001385729079     0.3794     0.2472    1.53  0.12563    
## income_z0.365413191926448      -0.0402     0.1013   -0.40  0.69156    
## income_z1.99540237389049        0.0308     0.1721    0.18  0.85806    
## PHQ_z                          -0.3303     0.0968   -3.41  0.00071 ***
## GAD_z                           0.1356     0.0885    1.53  0.12633    
## PSQI_z                         -0.0942     0.0615   -1.53  0.12605    
## K_z                             0.1700     0.0498    3.41  0.00071 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 0.801166)
## 
##     Null deviance: 396.00  on 396  degrees of freedom
## Residual deviance: 308.45  on 385  degrees of freedom
## AIC: 1052
## 
## Number of Fisher Scoring iterations: 2

CV and predictability (with dummy)

output = xval.glm(data = dtg0, models)
## Running Cross-validation...done [ 12.5 sec ]

## Results for (10-fold, 200 repeats)
##   Model:                                          |   Wins   |    2.5% |    mean |   97.5% |
##   [ 1] socialdistancing_z ~ 1 + age_z + gender_   |     0%   |   0.916 |   0.923 |   0.930 |
##   [ 2] socialdistancing_z ~ 1 + age_z + gender_   |   100%   |   0.902 |   0.909 |   0.917 |
##   [ 3] socialdistancing_z ~ 1 + age_z + gender_   |     0%   |   0.904 |   0.911 |   0.920 |