knitr::opts_chunk$set(echo = TRUE)
options(scipen=999, digits=6)
source("xval.glm.R")
library(foreach)
library(doParallel)
## Loading required package: iterators
## Loading required package: parallel
library(ggplot2)
library(gridExtra)
dt = read.csv("3_Study1data.csv")
dtg = subset(dt, goodsub==1)
dtg0 = dtg
dtg$gender_z = factor(dtg$gender_z)
dtg$education_z = factor(dtg$education_z)
dtg$income_z = factor(dtg$income_z)

models = vector(mode = "list", length = 2)
models[[1]] = socialdistancing_z ~ 1 + age_z + gender_z + education_z + income_z + PHQ_z + GAD_z + PSQI_z
models[[2]] = socialdistancing_z ~ 1 + age_z + gender_z + education_z + income_z + PHQ_z + GAD_z + PSQI_z + K_z
models[[3]] = socialdistancing_z ~ 1 + age_z + gender_z + education_z + income_z + PHQ_z + GAD_z + PSQI_z + poly(K_z, 3)

without dummy

This gives almost identical results as Table 1.

fit1 = glm(models[[1]], data = dtg0)
summary(fit1)
## 
## Call:
## glm(formula = models[[1]], data = dtg0)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -4.254  -0.316   0.225   0.544   1.830  
## 
## Coefficients:
##                          Estimate            Std. Error t value  Pr(>|t|)    
## (Intercept)  0.000000000000000238  0.045750997700160737    0.00    1.0000    
## age_z        0.122723926121425769  0.048147402538218843    2.55    0.0112 *  
## gender_z    -0.151033565436406003  0.046670112272815187   -3.24    0.0013 ** 
## education_z  0.029006515206507211  0.048986728214616568    0.59    0.5541    
## income_z     0.008717184236623998  0.048161295346713880    0.18    0.8565    
## PHQ_z       -0.444020532142960056  0.094775203699089233   -4.68 0.0000039 ***
## GAD_z        0.178342826068621452  0.089405065648086410    1.99    0.0468 *  
## PSQI_z      -0.080594799761740263  0.061678946084681469   -1.31    0.1921    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 0.830982)
## 
##     Null deviance: 396.00  on 396  degrees of freedom
## Residual deviance: 323.25  on 389  degrees of freedom
## AIC: 1063
## 
## Number of Fisher Scoring iterations: 2
fit2 = glm(models[[2]], data = dtg0)
summary(fit2)
## 
## Call:
## glm(formula = models[[2]], data = dtg0)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -4.318  -0.310   0.218   0.545   1.714  
## 
## Coefficients:
##                          Estimate            Std. Error t value Pr(>|t|)    
## (Intercept)  0.000000000000000269  0.045029255345376072    0.00  1.00000    
## age_z        0.147985386398776481  0.047881467571003672    3.09  0.00214 ** 
## gender_z    -0.135579243436681651  0.046125056290229456   -2.94  0.00349 ** 
## education_z  0.037016852595417815  0.048262952492987218    0.77  0.44356    
## income_z    -0.005059552761494530  0.047548834565387027   -0.11  0.91531    
## PHQ_z       -0.352424729170637552  0.096537226921704897   -3.65  0.00030 ***
## GAD_z        0.151528952645651610  0.088295208640466236    1.72  0.08693 .  
## PSQI_z      -0.078338728977675270  0.060709022517028727   -1.29  0.19768    
## K_z          0.182098976063317874  0.049433155354782710    3.68  0.00026 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 0.804971)
## 
##     Null deviance: 396.00  on 396  degrees of freedom
## Residual deviance: 312.33  on 388  degrees of freedom
## AIC: 1051
## 
## Number of Fisher Scoring iterations: 2

CV and predictability (note: without dummy)

output = xval.glm(data = dtg0, models)
## Running Cross-validation...done [ 12.7 sec ]

## Results for (10-fold, 200 repeats)
##   Model:                                          |   Wins   |    2.5% |    mean |   97.5% |
##   [ 1] socialdistancing_z ~ 1 + age_z + gender_   |     0%   |   0.917 |   0.922 |   0.929 |
##   [ 2] socialdistancing_z ~ 1 + age_z + gender_   |   100%   |   0.903 |   0.909 |   0.916 |
##   [ 3] socialdistancing_z ~ 1 + age_z + gender_   |     0%   |   0.907 |   0.914 |   0.923 |

with dummy

For K, no big difference from the results without dummy.

fit1 = glm(models[[1]], data = dtg)
summary(fit1)
## 
## Call:
## glm(formula = models[[1]], data = dtg)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -4.191  -0.329   0.224   0.575   1.797  
## 
## Coefficients:
##                               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                     0.2362     0.1173    2.01   0.0447 *  
## age_z                           0.1173     0.0481    2.44   0.0151 *  
## gender_z0.996227203217905      -0.2843     0.0931   -3.05   0.0024 ** 
## education_z-0.104987458573891  -0.1358     0.1202   -1.13   0.2592    
## education_z1.19751319935845    -0.1036     0.1496   -0.69   0.4890    
## education_z2.50001385729079     0.3782     0.2506    1.51   0.1321    
## income_z0.365413191926448      -0.0453     0.1027   -0.44   0.6594    
## income_z1.99540237389049        0.1026     0.1731    0.59   0.5538    
## PHQ_z                          -0.4107     0.0952   -4.31  0.00002 ***
## GAD_z                           0.1568     0.0895    1.75   0.0805 .  
## PSQI_z                         -0.0973     0.0623   -1.56   0.1190    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 0.823252)
## 
##     Null deviance: 396.00  on 396  degrees of freedom
## Residual deviance: 317.78  on 386  degrees of freedom
## AIC: 1062
## 
## Number of Fisher Scoring iterations: 2
fit2 = glm(models[[2]], data = dtg)
summary(fit2)
## 
## Call:
## glm(formula = models[[2]], data = dtg)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -4.281  -0.303   0.213   0.545   1.687  
## 
## Coefficients:
##                               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                     0.2030     0.1161    1.75  0.08119 .  
## age_z                           0.1423     0.0480    2.97  0.00321 ** 
## gender_z0.996227203217905      -0.2572     0.0922   -2.79  0.00554 ** 
## education_z-0.104987458573891  -0.1083     0.1189   -0.91  0.36257    
## education_z1.19751319935845    -0.0596     0.1481   -0.40  0.68756    
## education_z2.50001385729079     0.3794     0.2472    1.53  0.12563    
## income_z0.365413191926448      -0.0402     0.1013   -0.40  0.69156    
## income_z1.99540237389049        0.0308     0.1721    0.18  0.85806    
## PHQ_z                          -0.3303     0.0968   -3.41  0.00071 ***
## GAD_z                           0.1356     0.0885    1.53  0.12633    
## PSQI_z                         -0.0942     0.0615   -1.53  0.12605    
## K_z                             0.1700     0.0498    3.41  0.00071 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 0.801166)
## 
##     Null deviance: 396.00  on 396  degrees of freedom
## Residual deviance: 308.45  on 385  degrees of freedom
## AIC: 1052
## 
## Number of Fisher Scoring iterations: 2
fit3 = glm(models[[3]], data = dtg)
summary(fit3)
## 
## Call:
## glm(formula = models[[3]], data = dtg)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -4.285  -0.305   0.209   0.546   1.684  
## 
## Coefficients:
##                               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                     0.2034     0.1165    1.75  0.08146 .  
## age_z                           0.1421     0.0481    2.95  0.00333 ** 
## gender_z0.996227203217905      -0.2572     0.0925   -2.78  0.00572 ** 
## education_z-0.104987458573891  -0.1090     0.1194   -0.91  0.36198    
## education_z1.19751319935845    -0.0589     0.1491   -0.39  0.69308    
## education_z2.50001385729079     0.3780     0.2486    1.52  0.12924    
## income_z0.365413191926448      -0.0407     0.1026   -0.40  0.69193    
## income_z1.99540237389049        0.0313     0.1726    0.18  0.85616    
## PHQ_z                          -0.3295     0.0976   -3.37  0.00082 ***
## GAD_z                           0.1353     0.0888    1.52  0.12840    
## PSQI_z                         -0.0943     0.0617   -1.53  0.12716    
## poly(K_z, 3)1                   3.3878     0.9953    3.40  0.00073 ***
## poly(K_z, 3)2                  -0.0940     0.9159   -0.10  0.91828    
## poly(K_z, 3)3                  -0.0587     0.9117   -0.06  0.94868    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 0.805318)
## 
##     Null deviance: 396.00  on 396  degrees of freedom
## Residual deviance: 308.44  on 383  degrees of freedom
## AIC: 1056
## 
## Number of Fisher Scoring iterations: 2

CV and predictability (with dummy)

output = xval.glm(data = dtg0, models)
## Running Cross-validation...done [ 12.7 sec ]

## Results for (10-fold, 200 repeats)
##   Model:                                          |   Wins   |    2.5% |    mean |   97.5% |
##   [ 1] socialdistancing_z ~ 1 + age_z + gender_   |     0%   |   0.917 |   0.922 |   0.930 |
##   [ 2] socialdistancing_z ~ 1 + age_z + gender_   |   100%   |   0.903 |   0.909 |   0.916 |
##   [ 3] socialdistancing_z ~ 1 + age_z + gender_   |     0%   |   0.906 |   0.914 |   0.921 |