HW2

Estimate the random intercept model for your outcome, use at least five individual level predictors.
1. Report the variance components of the model and describe the results of the model.

ESS1_7e01 <- read_csv("ESS Germany 2016/ESS1-7e01.csv") # reading in the data

## Parsed with column specification:
## cols(
##   .default = col_character(),
##   cedition = col_double(),
##   cseqno = col_integer(),
##   essround = col_integer(),
##   edition = col_double(),
##   idno = col_integer(),
##   dweight = col_double(),
##   pspwght = col_double(),
##   pweight = col_double(),
##   tvtot = col_integer(),
##   tvpol = col_integer(),
##   rdtot = col_integer(),
##   rdpol = col_integer(),
##   nwsptot = col_integer(),
##   nwsppol = col_integer(),
##   ppltrst = col_integer(),
##   pplfair = col_integer(),
##   pplhlp = col_integer(),
##   polintr = col_integer(),
##   polcmpl = col_integer(),
##   poldcs = col_integer()
##   # ... with 254 more columns
## )

## See spec(...) for full column specifications.

sa <- c("CO", "BR", "AR")# setting the categories for origins
swas <- c("AF", "IN", "LK", "TJ", "TM", "UZ", "KZ", "PK")
seas <- c("CN", "TH", "VN")
am <- c("US",  "CA")
pi <- c("ID", "PH")
eu <- c("AU", "CH", "ES", "FI", "FR", "GB", "HU", "NL", "NO", "PL", "PT",
        "SK", "EE", "BG", "RO", "TR", "GR", "IT", "AT", "IE")
af <- c("BF", "GH", "GM", "MA", "MZ", "TN", "UG", "EG", "CG")
me <- c("IQ", "LB", "SA", "SY", "IR")
rs <- c("AL", "AM", "AZ", "BA", "BY", "CZ", "HR", "KG", "LT", "LV", "MC",
        "RU", "UA", "CS", "02", "03", "04", "MK")

hw1 <- ESS1_7e01 %>% #tidying the data.
  select(ctzcntr, livecntr, edulvla, edulvlpa, edulvlma, pdwrk, wkhtot,
         uemp3m, hinctnt, health, ctzshipa, cntbrtha, gndr, yrbrn, rlgblg, dscrgrp, jbspv) %>% 
  filter(livecntr <= 5, hinctnt <= 12, wkhtot <= 665, cntbrtha != c("66", "99"), edulvlpa <= 54, 
         edulvlma <= 54) %>%  # removing livecntr, hinctr, edilvlpa, edulvlms, cntbrtha and wkhtot NAs)
  mutate(
    origin = ifelse(cntbrtha %in% sa, "SA",
                    ifelse(cntbrtha %in% swas, "SWAS",
                           ifelse(cntbrtha %in% seas, "SEAS",
                                  ifelse(cntbrtha %in% am, "AM",
                                         ifelse(cntbrtha %in% pi, "PI",
                                                ifelse(cntbrtha %in% eu, "EU",
                                                       ifelse(cntbrtha %in% af, "AF",
                                                              ifelse(cntbrtha %in% me, "ME",
                                                                     ifelse(cntbrtha %in% rs, "RS", NA))))))))),
    age = ifelse(2018 - yrbrn >=25, 1 ,0),#setting all variables to binary
    gndr = ifelse(gndr > 1, 0, 1),
    livecntr = ifelse(livecntr >= 3, 1, 0),
    edulvla = ifelse(edulvla >= 3, 1, 0),
    edulvlpa = ifelse(edulvlpa >= 3, 1, 0),
    edulvlma = ifelse(edulvlma >= 3, 1, 0),
    wkhtot = ifelse(wkhtot >= 40, 1, 0),
    health = ifelse(health <= 3, 1, 0),
    rlgblg = ifelse(rlgblg == 1, 1, 0),
    hinctnt = ifelse(hinctnt <= 3, 0, 1)
  )%>%
  filter(age >= 0) %>% 
  select(hinctnt, origin, gndr, age, ctzcntr:pdwrk, wkhtot, uemp3m, health, rlgblg, dscrgrp, jbspv) %>% 
  arrange(origin) %>% 
  print(hw1)

## Warning: package 'bindrcpp' was built under R version 3.4.4

## # A tibble: 286 x 16
##    hinctnt origin  gndr   age ctzcntr livecntr edulvla edulvlpa edulvlma
##      <dbl> <chr>  <dbl> <dbl>   <int>    <dbl>   <dbl>    <dbl>    <dbl>
##  1       1 AF         0     1       2        0       1        1        1
##  2       1 AF         1     1       1        1       1        1        0
##  3       1 AF         1     1       2        1       1        0        0
##  4       1 AF         1     1       1        1       1        1        0
##  5       1 AF         0     1       1        1       1        1        1
##  6       1 AF         0     0       2        1       1        1        1
##  7       0 AF         1     1       1        1       1        1        0
##  8       1 AF         0     1       2        0       1        1        1
##  9       1 AM         1     1       2        1       1        1        1
## 10       1 AM         0     1       2        1       1        1        1
## # ... with 276 more rows, and 7 more variables: pdwrk <int>, wkhtot <dbl>,
## #   uemp3m <int>, health <dbl>, rlgblg <dbl>, dscrgrp <int>, jbspv <int>

hw2 <- hw1 %>% #fixing hw1's data to make hw2 easier
  select(origin, gndr, age, ctzcntr, edulvla, edulvlpa, edulvlma, health, rlgblg)

rm(ESS1_7e01); gc() #cleaning up unused data

##           used (Mb) gc trigger  (Mb) max used  (Mb)
## Ncells 1852869 99.0    3205452 171.2  3205452 171.2
## Vcells 2048526 15.7   40525628 309.2 46723265 356.5

fit1 <- glmer(gndr ~ age  + ctzcntr + edulvla + health + edulvlpa + edulvlma + rlgblg + 
                (1|origin), data = hw2, family = binomial, subset= complete.cases(hw2) )
icc(fit1)

## 
## Generalized linear mixed model
## 
## Family : binomial (logit)
## Formula: gndr ~ age + ctzcntr + edulvla + health + edulvlpa + edulvlma + rlgblg + (1 | origin)
## 
##   ICC (origin): 0.0136

summary(fit1)

## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: gndr ~ age + ctzcntr + edulvla + health + edulvlpa + edulvlma +  
##     rlgblg + (1 | origin)
##    Data: hw2
##  Subset: complete.cases(hw2)
## 
##      AIC      BIC   logLik deviance df.resid 
##    388.0    420.8   -185.0    370.0      274 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -2.2236 -0.9176  0.4776  0.9127  1.8086 
## 
## Random effects:
##  Groups Name        Variance Std.Dev.
##  origin (Intercept) 0.04539  0.213   
## Number of obs: 283, groups:  origin, 9
## 
## Fixed effects:
##             Estimate Std. Error z value Pr(>|z|)   
## (Intercept)   2.3578     1.4581   1.617  0.10585   
## age          -1.0650     1.2204  -0.873  0.38288   
## ctzcntr      -0.3546     0.2828  -1.254  0.20993   
## edulvla       0.6218     0.3859   1.611  0.10711   
## health        0.1068     0.4017   0.266  0.79036   
## edulvlpa     -1.0547     0.3705  -2.847  0.00442 **
## edulvlma     -0.5951     0.2695  -2.208  0.02724 * 
## rlgblg       -0.3914     0.2813  -1.391  0.16418   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##          (Intr) age    ctzcnt edulvl health edlvlp edlvlm
## age      -0.869                                          
## ctzcntr  -0.340  0.059                                   
## edulvla  -0.227  0.057  0.047                            
## health   -0.256  0.031 -0.024 -0.008                     
## edulvlpa -0.189  0.010  0.210 -0.241 -0.070              
## edulvlma  0.047 -0.037 -0.098 -0.169 -0.055 -0.222       
## rlgblg   -0.236  0.081 -0.027  0.066  0.056  0.030  0.150

Variance Components:

origin (Intercept) 0.04539
Residual: 3.2899
ICC (origin): 0.0136

This model was built by subdividing immigrants to Germany, by region of their origin (regions are: South America (SA), South Western Asia (SWAS), South Eastern Asia (SEAS), America (AM)[having a variable named NA (for North America) seemed wrong], Pacific Islands (PI), European Union (EU), Africa (AF), Middle East (ME), Russian Federation and former Russian controlled countries (RU)) and testing to see if we could reduce the vairiability between men and women by controlling by the region thaey originated from. The answer is no, the variation between genders is much stronger within the groups than it is between the groups. Heere is a reminder of what each of the variables are: The other variables I’ve selected from the data are:
* ctzcntr: Citizen of country (1 = Y, 2 = N)
* livecntr How long ago did you immigrate (>= 3 yrs = 1, <3 = 0)
* edulvla Highest level of education (upper secondary education or better = 1, else 0)
* edulvlpa highest education of father, same determinant as above
* edulvlma highest education of mother, same determinant as above
* pdwrk Have you been paid for work in last seven days (1 = Y, 0 = N)
* wkhtot Total number hours normally worked per week. (>=40 = 1, <40 = 0)
* uemp3m Have you been unemployed and looking for work for three or months (1 =Y, 0 =N)
* hinctnt Total household income (reported in deciles, 0 = lower three deciles, else 1 )
* health How is your health (fair or better = 1, else 0)
* age actual age (>= 25 = 1, else 0)
* rlgblg Do you belong to a particular religous denom. (1 = Y, 0 = N)
* dscrgrp Do you feel as if you’ve been discriminated against (1 = Y, 0 = N)
* jbspv Responsible for supervising others (1 = Y, 0 = N)

Report the results for the fixed effects
The only variables of significance were education levels of both parents, pvalues of father=0.00442 and mother pvalue 0.02724. All of the other variables were not signficant contributors.

Estimate another multilevel model where you include random slopes into the model for at least one of your individual level predictors

fit2 <- glmer(gndr ~ age  + ctzcntr + edulvla + health + edulvlpa + edulvlma + rlgblg + 
                (1+health|origin), data = hw2, family = binomial, subset= complete.cases(hw2) )

Note: I tried this model with each of the individual predictors and all failed to converge but the health predictor. A reminder that Health has been transformed into a binary variable with a 1 indicating that a person feels they are at least in fair health or better and a 0 indicating they feel they feel they are below fair health.

Report the variance components of the model

summary(fit2)

## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: gndr ~ age + ctzcntr + edulvla + health + edulvlpa + edulvlma +  
##     rlgblg + (1 + health | origin)
##    Data: hw2
##  Subset: complete.cases(hw2)
## 
##      AIC      BIC   logLik deviance df.resid 
##    391.7    431.8   -184.9    369.7      272 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -2.2097 -0.9181  0.4808  0.9052  1.7906 
## 
## Random effects:
##  Groups Name        Variance Std.Dev. Corr 
##  origin (Intercept) 0.2393   0.4892        
##         health      0.0812   0.2849   -1.00
## Number of obs: 283, groups:  origin, 9
## 
## Fixed effects:
##             Estimate Std. Error z value Pr(>|z|)   
## (Intercept)   2.2670     1.4847   1.527  0.12679   
## age          -1.0639     1.2158  -0.875  0.38151   
## ctzcntr      -0.3703     0.2818  -1.314  0.18894   
## edulvla       0.6010     0.3886   1.546  0.12200   
## health        0.2289     0.5089   0.450  0.65289   
## edulvlpa     -1.0541     0.3712  -2.840  0.00451 **
## edulvlma     -0.5816     0.2709  -2.147  0.03179 * 
## rlgblg       -0.3932     0.2814  -1.397  0.16236   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##          (Intr) age    ctzcnt edulvl health edlvlp edlvlm
## age      -0.847                                          
## ctzcntr  -0.309  0.050                                   
## edulvla  -0.208  0.056  0.057                            
## health   -0.325  0.020 -0.075 -0.056                     
## edulvlpa -0.187  0.010  0.209 -0.239 -0.051              
## edulvlma  0.033 -0.035 -0.105 -0.176  0.003 -0.221       
## rlgblg   -0.224  0.078 -0.034  0.065  0.032  0.031  0.148

icc(fit2)

## Caution! ICC for random-slope-intercept models usually not meaningful. See 'Note' in `?icc`.

## 
## Generalized linear mixed model
## 
## Family : binomial (logit)
## Formula: gndr ~ age + ctzcntr + edulvla + health + edulvlpa + edulvlma + rlgblg + (1 + health | origin)
## 
##   ICC (origin): 0.0678

Variance Components

origin (Intercept) 0.2393
Residual: 3.2899
health 0.0812
ICC (origin): 0.0678

Here we see that the variance for the intercept has increased, as has the ICC, but both are fairly small.

b. What are the results of the model?

We see very little change when we account for the average effect of health. I will note that the correlation coeficient is -1.00, an indicator that this model has probably failed to converge, or at the very least it is not a very good fit. I’ve tried every other variable and have not been able to get the model to converge. If I had more time I would try this with another dependent variable and/or another data set.

Does it fit better than the model from part 1?

anova(fit1, fit2)

## Data: hw2
## Subset: complete.cases(hw2)
## Models:
## fit1: gndr ~ age + ctzcntr + edulvla + health + edulvlpa + edulvlma + 
## fit1:     rlgblg + (1 | origin)
## fit2: gndr ~ age + ctzcntr + edulvla + health + edulvlpa + edulvlma + 
## fit2:     rlgblg + (1 + health | origin)
##      Df    AIC    BIC  logLik deviance  Chisq Chi Df Pr(>Chisq)
## fit1  9 388.04 420.85 -185.02   370.04                         
## fit2 11 391.72 431.82 -184.86   369.72 0.3142      2     0.8546

Here we see that the AIC and BIC values have increased, the loglilkelihood and deviance are slightly better and a p-value of 0.8546, there is insufficient evidence to support the claim that the second model fits better than the first.

Overall, my selection of variables for this data was poor, and I will be moving to a different data set for future exercises.

“Bonus”

Construct a plot of the slope model

#getting random effects
rancoefs2 <- ranef(fit2)
#random effects from origins
head(rancoefs2$origin, n=9)

##      (Intercept)      health
## AF    0.05995647 -0.03492390
## AM   -0.14271357  0.08312889
## EU    0.41286432 -0.24048837
## ME    0.32128186 -0.18714272
## PI   -0.02982158  0.01737070
## RS   -0.29060153  0.16927181
## SA   -0.10358358  0.06033616
## SEAS -0.04105017  0.02391122
## SWAS -0.18426680  0.10733314

#histogram of the random effects
par(mfrow = c(1,2))
hist(rancoefs2$origin[,"(Intercept)"], xlab = "Intercept Random Effect", main = "Distribution of Random Intercepts")
hist(rancoefs2$origin[,"health"], xlab = "Health Slope Random Effect", main = "Distribution of Random Slopes")

With the negative correlation coeficient we would expect to see a negative trend

par(mfrow = c(1,1))
plot(rancoefs2$origin[,"(Intercept)"], rancoefs2$origin[,"health"], main = "Random effects for each Origin", xlab = "Intercept", ylab = "Health Slope")

With the -1 correlation coefficient, we expect this model to look a little TOO perfect, and it does, I expect the number of categories is probably too small. I went back and redid the whole model by country of origin and not by region, increasing the number of groups from 9 to 70, and could not find a single model to converge.

par(mfrow = c(1,1))
fixef(fit2)

## (Intercept)         age     ctzcntr     edulvla      health    edulvlpa 
##   2.2669572  -1.0639162  -0.3702517   0.6010002   0.2288612  -1.0541238 
##    edulvlma      rlgblg 
##  -0.5815922  -0.3931565

summary(fixef(fit2)[1] + rancoefs2$origin)

##   (Intercept)        health     
##  Min.   :1.976   Min.   :2.026  
##  1st Qu.:2.124   1st Qu.:2.232  
##  Median :2.226   Median :2.291  
##  Mean   :2.267   Mean   :2.267  
##  3rd Qu.:2.327   3rd Qu.:2.350  
##  Max.   :2.680   Max.   :2.436

plot(NULL, ylim = c(1.5, 4.5), xlim = c(0, 1), ylab = "Gender", xlab = "Health Status")
title(main = "Regression Lines for each origin from Random Slope and Intercept Model")
cols = sample(rainbow(n=9), size = dim(rancoefs2$origin)[1], replace = T)
for (i in 1: dim(rancoefs2$origin)[1]) {
    abline(a = fixef(fit2)[1] + rancoefs2$origin[[1]][i], b = fixef(fit2)[5] + rancoefs2$origin[[2]][i], col = cols[i], lwd = .5)  
}

abline(a = fixef(fit2)[1], b = fixef(fit2)[5], col = 1, lwd = 3)
legend("topright", col = 1, lwd = 5, legend = "Average Effect of Health")

HW2

Craig, Scott

9/22/2018

Variance Components:

Variance Components