ESS1_7e01 <- read_csv("ESS Germany 2016/ESS1-7e01.csv") # reading in the data
## Parsed with column specification:
## cols(
## .default = col_character(),
## cedition = col_double(),
## cseqno = col_integer(),
## essround = col_integer(),
## edition = col_double(),
## idno = col_integer(),
## dweight = col_double(),
## pspwght = col_double(),
## pweight = col_double(),
## tvtot = col_integer(),
## tvpol = col_integer(),
## rdtot = col_integer(),
## rdpol = col_integer(),
## nwsptot = col_integer(),
## nwsppol = col_integer(),
## ppltrst = col_integer(),
## pplfair = col_integer(),
## pplhlp = col_integer(),
## polintr = col_integer(),
## polcmpl = col_integer(),
## poldcs = col_integer()
## # ... with 254 more columns
## )
## See spec(...) for full column specifications.
sa <- c("CO", "BR", "AR")# setting the categories for origins
swas <- c("AF", "IN", "LK", "TJ", "TM", "UZ", "KZ", "PK")
seas <- c("CN", "TH", "VN")
am <- c("US", "CA")
pi <- c("ID", "PH")
eu <- c("AU", "CH", "ES", "FI", "FR", "GB", "HU", "NL", "NO", "PL", "PT",
"SK", "EE", "BG", "RO", "TR", "GR", "IT", "AT", "IE")
af <- c("BF", "GH", "GM", "MA", "MZ", "TN", "UG", "EG", "CG")
me <- c("IQ", "LB", "SA", "SY", "IR")
rs <- c("AL", "AM", "AZ", "BA", "BY", "CZ", "HR", "KG", "LT", "LV", "MC",
"RU", "UA", "CS", "02", "03", "04", "MK")
hw1 <- ESS1_7e01 %>% #tidying the data.
select(ctzcntr, livecntr, edulvla, edulvlpa, edulvlma, pdwrk, wkhtot,
uemp3m, hinctnt, health, ctzshipa, cntbrtha, gndr, yrbrn, rlgblg, dscrgrp, jbspv) %>%
filter(livecntr <= 5, hinctnt <= 12, wkhtot <= 665, cntbrtha != c("66", "99"), edulvlpa <= 54,
edulvlma <= 54) %>% # removing livecntr, hinctr, edilvlpa, edulvlms, cntbrtha and wkhtot NAs)
mutate(
origin = ifelse(cntbrtha %in% sa, "SA",
ifelse(cntbrtha %in% swas, "SWAS",
ifelse(cntbrtha %in% seas, "SEAS",
ifelse(cntbrtha %in% am, "AM",
ifelse(cntbrtha %in% pi, "PI",
ifelse(cntbrtha %in% eu, "EU",
ifelse(cntbrtha %in% af, "AF",
ifelse(cntbrtha %in% me, "ME",
ifelse(cntbrtha %in% rs, "RS", NA))))))))),
age = ifelse(2018 - yrbrn >=25, 1 ,0),#setting all variables to binary
gndr = ifelse(gndr > 1, 0, 1),
livecntr = ifelse(livecntr >= 3, 1, 0),
edulvla = ifelse(edulvla >= 3, 1, 0),
edulvlpa = ifelse(edulvlpa >= 3, 1, 0),
edulvlma = ifelse(edulvlma >= 3, 1, 0),
wkhtot = ifelse(wkhtot >= 40, 1, 0),
health = ifelse(health <= 3, 1, 0),
rlgblg = ifelse(rlgblg == 1, 1, 0),
hinctnt = ifelse(hinctnt <= 3, 0, 1)
)%>%
filter(age >= 0) %>%
select(hinctnt, origin, gndr, age, ctzcntr:pdwrk, wkhtot, uemp3m, health, rlgblg, dscrgrp, jbspv) %>%
arrange(origin) %>%
print(hw1)
## Warning: package 'bindrcpp' was built under R version 3.4.4
## # A tibble: 286 x 16
## hinctnt origin gndr age ctzcntr livecntr edulvla edulvlpa edulvlma
## <dbl> <chr> <dbl> <dbl> <int> <dbl> <dbl> <dbl> <dbl>
## 1 1 AF 0 1 2 0 1 1 1
## 2 1 AF 1 1 1 1 1 1 0
## 3 1 AF 1 1 2 1 1 0 0
## 4 1 AF 1 1 1 1 1 1 0
## 5 1 AF 0 1 1 1 1 1 1
## 6 1 AF 0 0 2 1 1 1 1
## 7 0 AF 1 1 1 1 1 1 0
## 8 1 AF 0 1 2 0 1 1 1
## 9 1 AM 1 1 2 1 1 1 1
## 10 1 AM 0 1 2 1 1 1 1
## # ... with 276 more rows, and 7 more variables: pdwrk <int>, wkhtot <dbl>,
## # uemp3m <int>, health <dbl>, rlgblg <dbl>, dscrgrp <int>, jbspv <int>
hw2 <- hw1 %>% #fixing hw1's data to make hw2 easier
select(origin, gndr, age, ctzcntr, edulvla, edulvlpa, edulvlma, health, rlgblg)
rm(ESS1_7e01); gc() #cleaning up unused data
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 1852869 99.0 3205452 171.2 3205452 171.2
## Vcells 2048526 15.7 40525628 309.2 46723265 356.5
fit1 <- glmer(gndr ~ age + ctzcntr + edulvla + health + edulvlpa + edulvlma + rlgblg +
(1|origin), data = hw2, family = binomial, subset= complete.cases(hw2) )
icc(fit1)
##
## Generalized linear mixed model
##
## Family : binomial (logit)
## Formula: gndr ~ age + ctzcntr + edulvla + health + edulvlpa + edulvlma + rlgblg + (1 | origin)
##
## ICC (origin): 0.0136
summary(fit1)
## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: binomial ( logit )
## Formula: gndr ~ age + ctzcntr + edulvla + health + edulvlpa + edulvlma +
## rlgblg + (1 | origin)
## Data: hw2
## Subset: complete.cases(hw2)
##
## AIC BIC logLik deviance df.resid
## 388.0 420.8 -185.0 370.0 274
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -2.2236 -0.9176 0.4776 0.9127 1.8086
##
## Random effects:
## Groups Name Variance Std.Dev.
## origin (Intercept) 0.04539 0.213
## Number of obs: 283, groups: origin, 9
##
## Fixed effects:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.3578 1.4581 1.617 0.10585
## age -1.0650 1.2204 -0.873 0.38288
## ctzcntr -0.3546 0.2828 -1.254 0.20993
## edulvla 0.6218 0.3859 1.611 0.10711
## health 0.1068 0.4017 0.266 0.79036
## edulvlpa -1.0547 0.3705 -2.847 0.00442 **
## edulvlma -0.5951 0.2695 -2.208 0.02724 *
## rlgblg -0.3914 0.2813 -1.391 0.16418
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) age ctzcnt edulvl health edlvlp edlvlm
## age -0.869
## ctzcntr -0.340 0.059
## edulvla -0.227 0.057 0.047
## health -0.256 0.031 -0.024 -0.008
## edulvlpa -0.189 0.010 0.210 -0.241 -0.070
## edulvlma 0.047 -0.037 -0.098 -0.169 -0.055 -0.222
## rlgblg -0.236 0.081 -0.027 0.066 0.056 0.030 0.150
origin (Intercept) 0.04539
Residual: 3.2899
ICC (origin): 0.0136
This model was built by subdividing immigrants to Germany, by region of their origin (regions are: South America (SA), South Western Asia (SWAS), South Eastern Asia (SEAS), America (AM)[having a variable named NA (for North America) seemed wrong], Pacific Islands (PI), European Union (EU), Africa (AF), Middle East (ME), Russian Federation and former Russian controlled countries (RU)) and testing to see if we could reduce the vairiability between men and women by controlling by the region thaey originated from. The answer is no, the variation between genders is much stronger within the groups than it is between the groups. Heere is a reminder of what each of the variables are: The other variables I’ve selected from the data are:
* ctzcntr: Citizen of country (1 = Y, 2 = N)
* livecntr How long ago did you immigrate (>= 3 yrs = 1, <3 = 0)
* edulvla Highest level of education (upper secondary education or better = 1, else 0)
* edulvlpa highest education of father, same determinant as above
* edulvlma highest education of mother, same determinant as above
* pdwrk Have you been paid for work in last seven days (1 = Y, 0 = N)
* wkhtot Total number hours normally worked per week. (>=40 = 1, <40 = 0)
* uemp3m Have you been unemployed and looking for work for three or months (1 =Y, 0 =N)
* hinctnt Total household income (reported in deciles, 0 = lower three deciles, else 1 )
* health How is your health (fair or better = 1, else 0)
* age actual age (>= 25 = 1, else 0)
* rlgblg Do you belong to a particular religous denom. (1 = Y, 0 = N)
* dscrgrp Do you feel as if you’ve been discriminated against (1 = Y, 0 = N)
* jbspv Responsible for supervising others (1 = Y, 0 = N)
Estimate another multilevel model where you include random slopes into the model for at least one of your individual level predictors
fit2 <- glmer(gndr ~ age + ctzcntr + edulvla + health + edulvlpa + edulvlma + rlgblg +
(1+health|origin), data = hw2, family = binomial, subset= complete.cases(hw2) )
Note: I tried this model with each of the individual predictors and all failed to converge but the health predictor. A reminder that Health has been transformed into a binary variable with a 1 indicating that a person feels they are at least in fair health or better and a 0 indicating they feel they feel they are below fair health.
summary(fit2)
## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: binomial ( logit )
## Formula: gndr ~ age + ctzcntr + edulvla + health + edulvlpa + edulvlma +
## rlgblg + (1 + health | origin)
## Data: hw2
## Subset: complete.cases(hw2)
##
## AIC BIC logLik deviance df.resid
## 391.7 431.8 -184.9 369.7 272
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -2.2097 -0.9181 0.4808 0.9052 1.7906
##
## Random effects:
## Groups Name Variance Std.Dev. Corr
## origin (Intercept) 0.2393 0.4892
## health 0.0812 0.2849 -1.00
## Number of obs: 283, groups: origin, 9
##
## Fixed effects:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.2670 1.4847 1.527 0.12679
## age -1.0639 1.2158 -0.875 0.38151
## ctzcntr -0.3703 0.2818 -1.314 0.18894
## edulvla 0.6010 0.3886 1.546 0.12200
## health 0.2289 0.5089 0.450 0.65289
## edulvlpa -1.0541 0.3712 -2.840 0.00451 **
## edulvlma -0.5816 0.2709 -2.147 0.03179 *
## rlgblg -0.3932 0.2814 -1.397 0.16236
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) age ctzcnt edulvl health edlvlp edlvlm
## age -0.847
## ctzcntr -0.309 0.050
## edulvla -0.208 0.056 0.057
## health -0.325 0.020 -0.075 -0.056
## edulvlpa -0.187 0.010 0.209 -0.239 -0.051
## edulvlma 0.033 -0.035 -0.105 -0.176 0.003 -0.221
## rlgblg -0.224 0.078 -0.034 0.065 0.032 0.031 0.148
icc(fit2)
## Caution! ICC for random-slope-intercept models usually not meaningful. See 'Note' in `?icc`.
##
## Generalized linear mixed model
##
## Family : binomial (logit)
## Formula: gndr ~ age + ctzcntr + edulvla + health + edulvlpa + edulvlma + rlgblg + (1 + health | origin)
##
## ICC (origin): 0.0678
origin (Intercept) 0.2393
Residual: 3.2899
health 0.0812
ICC (origin): 0.0678
Here we see that the variance for the intercept has increased, as has the ICC, but both are fairly small.
b. What are the results of the model?
We see very little change when we account for the average effect of health. I will note that the correlation coeficient is -1.00, an indicator that this model has probably failed to converge, or at the very least it is not a very good fit. I’ve tried every other variable and have not been able to get the model to converge. If I had more time I would try this with another dependent variable and/or another data set.
anova(fit1, fit2)
## Data: hw2
## Subset: complete.cases(hw2)
## Models:
## fit1: gndr ~ age + ctzcntr + edulvla + health + edulvlpa + edulvlma +
## fit1: rlgblg + (1 | origin)
## fit2: gndr ~ age + ctzcntr + edulvla + health + edulvlpa + edulvlma +
## fit2: rlgblg + (1 + health | origin)
## Df AIC BIC logLik deviance Chisq Chi Df Pr(>Chisq)
## fit1 9 388.04 420.85 -185.02 370.04
## fit2 11 391.72 431.82 -184.86 369.72 0.3142 2 0.8546
Here we see that the AIC and BIC values have increased, the loglilkelihood and deviance are slightly better and a p-value of 0.8546, there is insufficient evidence to support the claim that the second model fits better than the first.
Overall, my selection of variables for this data was poor, and I will be moving to a different data set for future exercises.
“Bonus”
Construct a plot of the slope model
#getting random effects
rancoefs2 <- ranef(fit2)
#random effects from origins
head(rancoefs2$origin, n=9)
## (Intercept) health
## AF 0.05995647 -0.03492390
## AM -0.14271357 0.08312889
## EU 0.41286432 -0.24048837
## ME 0.32128186 -0.18714272
## PI -0.02982158 0.01737070
## RS -0.29060153 0.16927181
## SA -0.10358358 0.06033616
## SEAS -0.04105017 0.02391122
## SWAS -0.18426680 0.10733314
#histogram of the random effects
par(mfrow = c(1,2))
hist(rancoefs2$origin[,"(Intercept)"], xlab = "Intercept Random Effect", main = "Distribution of Random Intercepts")
hist(rancoefs2$origin[,"health"], xlab = "Health Slope Random Effect", main = "Distribution of Random Slopes")
With the negative correlation coeficient we would expect to see a negative trend
par(mfrow = c(1,1))
plot(rancoefs2$origin[,"(Intercept)"], rancoefs2$origin[,"health"], main = "Random effects for each Origin", xlab = "Intercept", ylab = "Health Slope")
With the -1 correlation coefficient, we expect this model to look a little TOO perfect, and it does, I expect the number of categories is probably too small. I went back and redid the whole model by country of origin and not by region, increasing the number of groups from 9 to 70, and could not find a single model to converge.
par(mfrow = c(1,1))
fixef(fit2)
## (Intercept) age ctzcntr edulvla health edulvlpa
## 2.2669572 -1.0639162 -0.3702517 0.6010002 0.2288612 -1.0541238
## edulvlma rlgblg
## -0.5815922 -0.3931565
summary(fixef(fit2)[1] + rancoefs2$origin)
## (Intercept) health
## Min. :1.976 Min. :2.026
## 1st Qu.:2.124 1st Qu.:2.232
## Median :2.226 Median :2.291
## Mean :2.267 Mean :2.267
## 3rd Qu.:2.327 3rd Qu.:2.350
## Max. :2.680 Max. :2.436
plot(NULL, ylim = c(1.5, 4.5), xlim = c(0, 1), ylab = "Gender", xlab = "Health Status")
title(main = "Regression Lines for each origin from Random Slope and Intercept Model")
cols = sample(rainbow(n=9), size = dim(rancoefs2$origin)[1], replace = T)
for (i in 1: dim(rancoefs2$origin)[1]) {
abline(a = fixef(fit2)[1] + rancoefs2$origin[[1]][i], b = fixef(fit2)[5] + rancoefs2$origin[[2]][i], col = cols[i], lwd = .5)
}
abline(a = fixef(fit2)[1], b = fixef(fit2)[5], col = 1, lwd = 3)
legend("topright", col = 1, lwd = 5, legend = "Average Effect of Health")