Binary exercise

PISA data 2009 from Columbia

# school-effects model for binary outcome
#

# set options
options(digits = 4, show.signif.stars = FALSE)

# load package management
library(pacman)

# load packages to be used later
p_load(tidyverse, lme4, sjPlot)

# load R data format
load("PisaCol.RData")

# variable names
names(PisaCol)

##   [1] "CNT"         "SUBNATIO"    "STRATUM"     "OECD"        "NC"         
##   [6] "SCHOOLID"    "StIDStd"     "ST01Q01"     "ST02Q01"     "ST03Q01"    
##  [11] "ST03Q02"     "ST04Q01"     "ST05Q01"     "ST06Q01"     "ST07Q01"    
##  [16] "ST07Q02"     "ST07Q03"     "ST08Q01"     "ST09Q01"     "ST115Q01"   
##  [21] "ST11Q01"     "ST11Q02"     "ST11Q03"     "ST11Q04"     "ST11Q05"    
##  [26] "ST11Q06"     "ST13Q01"     "ST14Q01"     "ST14Q02"     "ST14Q03"    
##  [31] "ST14Q04"     "ST15Q01"     "ST17Q01"     "ST18Q01"     "ST18Q02"    
##  [36] "ST18Q03"     "ST18Q04"     "ST19Q01"     "ST20Q01"     "ST20Q02"    
##  [41] "ST20Q03"     "ST21Q01"     "ST25Q01"     "ST26Q01"     "ST26Q02"    
##  [46] "ST26Q03"     "ST26Q04"     "ST26Q05"     "ST26Q06"     "ST26Q07"    
##  [51] "ST26Q08"     "ST26Q09"     "ST26Q10"     "ST26Q11"     "ST26Q12"    
##  [56] "ST26Q13"     "ST26Q14"     "ST26Q15"     "ST26Q16"     "ST26Q17"    
##  [61] "ST27Q01"     "ST27Q02"     "ST27Q03"     "ST27Q04"     "ST27Q05"    
##  [66] "ST28Q01"     "ST29Q01"     "ST29Q02"     "ST29Q03"     "ST29Q04"    
##  [71] "ST29Q05"     "ST29Q06"     "ST29Q07"     "ST29Q08"     "ST35Q01"    
##  [76] "ST35Q02"     "ST35Q03"     "ST35Q04"     "ST35Q05"     "ST35Q06"    
##  [81] "ST37Q01"     "ST37Q02"     "ST37Q03"     "ST37Q04"     "ST37Q05"    
##  [86] "ST37Q06"     "ST37Q07"     "ST37Q08"     "ST42Q01"     "ST42Q02"    
##  [91] "ST42Q03"     "ST42Q04"     "ST42Q05"     "ST42Q06"     "ST42Q07"    
##  [96] "ST42Q08"     "ST42Q09"     "ST42Q10"     "ST43Q01"     "ST43Q02"    
## [101] "ST43Q03"     "ST43Q04"     "ST43Q05"     "ST43Q06"     "ST44Q01"    
## [106] "ST44Q03"     "ST44Q04"     "ST44Q05"     "ST44Q07"     "ST44Q08"    
## [111] "ST46Q01"     "ST46Q02"     "ST46Q03"     "ST46Q04"     "ST46Q05"    
## [116] "ST46Q06"     "ST46Q07"     "ST46Q08"     "ST46Q09"     "ST48Q01"    
## [121] "ST48Q02"     "ST48Q03"     "ST48Q04"     "ST48Q05"     "ST49Q01"    
## [126] "ST49Q02"     "ST49Q03"     "ST49Q04"     "ST49Q05"     "ST49Q06"    
## [131] "ST49Q07"     "ST49Q09"     "ST53Q01"     "ST53Q02"     "ST53Q03"    
## [136] "ST53Q04"     "ST55Q01"     "ST55Q02"     "ST55Q03"     "ST55Q04"    
## [141] "ST57Q01"     "ST57Q02"     "ST57Q03"     "ST57Q04"     "ST57Q05"    
## [146] "ST57Q06"     "ST61Q01"     "ST61Q02"     "ST61Q03"     "ST61Q04"    
## [151] "ST61Q05"     "ST61Q06"     "ST61Q07"     "ST61Q08"     "ST61Q09"    
## [156] "ST62Q01"     "ST62Q02"     "ST62Q03"     "ST62Q04"     "ST62Q06"    
## [161] "ST62Q07"     "ST62Q08"     "ST62Q09"     "ST62Q10"     "ST62Q11"    
## [166] "ST62Q12"     "ST62Q13"     "ST62Q15"     "ST62Q16"     "ST62Q17"    
## [171] "ST62Q19"     "ST69Q01"     "ST69Q02"     "ST69Q03"     "ST70Q01"    
## [176] "ST70Q02"     "ST70Q03"     "ST71Q01"     "ST72Q01"     "ST73Q01"    
## [181] "ST73Q02"     "ST74Q01"     "ST74Q02"     "ST75Q01"     "ST75Q02"    
## [186] "ST76Q01"     "ST76Q02"     "ST77Q01"     "ST77Q02"     "ST77Q04"    
## [191] "ST77Q05"     "ST77Q06"     "ST79Q01"     "ST79Q02"     "ST79Q03"    
## [196] "ST79Q04"     "ST79Q05"     "ST79Q06"     "ST79Q07"     "ST79Q08"    
## [201] "ST79Q10"     "ST79Q11"     "ST79Q12"     "ST79Q15"     "ST79Q17"    
## [206] "ST80Q01"     "ST80Q04"     "ST80Q05"     "ST80Q06"     "ST80Q07"    
## [211] "ST80Q08"     "ST80Q09"     "ST80Q10"     "ST80Q11"     "ST81Q01"    
## [216] "ST81Q02"     "ST81Q03"     "ST81Q04"     "ST81Q05"     "ST82Q01"    
## [221] "ST82Q02"     "ST82Q03"     "ST83Q01"     "ST83Q02"     "ST83Q03"    
## [226] "ST83Q04"     "ST84Q01"     "ST84Q02"     "ST84Q03"     "ST85Q01"    
## [231] "ST85Q02"     "ST85Q03"     "ST85Q04"     "ST86Q01"     "ST86Q02"    
## [236] "ST86Q03"     "ST86Q04"     "ST86Q05"     "ST87Q01"     "ST87Q02"    
## [241] "ST87Q03"     "ST87Q04"     "ST87Q05"     "ST87Q06"     "ST87Q07"    
## [246] "ST87Q08"     "ST87Q09"     "ST88Q01"     "ST88Q02"     "ST88Q03"    
## [251] "ST88Q04"     "ST89Q02"     "ST89Q03"     "ST89Q04"     "ST89Q05"    
## [256] "ST91Q01"     "ST91Q02"     "ST91Q03"     "ST91Q04"     "ST91Q05"    
## [261] "ST91Q06"     "ST93Q01"     "ST93Q03"     "ST93Q04"     "ST93Q06"    
## [266] "ST93Q07"     "ST94Q05"     "ST94Q06"     "ST94Q09"     "ST94Q10"    
## [271] "ST94Q14"     "ST96Q01"     "ST96Q02"     "ST96Q03"     "ST96Q05"    
## [276] "ST101Q01"    "ST101Q02"    "ST101Q03"    "ST101Q05"    "ST104Q01"   
## [281] "ST104Q04"    "ST104Q05"    "ST104Q06"    "IC01Q01"     "IC01Q02"    
## [286] "IC01Q03"     "IC01Q04"     "IC01Q05"     "IC01Q06"     "IC01Q07"    
## [291] "IC01Q08"     "IC01Q09"     "IC01Q10"     "IC01Q11"     "IC02Q01"    
## [296] "IC02Q02"     "IC02Q03"     "IC02Q04"     "IC02Q05"     "IC02Q06"    
## [301] "IC02Q07"     "IC03Q01"     "IC04Q01"     "IC05Q01"     "IC06Q01"    
## [306] "IC07Q01"     "IC08Q01"     "IC08Q02"     "IC08Q03"     "IC08Q04"    
## [311] "IC08Q05"     "IC08Q06"     "IC08Q07"     "IC08Q08"     "IC08Q09"    
## [316] "IC08Q11"     "IC09Q01"     "IC09Q02"     "IC09Q03"     "IC09Q04"    
## [321] "IC09Q05"     "IC09Q06"     "IC09Q07"     "IC10Q01"     "IC10Q02"    
## [326] "IC10Q03"     "IC10Q04"     "IC10Q05"     "IC10Q06"     "IC10Q07"    
## [331] "IC10Q08"     "IC10Q09"     "IC11Q01"     "IC11Q02"     "IC11Q03"    
## [336] "IC11Q04"     "IC11Q05"     "IC11Q06"     "IC11Q07"     "IC22Q01"    
## [341] "IC22Q02"     "IC22Q04"     "IC22Q06"     "IC22Q07"     "IC22Q08"    
## [346] "EC01Q01"     "EC02Q01"     "EC03Q01"     "EC03Q02"     "EC03Q03"    
## [351] "EC03Q04"     "EC03Q05"     "EC03Q06"     "EC03Q07"     "EC03Q08"    
## [356] "EC03Q09"     "EC03Q10"     "EC04Q01A"    "EC04Q01B"    "EC04Q01C"   
## [361] "EC04Q02A"    "EC04Q02B"    "EC04Q02C"    "EC04Q03A"    "EC04Q03B"   
## [366] "EC04Q03C"    "EC04Q04A"    "EC04Q04B"    "EC04Q04C"    "EC04Q05A"   
## [371] "EC04Q05B"    "EC04Q05C"    "EC04Q06A"    "EC04Q06B"    "EC04Q06C"   
## [376] "EC05Q01"     "EC06Q01"     "EC07Q01"     "EC07Q02"     "EC07Q03"    
## [381] "EC07Q04"     "EC07Q05"     "EC08Q01"     "EC08Q02"     "EC08Q03"    
## [386] "EC08Q04"     "EC09Q03"     "EC10Q01"     "EC11Q02"     "EC11Q03"    
## [391] "EC12Q01"     "ST22Q01"     "ST23Q01"     "ST23Q02"     "ST23Q03"    
## [396] "ST23Q04"     "ST23Q05"     "ST23Q06"     "ST23Q07"     "ST23Q08"    
## [401] "ST24Q01"     "ST24Q02"     "ST24Q03"     "CLCUSE1"     "CLCUSE301"  
## [406] "CLCUSE302"   "Deffort"     "QuestID"     "Bookid"      "EASY"       
## [411] "AGE"         "GRADE"       "progn"       "ANXMAT"      "ATSCHL"     
## [416] "ATTLNACT"    "BELONG"      "BFMJ2"       "BMMJ1"       "CLSMAN"     
## [421] "COBN_F"      "COBN_M"      "COBN_S"      "COGACT"      "CULTDIST"   
## [426] "CULTPOS"     "DISCLIMA"    "ENTUSE"      "ESCS"        "EXAPPLM"    
## [431] "EXPUREM"     "FAILMAT"     "FAMCON"      "FAMCONC"     "FAMSTRUC"   
## [436] "fisced"      "HEDRES"      "HERITCUL"    "hisced"      "hisei"      
## [441] "HOMEPOS"     "HOMSCH"      "HOSTCUL"     "ICTATTNEG"   "ICTATTPOS"  
## [446] "ICTHOME"     "ICTSCH"      "IMMIG"       "INFOCAR"     "INFOJOB1"   
## [451] "INFOJOB2"    "INSTMOT"     "INTMAT"      "iscedd"      "iscedl"     
## [456] "iscedo"      "LANGCOMM"    "LANGN"       "LANGRPPD"    "LMINS"      
## [461] "MATBEH"      "MATHEFF"     "MATINTFC"    "MATWKETH"    "misced"     
## [466] "MMINS"       "MTSUP"       "OCOD1"       "OCOD2"       "OPENPS"     
## [471] "OUTHOURS"    "PARED"       "PERSEV"      "REPEAT"      "SCMAT"      
## [476] "SMINS"       "STUDREL"     "SUBNORM"     "TCHBEHFA"    "TCHBEHSO"   
## [481] "TCHBEHTD"    "TEACHSUP"    "TestLANG"    "TIMEINT"     "USEMATH"    
## [486] "USESCH"      "WEALTH"      "ANCATSCHL"   "ANCATTLNACT" "ANCBELONG"  
## [491] "ANCCLSMAN"   "ANCCOGACT"   "ANCINSTMOT"  "ANCINTMAT"   "ANCMATWKETH"
## [496] "ANCMTSUP"    "ANCSCMAT"    "ANCSTUDREL"  "ANCSUBNORM"  "PV1MATH"    
## [501] "PV2MATH"     "PV3MATH"     "PV4MATH"     "PV5MATH"     "PV1MACC"    
## [506] "PV2MACC"     "PV3MACC"     "PV4MACC"     "PV5MACC"     "PV1MACQ"    
## [511] "PV2MACQ"     "PV3MACQ"     "PV4MACQ"     "PV5MACQ"     "PV1MACS"    
## [516] "PV2MACS"     "PV3MACS"     "PV4MACS"     "PV5MACS"     "PV1MACU"    
## [521] "PV2MACU"     "PV3MACU"     "PV4MACU"     "PV5MACU"     "PV1MAPE"    
## [526] "PV2MAPE"     "PV3MAPE"     "PV4MAPE"     "PV5MAPE"     "PV1MAPF"    
## [531] "PV2MAPF"     "PV3MAPF"     "PV4MAPF"     "PV5MAPF"     "PV1MAPI"    
## [536] "PV2MAPI"     "PV3MAPI"     "PV4MAPI"     "PV5MAPI"     "PV1READ"    
## [541] "PV2READ"     "PV3READ"     "PV4READ"     "PV5READ"     "PV1SCIE"    
## [546] "PV2SCIE"     "PV3SCIE"     "PV4SCIE"     "PV5SCIE"     "W_FSTUWT"   
## [551] "W_FSTR1"     "W_FSTR2"     "W_FSTR3"     "W_FSTR4"     "W_FSTR5"    
## [556] "W_FSTR6"     "W_FSTR7"     "W_FSTR8"     "W_FSTR9"     "W_FSTR10"   
## [561] "W_FSTR11"    "W_FSTR12"    "W_FSTR13"    "W_FSTR14"    "W_FSTR15"   
## [566] "W_FSTR16"    "W_FSTR17"    "W_FSTR18"    "W_FSTR19"    "W_FSTR20"   
## [571] "W_FSTR21"    "W_FSTR22"    "W_FSTR23"    "W_FSTR24"    "W_FSTR25"   
## [576] "W_FSTR26"    "W_FSTR27"    "W_FSTR28"    "W_FSTR29"    "W_FSTR30"   
## [581] "W_FSTR31"    "W_FSTR32"    "W_FSTR33"    "W_FSTR34"    "W_FSTR35"   
## [586] "W_FSTR36"    "W_FSTR37"    "W_FSTR38"    "W_FSTR39"    "W_FSTR40"   
## [591] "W_FSTR41"    "W_FSTR42"    "W_FSTR43"    "W_FSTR44"    "W_FSTR45"   
## [596] "W_FSTR46"    "W_FSTR47"    "W_FSTR48"    "W_FSTR49"    "W_FSTR50"   
## [601] "W_FSTR51"    "W_FSTR52"    "W_FSTR53"    "W_FSTR54"    "W_FSTR55"   
## [606] "W_FSTR56"    "W_FSTR57"    "W_FSTR58"    "W_FSTR59"    "W_FSTR60"   
## [611] "W_FSTR61"    "W_FSTR62"    "W_FSTR63"    "W_FSTR64"    "W_FSTR65"   
## [616] "W_FSTR66"    "W_FSTR67"    "W_FSTR68"    "W_FSTR69"    "W_FSTR70"   
## [621] "W_FSTR71"    "W_FSTR72"    "W_FSTR73"    "W_FSTR74"    "W_FSTR75"   
## [626] "W_FSTR76"    "W_FSTR77"    "W_FSTR78"    "W_FSTR79"    "W_FSTR80"   
## [631] "WVARSTRR"    "VAR_UNIT"    "senwgt_STU"  "VER_STU"

# slect only four variables of interest and recode them
dta <- PisaCol %>%
  select(SCHOOLID, StIDStd, ANXMAT, PV2MATH) %>%
  mutate(School = factor(SCHOOLID), Student = factor(StIDStd),
         Anxiety = ifelse(ANXMAT > 0, 1, 0), Math = PV2MATH) %>%
  select(School, Student, ANXMAT, Anxiety, Math) %>%
  as.data.frame

# add school level variables  
dta <- dta %>% 
  group_by(School) %>% 
  mutate(sMath = scale(Math), smMath = mean(Math, na.rm = T)) %>%
  as.data.frame

# scaled mean school math
dta$ssmMath <- scale(dta$smMath)

# have a look
head(dta)

##    School Student ANXMAT Anxiety  Math   sMath smMath ssmMath
## 1 0000001   00001   -0.2       0 357.7  0.1071  351.3 -0.7216
## 2 0000001   00002     NA      NA 325.8 -0.4280  351.3 -0.7216
## 3 0000001   00003     NA      NA 459.2  1.8079  351.3 -0.7216
## 4 0000001   00004     NA      NA 335.1 -0.2714  351.3 -0.7216
## 5 0000001   00005     NA      NA 240.9 -1.8495  351.3 -0.7216
## 6 0000001   00006     NA      NA 390.6  0.6579  351.3 -0.7216

# data structure
str(dta)

## 'data.frame':    9073 obs. of  8 variables:
##  $ School : Factor w/ 352 levels "0000001","0000002",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ Student: Factor w/ 9073 levels "00001","00002",..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ ANXMAT : atomic  -0.2 NA NA NA NA NA 0.92 0.06 0.32 1.53 ...
##   ..- attr(*, "value.labels")= Named num 
##   .. ..- attr(*, "names")= chr 
##  $ Anxiety: num  0 NA NA NA NA NA 1 1 1 1 ...
##  $ Math   : atomic  358 326 459 335 241 ...
##   ..- attr(*, "value.labels")= Named num 
##   .. ..- attr(*, "names")= chr 
##  $ sMath  : num  0.107 -0.428 1.808 -0.271 -1.849 ...
##  $ smMath : num  351 351 351 351 351 ...
##  $ ssmMath: num [1:9073, 1] -0.722 -0.722 -0.722 -0.722 -0.722 ...
##   ..- attr(*, "scaled:center")= num 386
##   ..- attr(*, "scaled:scale")= num 47.8

# set black-n-whire plot theme
theme_set(theme_bw())

# math and anxiety in linear form
ggplot(dta, aes(Math, ANXMAT, group = School))+
  geom_point(alpha = .3, size = rel(.8)) +
  stat_ellipse(aes(group = 1)) +
  stat_smooth(method = "lm", se = F, size = rel(.5), alpha = .5, col="gray75") +
  stat_smooth(aes(group = 1), method = "lm", col = "gray35")+
  stat_smooth(aes(smMath, ANXMAT, group = 1), method = "lm", col = "gray15")+
  geom_hline(yintercept = mean(dta$ANXMAT, na.rm = T), linetype = "dashed") +
  geom_vline(xintercept = mean(dta$Math, na.rm = T), linetype = "dashed")+
  labs(x = "Math score", y = "Anxity score")

## Warning: Removed 3381 rows containing non-finite values (stat_ellipse).

## Warning: Removed 3381 rows containing non-finite values (stat_smooth).

## Warning: Removed 3381 rows containing non-finite values (stat_smooth).

## Warning: Removed 3381 rows containing non-finite values (stat_smooth).

## Warning: Removed 3381 rows containing missing values (geom_point).

# math and anxiety in yes-no form
ggplot(dta, aes(sMath, Anxiety, group = School)) +
  geom_point(alpha = .3, size = rel(.8)) +
  stat_smooth(method = "glm", method.args = list(family = binomial), se = F,
              size = rel(.4), alpha = .3, col = "gray75")+
  stat_smooth(aes(group = 1), method = "glm", 
              method.args = list(family = binomial), col = "gray35")+
  stat_smooth(aes(ssmMath, Anxiety, group = 1), method = "glm", 
              method.args = list(family = binomial), col = "gray15")+
  geom_hline(yintercept = .5, linetype = "dashed") +
  geom_vline(xintercept = 0, linetype = "dashed")+
  labs(x = "Math score (individual or school mean)", y = "Anxiety (Yes, No)") +
  theme(legend.position = "NONE")

## Warning: Removed 3386 rows containing non-finite values (stat_smooth).

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: Removed 3386 rows containing non-finite values (stat_smooth).

## Warning: Removed 3381 rows containing non-finite values (stat_smooth).

## Warning: Removed 3386 rows containing missing values (geom_point).

##
# account for school variation only with an individual level predictor
m0 <- glmer(Anxiety ~ sMath + (1 | School), data = dta, 
            family = binomial, na.action = na.omit)


# add a school-level predictor
m1 <- glmer(Anxiety ~ sMath + ssmMath + (1 | School), data = dta, 
            family = binomial, na.action = na.omit)

# model comparisons
anova(m0, m1)

## Data: dta
## Models:
## m0: Anxiety ~ sMath + (1 | School)
## m1: Anxiety ~ sMath + ssmMath + (1 | School)
##    Df  AIC  BIC logLik deviance Chisq Chi Df Pr(>Chisq)
## m0  3 6115 6135  -3054     6109                        
## m1  4 6067 6093  -3029     6059  49.7      1    1.8e-12

##
# model plots and diagnostics
# plot qq-plot of random effects nomality
# sjPlot package - not very good still
#
sjp.glmer(m1, type = "re.qq")

## Testing for normal distribution. Dots should be plotted along the line.

# plot random-effects
plot_model(m1, type = "re")

# plot fixed-effect parameters with CIs
plot_model(m1, type = "est")

###

whether a pupil has ever repeated a class at least one grade

#
# logistic regression with random effects
#
pacman::p_load(tidyverse, lme4)

#
dta = read.table("http://140.116.183.121/~sheu/lmm/Data/grade_retention.csv",sep = ',',  h = T)

str(dta)

## 'data.frame':    8582 obs. of  6 variables:
##  $ SID     : Factor w/ 411 levels "S100101","S100102",..: 37 37 37 37 37 37 37 37 37 37 ...
##  $ PID     : Factor w/ 8582 levels "S100101P1","S100101P10",..: 662 673 674 675 676 677 678 679 680 663 ...
##  $ Gender  : Factor w/ 2 levels "Female","Male": 1 1 1 1 1 1 1 1 1 1 ...
##  $ PPE     : Factor w/ 2 levels "No","Yes": 2 2 2 2 2 2 2 2 2 2 ...
##  $ SES     : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ Repeated: int  0 0 0 0 0 0 0 0 0 0 ...

head(dta)

##      SID      PID Gender PPE SES Repeated
## 1 S10101 S10101P1 Female Yes  NA        0
## 2 S10101 S10101P2 Female Yes  NA        0
## 3 S10101 S10101P3 Female Yes  NA        0
## 4 S10101 S10101P4 Female Yes  NA        0
## 5 S10101 S10101P5 Female Yes  NA        0
## 6 S10101 S10101P6 Female Yes  NA        0

#
with(dta, ftable(Gender, Repeated))

##        Repeated    0    1
## Gender                   
## Female          3750  495
## Male            3587  750

with(dta, ftable(PPE, Repeated))

##     Repeated    0    1
## PPE                   
## No           3476  769
## Yes          3861  476

with(dta, ftable(PPE, Gender, Repeated))

##            Repeated    0    1
## PPE Gender                   
## No  Female          1775  306
##     Male            1701  463
## Yes Female          1975  189
##     Male            1886  287

dta <- dta %>% 
  group_by(SID) %>%
  mutate( pr_repeat = mean(Repeated)) %>%
  as.data.frame

# histogram of proportions of repeat across schools
lattice::histogram(~ tapply(dta$Repeated, dta$SID, mean), 
                   xlab = "Proprotion of repeat")

# set plot theme
ot <- theme_set(theme_bw())

#
ggplot(data = dta, aes(x = SES, y = Repeated)) +
  stat_summary(fun.data = "mean_se", geom = "pointrange", 
               size = rel(.3),col="gray30") +
  stat_smooth(method = "glm", method.args = "binomial") +
  facet_grid(PPE ~ Gender) +
  labs(x = "Mean SES", y = "Proportion of repeat")

## Warning: Removed 1066 rows containing non-finite values (stat_summary).

## Warning: Removed 1066 rows containing non-finite values (stat_smooth).

## Warning: Removed 16 rows containing missing values (geom_pointrange).

# ignoring clusters
summary(m0 <- glm(Repeated ~ SES + Gender + PPE, data = dta, 
                  family = binomial))

## 
## Call:
## glm(formula = Repeated ~ SES + Gender + PPE, family = binomial, 
##     data = dta)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -0.735  -0.579  -0.519  -0.421   2.315  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)
## (Intercept)  -1.7832     0.0588  -30.34  < 2e-16
## SES          -0.2415     0.0937   -2.58     0.01
## GenderMale    0.4278     0.0676    6.32  2.5e-10
## PPEYes       -0.5688     0.0704   -8.08  6.6e-16
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 6140.8  on 7515  degrees of freedom
## Residual deviance: 6009.5  on 7512  degrees of freedom
##   (1066 observations deleted due to missingness)
## AIC: 6017
## 
## Number of Fisher Scoring iterations: 4

# school as unit of cluster
summary(m1 <- glmer(Repeated ~ SES + Gender + PPE + (1 | SID), data = dta, 
                    family = binomial))

## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: Repeated ~ SES + Gender + PPE + (1 | SID)
##    Data: dta
## 
##      AIC      BIC   logLik deviance df.resid 
##     5457     5492    -2724     5447     7511 
## 
## Scaled residuals: 
##    Min     1Q Median     3Q    Max 
## -2.180 -0.403 -0.247 -0.168  6.092 
## 
## Random effects:
##  Groups Name        Variance Std.Dev.
##  SID    (Intercept) 1.62     1.27    
## Number of obs: 7516, groups:  SID, 356
## 
## Fixed effects:
##             Estimate Std. Error z value Pr(>|z|)
## (Intercept)  -2.2393     0.1058  -21.17  < 2e-16
## SES          -0.2969     0.2141   -1.39     0.17
## GenderMale    0.5340     0.0759    7.04  2.0e-12
## PPEYes       -0.6265     0.0999   -6.27  3.6e-10
## 
## Correlation of Fixed Effects:
##            (Intr) SES    GndrMl
## SES         0.058              
## GenderMale -0.433  0.005       
## PPEYes     -0.384 -0.108 -0.003

# individual as unit of cluster - slow
summary(m2 <- glmer(Repeated ~ SES + Gender + PPE + (1 | PID), 
                    data = dta, family = binomial))

## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: Repeated ~ SES + Gender + PPE + (1 | PID)
##    Data: dta
## 
##      AIC      BIC   logLik deviance df.resid 
##     3312     3346    -1651     3302     7511 
## 
## Scaled residuals: 
##      Min       1Q   Median       3Q      Max 
## -0.00308 -0.00234 -0.00207 -0.00164  0.08875 
## 
## Random effects:
##  Groups Name        Variance Std.Dev.
##  PID    (Intercept) 2291     47.9    
## Number of obs: 7516, groups:  PID, 7516
## 
## Fixed effects:
##             Estimate Std. Error z value Pr(>|z|)
## (Intercept)  -12.192      0.374   -32.6   <2e-16
## SES           -0.251      0.500    -0.5     0.62
## GenderMale     0.453      0.353     1.3     0.20
## PPEYes        -0.603      0.371    -1.6     0.10
## 
## Correlation of Fixed Effects:
##            (Intr) SES    GndrMl
## SES         0.135              
## GenderMale -0.559  0.022       
## PPEYes     -0.380 -0.258 -0.006

###

Binary exercise

Li

2017年12月2日

PISA data 2009 from Columbia

whether a pupil has ever repeated a class at least one grade