
archival data analysis often means initial questions are reformulated to fit the data and the data are reworked in coding and recoding to fit the question better
Elder, Clipp, and Colerick (1993, p. 2)
Types of Weights:
The replicate-weight approach to estimating standard errors computes the standard deviation of the estimated summary across many partially independent subsets of the one sample, and extrapolates from this to the standard deviation between completely independent samples.
Lumley, Thomas. Complex Surveys: A Guide to Analysis Using R (Wiley Series in Survey Methodology) (Kindle Locations 797-800). Wiley. Kindle Edition.
Point: \(\sum_{R=1}^MA_r/M\)
Standard Error: \(\sqrt{ \frac{4}{M} \times \sum_{R=1}^{M} (A_r - \bar{A})^2 }\)
In PISA, the sampling variances estimated with multi-level models will always be greater than the sampling variances estimated with Fay replicate samples
Jerrim et al. (2017) - Not using the right weights is worst (non-trivial differences) - Not using replication weights is bad (wrong standard errors) - Not using plausible values is not really a big deal (PISA admits as much) - In psych and Education we tend to flip the importance of these
Normal R code can get quite messy
## [1] 6.588
Tidycode provides pipes which clean things up alot by passing objects from the left to the right.
suppressPackageStartupMessages(library(tidyverse))
iris %>%
filter(Species == 'virginica') %>%
summarise(m = mean(Sepal.Length))## m
## 1 6.588
TITLE: Example;
DATA: FILE = ~/Dropbox/Projects_Research/BFLPE_glassFloor/world/imputation1.txt;
TYPE = imputation;
VARIABLE:
NAMES = uniqueID W_FSTUWT CNT SCHOOLID SCMAT
W_FSTR6 W_FSTR7 W_FSTR8 W_FSTR9 W_FSTR10 W_FSTR11 W_FSTR12 W_FSTR13 W_FSTR14
W_FSTR15 W_FSTR16 W_FSTR17 W_FSTR18 W_FSTR19 W_FSTR20 W_FSTR21 W_FSTR22
W_FSTR23 W_FSTR24 W_FSTR25 W_FSTR26 W_FSTR27 W_FSTR28 W_FSTR29 W_FSTR30
W_FSTR31 W_FSTR32 W_FSTR33 W_FSTR34 W_FSTR35 W_FSTR36 W_FSTR37 W_FSTR38
W_FSTR39 W_FSTR40 W_FSTR41 W_FSTR42 W_FSTR43 W_FSTR44 W_FSTR45 W_FSTR46
W_FSTR47 W_FSTR48 W_FSTR49 W_FSTR50 W_FSTR51 W_FSTR52 W_FSTR53 W_FSTR54
W_FSTR55 W_FSTR56 W_FSTR57 W_FSTR58 W_FSTR59 W_FSTR60 W_FSTR61 W_FSTR62
W_FSTR63 W_FSTR64 W_FSTR65 W_FSTR66 W_FSTR67 W_FSTR68 W_FSTR69 W_FSTR70
W_FSTR71 W_FSTR72 W_FSTR73 W_FSTR74 W_FSTR75 W_FSTR76 W_FSTR77 W_FSTR78
W_FSTR79 W_FSTR80;
MISSING=.;
USEVARIABLES = MATH SCMAT;
WEIGHT = W_FSTUWT;
REPWEIGHTS = W_FSTR1-W_FSTR80;
ANALYSIS: REPSE = FAY (.5); TYPE = COMPLEX;
MODEL:
SCMAT ON MATH
OUTPUT: STDY;
## Required ---
library(survey) # the core of everything we will do today
library(mitools) # needed for working with plausible values
## optional but makes life easier ---
library(tidyverse) # a suite of tools to make coding easier
library(readit) # Figures out what sort of file you are giving it and read in the data
library(srvyr) # tidyverse for survey data
library(svyPVpack) # convience functions for survey data and plausible values
## For integrating data from statistical agencies ---
library(OECD) # grabs OECD data from the website
##F or analysis in parallel ---
library(furrr) # needed to easily set up and run models in parallel
plan(multiprocess) # prepare to run things in parallel
##Pretty graphs ---
library(ggthemes) # emulate the style of professional data vis
library(wesanderson) # colour pallets that work well togetherd2015 <- readit("~/Dropbox/Databases/WORLD/PUF_SAS_COMBINED_CMB_STU_QQQ/cy6_ms_cmb_stu_qqq.sas7bdat") %>% # load file
filter(CNT == 'AUS') %>% # Lets just use one country
rename(GENDER = ST004D01T) %>% #give us better names to work with
mutate(GENDER = ifelse(GENDER == 1, 'girl', 'boy')) # give us meaningful names## File guessed to be .sas7b*at (SAS) ("~/Dropbox/Databases/WORLD/PUF_SAS_COMBINED_CMB_STU_QQQ/cy6_ms_cmb_stu_qqq.sas7bdat")
d2015_survey %>%
group_by(GENDER) %>%
summarise(un = unweighted(n()), n = survey_total(),
m_scie = survey_mean(PV1SCIE))## # A tibble: 2 x 6
## GENDER un n n_se m_scie m_scie_se
## <chr> <int> <dbl> <dbl> <dbl> <dbl>
## 1 boy 7367 129177. 1852. 511. 1.89
## 2 girl 7163 127152. 1768. 509. 1.67
## [,1]
## PV1MATH 493.897223
## PV1MATH_se 1.297406
## PV2MATH 494.197351
## PV2MATH_se 1.370186
## PV3MATH 492.751782
## PV3MATH_se 1.289973
## PV4MATH 495.233790
## PV4MATH_se 1.286929
## PV5MATH 493.327689
## PV5MATH_se 1.207050
## PV6MATH 494.998300
## PV6MATH_se 1.319129
## PV7MATH 493.212584
## PV7MATH_se 1.346404
## PV8MATH 492.840689
## PV8MATH_se 1.323154
## PV9MATH 494.063334
## PV9MATH_se 1.413232
## [1] 494.0883
## [1] 1.370186
## mean SE
## PV2MATH 494.2 1.3702
## [1] TRUE
## GENDER
## boy girl
## 129176.8 127152.3
## GENDER
## boy girl
## 50.39489 49.60511

# Matches 2015 PISA Report
svyPVpm(by = ~CNT, d2015_survey, pvs = paste0("PV", 1:10, "SCIE")) %>%
select(-ends_with("SE"),-Percent, -SE.Percent, -CNT, -Group1) # p 44## Number.of.cases Sum.of.weights PVSCIE_mean PVSCIE_stddev
## 1 14530 256329.1 509.9939 102.2974
svyPVpm(by = ~CNT, d2015_survey, pvs = paste0("PV", 1:10, "READ")) %>%
select(-ends_with("SE"),-Percent, -SE.Percent, -CNT, -Group1) # p 44## Number.of.cases Sum.of.weights PVREAD_mean PVREAD_stddev
## 1 14530 256329.1 502.9006 102.6853
# Matches 2015 PISA Report
svyPVpm(by = ~CNT, d2015_survey, pvs = paste0("PV", 1:10, "MATH"))%>%
select(-ends_with("SE"),-Percent, -SE.Percent, -CNT, -Group1) # p 44## Number.of.cases Sum.of.weights PVMATH_mean PVMATH_stddev
## 1 14530 256329.1 493.8962 93.05742
## $coef
## mean se t.value Pr.t
## (Intercept) 500.40362 1.454908 343.94166 < 2.22e-16
## ESCS 43.77763 1.450180 30.18773 < 2.22e-16
##
## $mod.fit
## Working.2.logLR p
## 1398.489 < 2.22e-16
pvs <- list()
mis <- list()
K = paste0("PV",1:10, "MATH")
for (i in 1:10){
mis[[i]] <- d2015 %>%
select(matches(paste0("PV", i, "[A-Z]+") ), ESCS,
GENDER, W_FSTUWT, starts_with("ST"),
starts_with("W_FSTURWT"), CNT)
names(mis[[i]]) <- gsub("PV[0-9]+", "", names(mis[[i]]))
pvs[[i]] <- mis[[i]] %>%
as_survey(type = "Fay", repweights = starts_with("W_FSTURWT"), weights = W_FSTUWT, rho = .5)
}## Multiple imputation results:
## with(mis, svymean(~SCIE, na.rm = TRUE))
## MIcombine.default(.)
## results se (lower upper) missInfo
## SCIE 509.9939 1.522534 507.0055 512.9822 10 %
## Multiple imputation results:
## MIcombine.default(.)
## results se (lower upper) missInfo
## SCIE 509.9939 1.522534 507.0055 512.9822 10 %
## [1] "<this academic year>: My parents are interested in my school activities."
pvs %>%
future_map(~ svyolr(as.factor(ST123Q01NA)~scale(SCIE), design = .)) %>%
MIcombine() %>%
summary## Multiple imputation results:
## MIcombine.default(.)
## results se (lower upper) missInfo
## scale(SCIE) 0.34697256 0.02310095 0.30157902 0.3923661 14 %
## 1|2 -4.05616768 0.07619332 -4.20550387 -3.9068315 0 %
## 2|3 -2.77425205 0.03901448 -2.85071904 -2.6977851 0 %
## 3|4 -0.02173521 0.01999755 -0.06092982 0.0174594 0 %
library(lavaan)
library(lavaan.survey)
M1 <- '
EMOSUPS =~ ST123Q01NA + ST123Q02NA + ST123Q03NA + ST123Q04NA
INSTSCIE =~ ST113Q01TA + ST113Q02TA + ST113Q03TA + ST113Q04TA
'
simple.fit <- cfa(M1, data=d2015)
summary(simple, standardized = TRUE, fit.measures = TRUE)
survey.fit <- lavaan.survey(lavaan.fit=simple.fit, survey.design=d2015_survey)
summary(survey.fit, standardized = TRUE, fit.measures = TRUE)
library(quantreg)
withReplicates(d2015_survey, quote(coef(rq(INSTSCIE~scale(PV1SCIE),
tau=0.2, weights=.weights))))## theta SE
## (Intercept) -0.7778 0.0160
## scale(PV1SCIE) 0.0000 0.0358
## theta SE
## (Intercept) 1.09162 0.0260
## scale(PV1SCIE) 0.34304 0.0132
#At top proformance band
M1 <- with(mis, svytable(~I(MATH>707.93), Ntotal=100)) %>% unlist
mean(M1[c(T,F)]); mean(M1[c(F,T)])## [1] 99.17202
## [1] 0.8279772