knitr::opts_chunk$set(echo = TRUE)
Preliminaries - set directory, install packages, load helper functions ————
# install.packages('Hmisc'); require(Hmisc)
# install.packages(Cs(haven, plyr, magrittr, dplyr, data.table, survey, epitools, mitools))
# install.packages(Cs(readr, foreign, SAScii, multcomp))
# datasets and scripts needed for the program are
# dash r program, dash html output
# basic functions.r
# yrbs.rds
# yrbs_2013_subset.rds
# 2013_SADC_SAS_Input_Program.sas.txt
library(SAScii)
library(multcomp)
library(haven); library(plyr); library(magrittr); library(dplyr)
library(data.table)
library(survey);
library(Hmisc)
library(epitools)
library(SAScii)
# .loc is "c:/users/dxf1/dropbox/R/" (windows)
# in windows, I set this in .Rprofile of C:\Users\dxf1\R-libs-pgrms\R-3.2.3\etc
# .loc on mac is '~/Dropbox/R/'
# (dir_dash <- "//cdc.gov/project/CDC_ATSDR_RUG/R_Demonstrations/YRBSS/")
# setwd(dir_dash)
getwd()
source('basic functions.r')
.loc
## [1] "~/Dropbox/R/"
# .loc is "c:/users/dxf1/dropbox/R/" (windows)
# in windows, I set this in .Rprofile of C:\Users\dxf1\R-libs-pgrms\R-3.2.3\etc
# .loc on mac is '~/Dropbox/R/'
# (dir_dash <- "//cdc.gov/project/CDC_ATSDR_RUG/R_Demonstrations/YRBSS/")
# setwd(dir_dash)
(dir_dash <- paste0(.loc,'Statistics/Survey/2016_Dash_presentation/'));
## [1] "~/Dropbox/R/Statistics/Survey/2016_Dash_presentation/"
setwd(dir_dash)
source('basic functions.r')
#----------------------------
# I used haven in the readr package to read the sas dataset ----------------
# dorig <- read_sas('sadc_2013_national.sas7bdat') %>% setDT# 7s home
# saveRDS(dorig, 'yrbs.rds')
#-----------------------------
dorig <- readRDS('yrbs.rds')
dim(dorig)
## [1] 173274 276
table(dorig$year)
##
## 1991 1993 1995 1997 1999 2001 2003 2005 2007 2009 2011 2013
## 12272 16296 10904 16262 15349 13601 15214 13917 14041 16410 15425 13583
d2013 <- subset(dorig, year==2013) # base
dim(d2013) # 13,500
## [1] 13583 276
# saveRDS(dsub, 'yrbs_2013_subset.rds')
d2013 <- readRDS('yrbs_2013_subset.rds')
Read non-SAS files ————————————————–
Load Packages for reading external files library(readr) library(SAScii) library(foreign)
DAT file the input statement is the first line begin with ‘infile datain lrecl=800;’ end with ‘;’ on separate line
.dic <- parse.SAScii('2013_SADC_SAS_Input_Program.sas.txt')
setDT(.dic); .dic # convert to data.table
## varname width char divisor
## 1: SITECODE 5 TRUE 1
## 2: SITENAME 50 TRUE 1
## 3: SITETYPE 50 TRUE 1
## 4: SITETYPENUM 8 FALSE 1
## 5: YEAR 8 FALSE 1
## ---
## 272: QNSPORTSDRINK 3 FALSE 1
## 273: QNENERGYDRINK 3 FALSE 1
## 274: QNSUGARDRINK 3 FALSE 1
## 275: QNWATER 3 FALSE 1
## 276: QNFASTFOOD 3 FALSE 1
# test <- read_fwf('sadc_2013_national.dat',
# fwf_widths(dput(.dic[,width]), col_names=(dput(.dic[,varname]))),
# progress = interactive()
# )
readr package also has read_delim data.table package has fread for large csv files (very fast) readxl for excel files
foreign package has read.dta for stata files read.xpt for sas transport files
# saveRDS(dsub,'yrbs.rds') # save it in R format
# Check dataset and data management ---------------------------------------
names(d2013) %>% head(30)
## [1] "sitecode" "sitename" "sitetype" "sitetypenum" "year"
## [6] "survyear" "weight" "stratum" "PSU" "record"
## [11] "age" "sex" "grade" "race4" "race7"
## [16] "stheight" "stweight" "bmi" "bmipct" "qnobese"
## [21] "qnowt" "qsexid" "qsexpart" "sexid" "sexid2"
## [26] "sexpart" "sexpart2" "q8" "q9" "q10"
names(d2013) <- tolower(names(d2013));
# helper 'Cs' function in Hmisc
vars <- Cs(stratum,psu,sex, bmipct,bmi,sex, race4, race7, age,stweight, weight,
grade, q45, qn43)
vars <- unique(vars); vars
## [1] "stratum" "psu" "sex" "bmipct" "bmi" "race4"
## [7] "race7" "age" "stweight" "weight" "grade" "q45"
## [13] "qn43"
vars[vars %nin% names(d2013)] # are any variables NOT in d2013 ?
## character(0)
d <- subset(d2013, select=vars)
dim(d2013); ncol(d)
## [1] 13583 276
## [1] 13
setDT(d) # make the data.frame into a (souped up) data.table
# https://cran.r-project.org/web/packages/data.table/vignettes/datatable-intro-vignette.html
glimpse(d)
## Observations: 13,583
## Variables: 13
## $ stratum (dbl) 103, 112, 201, 212, 212, 112, 201, 213, 214, 214, 201...
## $ psu (dbl) 280490, 171630, 180270, 120570, 120570, 171630, 90010...
## $ sex (dbl) NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 2, 2,...
## $ bmipct (dbl) NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ bmi (dbl) NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ race4 (dbl) NA, NA, NA, NA, NA, NA, NA, 2, 1, 2, 1, NA, NA, NA, N...
## $ race7 (dbl) NA, NA, NA, NA, NA, NA, NA, 3, 6, 3, 6, NA, NA, NA, N...
## $ age (dbl) NA, NA, NA, NA, NA, NA, NA, 4, 5, 6, 6, 7, NA, NA, NA...
## $ stweight (dbl) NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ weight (dbl) 0.3906, 0.6403, 0.8803, 1.5564, 1.1280, 0.3536, 2.086...
## $ grade (dbl) NA, NA, NA, NA, NA, NA, NA, 2, 2, 3, 3, 4, NA, 2, 2, ...
## $ q45 (chr) "1", "1", "1", "4", "1", "8", "1", "2", "3", "7", "1"...
## $ qn43 (dbl) 2, 2, 2, 1, 2, 1, 2, NA, 1, 1, 2, NA, 2, 2, 2, 2, 1, ...
str(d)
## Classes 'data.table' and 'data.frame': 13583 obs. of 13 variables:
## $ stratum : num 103 112 201 212 212 112 201 213 214 214 ...
## $ psu : num 280490 171630 180270 120570 120570 ...
## $ sex : num NA NA NA NA NA NA NA NA NA NA ...
## $ bmipct : num NA NA NA NA NA NA NA NA NA NA ...
## $ bmi : num NA NA NA NA NA NA NA NA NA NA ...
## $ race4 : num NA NA NA NA NA NA NA 2 1 2 ...
## $ race7 : num NA NA NA NA NA NA NA 3 6 3 ...
## $ age : num NA NA NA NA NA NA NA 4 5 6 ...
## $ stweight: num NA NA NA NA NA NA NA NA NA NA ...
## $ weight : num 0.391 0.64 0.88 1.556 1.128 ...
## $ grade : num NA NA NA NA NA NA NA 2 2 3 ...
## $ q45 : chr "1" "1" "1" "4" ...
## $ qn43 : num 2 2 2 1 2 1 2 NA 1 1 ...
## - attr(*, ".internal.selfref")=<externalptr>
summary(d)
## stratum psu sex bmipct
## Min. :101 Min. : 10970 Min. :1.000 Min. : 0.00
## 1st Qu.:103 1st Qu.: 60890 1st Qu.:1.000 1st Qu.:42.53
## Median :113 Median :181470 Median :2.000 Median :69.16
## Mean :154 Mean :219788 Mean :1.512 Mean :63.51
## 3rd Qu.:211 3rd Qu.:290770 3rd Qu.:2.000 3rd Qu.:88.97
## Max. :214 Max. :530330 Max. :2.000 Max. :99.95
## NA's :12 NA's :1003
## bmi race4 race7 age
## Min. :13.11 Min. :1.000 Min. :1.000 Min. :1.000
## 1st Qu.:20.30 1st Qu.:1.000 1st Qu.:3.000 1st Qu.:4.000
## Median :22.49 Median :2.000 Median :4.000 Median :5.000
## Mean :23.64 Mean :2.061 Mean :4.659 Mean :5.157
## 3rd Qu.:25.77 3rd Qu.:3.000 3rd Qu.:6.000 3rd Qu.:6.000
## Max. :55.01 Max. :4.000 Max. :7.000 Max. :7.000
## NA's :1003 NA's :318 NA's :318 NA's :77
## stweight weight grade q45
## Min. : 29.94 Min. :0.0286 Min. :1.000 Length:13583
## 1st Qu.: 56.25 1st Qu.:0.4470 1st Qu.:1.000 Class :character
## Median : 64.41 Median :0.7927 Median :3.000 Mode :character
## Mean : 67.91 Mean :1.0000 Mean :2.498
## 3rd Qu.: 76.20 3rd Qu.:1.2222 3rd Qu.:4.000
## Max. :180.99 Max. :9.2349 Max. :4.000
## NA's :1004 NA's :102
## qn43
## Min. :1.000
## 1st Qu.:1.000
## Median :2.000
## Mean :1.644
## 3rd Qu.:2.000
## Max. :2.000
## NA's :1295
describe(d[,.(sex,qn43,q45,grade,stweight,age)]) # Hmisc package
## d[, .(sex, qn43, q45, grade, stweight, age)]
##
## 6 Variables 13583 Observations
## ---------------------------------------------------------------------------
## sex
## n missing unique Info Mean
## 13571 12 2 0.75 1.512
##
## 1 (6621, 49%), 2 (6950, 51%)
## ---------------------------------------------------------------------------
## qn43
## n missing unique Info Mean
## 12288 1295 2 0.69 1.644
##
## 1 (4373, 36%), 2 (7915, 64%)
## ---------------------------------------------------------------------------
## q45
## n missing unique
## 12363 1220 8
##
## 1 2 3 4 5 6 7 8
## Frequency 8028 1560 475 338 433 538 316 675
## % 65 13 4 3 4 4 3 5
## ---------------------------------------------------------------------------
## grade
## n missing unique Info Mean
## 13481 102 4 0.94 2.498
##
## 1 (3588, 27%), 2 (3152, 23%), 3 (3184, 24%), 4 (3557, 26%)
## ---------------------------------------------------------------------------
## stweight
## n missing unique Info Mean .05 .10 .25 .50
## 12579 1004 239 1 67.91 46.72 49.90 56.25 64.41
## .75 .90 .95
## 76.20 90.72 100.25
##
## lowest : 29.94 31.75 33.11 34.02 34.93
## highest: 156.95 158.76 160.12 163.30 180.99
## ---------------------------------------------------------------------------
## age
## n missing unique Info Mean
## 13506 77 7 0.95 5.157
##
## 1 2 3 4 5 6 7
## Frequency 26 18 1368 3098 3203 3473 2320
## % 0 0 10 23 24 26 17
## ---------------------------------------------------------------------------
univar(d) # my function with lots of help
## N.val N.mis Mean SD Min Med Skew Kurt P90
## stratum 13583 0 153.98 51.0 101.0 113.0 0.1 -1.9 213
## psu 13583 0 219787.55 151316.8 10970.0 181470.0 0.6 -0.7 510590
## sex 13571 12 1.51 0.5 1.0 2.0 0.0 -2.0 2
## bmipct 12580 1003 63.51 28.6 0.0 69.2 -0.6 -0.8 97
## bmi 12580 1003 23.64 5.0 13.1 22.5 1.5 3.3 30
## race4 13265 318 2.06 1.0 1.0 2.0 0.4 -1.1 4
## race7 13265 318 4.66 1.5 1.0 4.0 -0.2 -1.2 6
## age 13506 77 5.16 1.3 1.0 5.0 -0.2 -0.9 7
## stweight 12579 1004 67.91 16.9 29.9 64.4 1.2 2.3 91
## weight 13583 0 1.00 0.9 0.0 0.8 3.0 13.5 2
## grade 13481 102 2.50 1.1 1.0 3.0 0.0 -1.4 4
## qn43 12288 1295 1.64 0.5 1.0 2.0 -0.6 -1.6 2
## Max
## stratum 214
## psu 530330
## sex 2
## bmipct 100
## bmi 55
## race4 4
## race7 7
## age 7
## stweight 181
## weight 9
## grade 4
## qn43 2
# check survey design
count(d,stratum)
## Source: local data table [13 x 2]
##
## stratum n
## (dbl) (int)
## 1 103 1151
## 2 112 914
## 3 201 1597
## 4 212 791
## 5 213 1252
## 6 214 606
## 7 211 887
## 8 101 1834
## 9 111 1049
## 10 113 1004
## 11 102 1222
## 12 202 450
## 13 203 826
count(d,psu);
## Source: local data table [54 x 2]
##
## psu n
## (dbl) (int)
## 1 280490 428
## 2 171630 327
## 3 180270 258
## 4 120570 247
## 5 90010 73
## 6 60730 607
## 7 60372 207
## 8 120860 399
## 9 51430 371
## 10 10970 337
## .. ... ...
count(d,stratum,psu, sort=T)
## Source: local data table [54 x 3]
## Groups: stratum
##
## stratum psu n
## (dbl) (dbl) (int)
## 1 101 210190 285
## 2 101 210530 279
## 3 101 290770 276
## 4 101 511610 266
## 5 101 181770 262
## 6 101 260770 230
## 7 101 390170 162
## 8 101 340090 74
## 9 102 510870 420
## 10 102 516800 409
## .. ... ... ...
# data.table
d[,.N,keyby=.(stratum,psu)][1:20]
## stratum psu N
## 1: 101 181770 262
## 2: 101 210190 285
## 3: 101 210530 279
## 4: 101 260770 230
## 5: 101 290770 276
## 6: 101 340090 74
## 7: 101 390170 162
## 8: 101 511610 266
## 9: 102 121110 187
## 10: 102 281210 206
## 11: 102 510870 420
## 12: 102 516800 409
## 13: 103 10970 337
## 14: 103 280490 428
## 15: 103 370510 386
## 16: 111 130130 196
## 17: 111 180630 132
## 18: 111 211170 114
## 19: 111 271630 305
## 20: 111 340390 302
d[,.N,keyby=.(psu,stratum)][1:20]
## psu stratum N
## 1: 10970 103 337
## 2: 40131 212 187
## 3: 51430 201 371
## 4: 60010 212 242
## 5: 60290 203 585
## 6: 60372 214 207
## 7: 60530 203 241
## 8: 60670 212 115
## 9: 60710 213 379
## 10: 60730 213 607
## 11: 60890 201 175
## 12: 80750 202 316
## 13: 90010 201 73
## 14: 120570 212 247
## 15: 120690 211 194
## 16: 120860 214 399
## 17: 121110 102 187
## 18: 130130 111 196
## 19: 131510 113 185
## 20: 160510 201 259
# the PSUs have different names across the strata (unlike NHANES)
# Create new variables ----------------------------------------------------
# *current alcohol use for logistic;
# if qn43=1 then ql43=1;
# else if qn43=2 then ql43=0;
# label sex="1=m 2=f"
# binge="4+ for fem, 5+ for male";
# if qn43=1 then ql43=1;
# else if qn43=2 then ql43=0;
sapply(d,class)
## stratum psu sex bmipct bmi race4
## "numeric" "numeric" "numeric" "numeric" "numeric" "numeric"
## race7 age stweight weight grade q45
## "numeric" "numeric" "numeric" "numeric" "numeric" "character"
## qn43
## "numeric"
d <- upData(d) # converts nummeric (length=8) to integer
## Input object size: 1416144 bytes; 13 variables 13583 observations
## New object size: 981520 bytes; 13 variables 13583 observations
sapply(d,class)
## stratum psu sex bmipct bmi race4
## "integer" "integer" "integer" "numeric" "numeric" "integer"
## race7 age stweight weight grade q45
## "integer" "integer" "numeric" "numeric" "integer" "character"
## qn43
## "integer"
ds <- d
d <- mutate(ds, q45=as.integer(q45), sex=as.integer(sex)-1, # not needed if 'upData' is used
sexc=factor(sex, labels=c('males','females')),
ql43=ifelse(qn43==2,0, qn43),
ob=ifelse(bmipct>=95, 1,
ifelse(!is.na(bmipct),0,NA) ),
racec=factor(race4,levels=c(1:4),
c('white','black','hispanic','other')) ) %>%
rename(wt=stweight)
# I took a guess at how race4 was coded !!
with(d,table(racec,race7, useNA='always'))
## race7
## racec 1 2 3 4 5 6 7 <NA>
## white 0 0 0 0 0 5449 0 0
## black 0 0 2993 0 0 0 0 0
## hispanic 0 0 0 3395 0 0 0 0
## other 121 491 0 0 135 0 681 0
## <NA> 0 0 0 0 0 0 0 318
count(d,racec,race4,race7, sort=T)
## Source: local data table [8 x 4]
## Groups: racec, race4
##
## racec race4 race7 n
## (fctr) (int) (int) (int)
## 1 white 1 6 5449
## 2 black 2 3 2993
## 3 hispanic 3 4 3395
## 4 other 4 7 681
## 5 other 4 2 491
## 6 other 4 5 135
## 7 other 4 1 121
## 8 NA NA NA 318
count(d,qn43,ql43)
## Source: local data table [3 x 3]
## Groups: qn43
##
## qn43 ql43 n
## (int) (dbl) (int)
## 1 2 0 7915
## 2 1 1 4373
## 3 NA NA 1295
# Ifthen (SAS) vs ifelse (R) statements -------------------------------------------------
# *create two-level binge drinking by sex;
# *for females, binge drinking is 4+ drinks
# *for males, binge drinking is 5+ drinks;
# if sex=1 then do; *female
# if q45 in ('1','2','3') then binge=2; *no binging;
# else if q45 in ('4','5','6','7','8') then binge=1;
# end;
# else sex=1 then do; * male
# if q45 in ('1','2','3','4',) then binge=2; *no binging;
# else if q45 in ('5','6','7','8') then binge=1;
# end;
#
# 'base R':
summary(d[,.(sexc,q45)])
## sexc q45
## males :6621 Min. :1.000
## females:6950 1st Qu.:1.000
## NA's : 12 Median :1.000
## Mean :2.178
## 3rd Qu.:2.000
## Max. :8.000
## NA's :1220
ds <- d
d <- mutate(ds, binge = ifelse(sexc=='females' & q45 %in% 1:3, 2,
ifelse(sexc=='females' & !is.na(q45), 1,
ifelse(sexc=='males' & q45 %in% 1:4, 2,
ifelse(sexc=='males' & !is.na(q45), 1, NA)
))),
binge1=ifelse(sexc=='males' & q45 < 5 | sexc=='females' & q45 < 4, 2,
ifelse(!is.na(sexc) & !is.na(q45), 1, NA)
)
)
# data table approaches
# this may be closest to SAS if-then-do statement
# but note the use of ':='
d[sexc=='females' & !is.na(q45), binge2 := ifelse(q45 < 4, 2, 1)]
d[sexc=='males' & !is.na(q45), binge2 := ifelse(q45 < 5, 2, 1)]
d[sexc=='females' & q45 < 4 | sexc=='males' & q45 <5, binge3 := 2]
d[binge2 != 2 & !is.na(q45) & !is.na(sexc), binge3:=1]
count(d,binge,binge1,binge2,binge3)
## Source: local data table [3 x 5]
## Groups: binge, binge1, binge2
##
## binge binge1 binge2 binge3 n
## (dbl) (dbl) (dbl) (dbl) (int)
## 1 NA NA NA NA 1231
## 2 2 2 2 2 10231
## 3 1 1 1 1 2121
d[,.(N=.N, minq=min(q45), maxq=max(q45)), keyby=.(sexc,binge)] # data.table
## sexc binge N minq maxq
## 1: NA NA 12 NA NA
## 2: males NA 542 NA NA
## 3: males 1 805 5 8
## 4: males 2 5274 1 4
## 5: females NA 677 NA NA
## 6: females 1 1316 4 8
## 7: females 2 4957 1 3
summary(d$bmipct)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.00 42.53 69.16 63.51 88.97 99.95 1003
mean(d$bmipct)
## [1] NA
mean(d$bmipct, na.rm=T)
## [1] 63.50862
# Missing values, R vs SAS ------------------------------------------------
# Be careful - missing data in SAS are often treated as very small numerbers
# in R, missing data are truly unknown
d[bmipct>=95, ob:=1]
d[bmipct<95, ob:=0]
d[,.(.N,min=min(bmipct),max=max(bmipct)),keyby=.(ob)]
## ob N min max
## 1: NA 1003 NA NA
## 2: 0 10792 0 94.99
## 3: 1 1788 95 99.95
# this would give the wrong answer in SAS
# Summarizing data frames -------------------------------------------------
summary(d[,.(sex,binge,grade,ob,racec,age,wt,weight,stratum,psu)])
## sex binge grade ob
## Min. :0.0000 Min. :1.000 Min. :1.000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:2.000 1st Qu.:1.000 1st Qu.:0.0000
## Median :1.0000 Median :2.000 Median :3.000 Median :0.0000
## Mean :0.5121 Mean :1.828 Mean :2.498 Mean :0.1421
## 3rd Qu.:1.0000 3rd Qu.:2.000 3rd Qu.:4.000 3rd Qu.:0.0000
## Max. :1.0000 Max. :2.000 Max. :4.000 Max. :1.0000
## NA's :12 NA's :1231 NA's :102 NA's :1003
## racec age wt weight
## white :5449 Min. :1.000 Min. : 29.94 Min. :0.0286
## black :2993 1st Qu.:4.000 1st Qu.: 56.25 1st Qu.:0.4470
## hispanic:3395 Median :5.000 Median : 64.41 Median :0.7927
## other :1428 Mean :5.157 Mean : 67.91 Mean :1.0000
## NA's : 318 3rd Qu.:6.000 3rd Qu.: 76.20 3rd Qu.:1.2222
## Max. :7.000 Max. :180.99 Max. :9.2349
## NA's :77 NA's :1004
## stratum psu
## Min. :101 Min. : 10970
## 1st Qu.:103 1st Qu.: 60890
## Median :113 Median :181470
## Mean :154 Mean :219788
## 3rd Qu.:211 3rd Qu.:290770
## Max. :214 Max. :530330
##
describe(d[,.(sex,binge,grade,racec,age,wt,weight,stratum,psu)])
## d[, .(sex, binge, grade, racec, age, wt, weight, stratum, psu)]
##
## 9 Variables 13583 Observations
## ---------------------------------------------------------------------------
## sex
## n missing unique Info Sum Mean
## 13571 12 2 0.75 6950 0.5121
## ---------------------------------------------------------------------------
## binge
## n missing unique Info Mean
## 12352 1231 2 0.43 1.828
##
## 1 (2121, 17%), 2 (10231, 83%)
## ---------------------------------------------------------------------------
## grade
## n missing unique Info Mean
## 13481 102 4 0.94 2.498
##
## 1 (3588, 27%), 2 (3152, 23%), 3 (3184, 24%), 4 (3557, 26%)
## ---------------------------------------------------------------------------
## racec
## n missing unique
## 13265 318 4
##
## white (5449, 41%), black (2993, 23%)
## hispanic (3395, 26%), other (1428, 11%)
## ---------------------------------------------------------------------------
## age
## n missing unique Info Mean
## 13506 77 7 0.95 5.157
##
## 1 2 3 4 5 6 7
## Frequency 26 18 1368 3098 3203 3473 2320
## % 0 0 10 23 24 26 17
## ---------------------------------------------------------------------------
## wt
## n missing unique Info Mean .05 .10 .25 .50
## 12579 1004 239 1 67.91 46.72 49.90 56.25 64.41
## .75 .90 .95
## 76.20 90.72 100.25
##
## lowest : 29.94 31.75 33.11 34.02 34.93
## highest: 156.95 158.76 160.12 163.30 180.99
## ---------------------------------------------------------------------------
## weight
## n missing unique Info Mean .05 .10 .25 .50
## 13583 0 2348 1 1 0.2423 0.2860 0.4470 0.7927
## .75 .90 .95
## 1.2222 1.8472 2.6882
##
## lowest : 0.0286 0.0297 0.0309 0.0313 0.0316
## highest: 7.5711 7.9843 8.4132 9.0668 9.2349
## ---------------------------------------------------------------------------
## stratum
## n missing unique Info Mean .05 .10 .25 .50
## 13583 0 13 0.99 154 101 101 103 113
## .75 .90 .95
## 211 213 213
##
## 101 102 103 111 112 113 201 202 203 211 212 213 214
## Frequency 1834 1222 1151 1049 914 1004 1597 450 826 887 791 1252 606
## % 14 9 8 8 7 7 12 3 6 7 6 9 4
## ---------------------------------------------------------------------------
## psu
## n missing unique Info Mean .05 .10 .25 .50
## 13583 0 54 1 219788 51430 60290 60890 181470
## .75 .90 .95
## 290770 510590 511610
##
## lowest : 10970 40131 51430 60010 60290
## highest: 510590 510870 511610 516800 530330
## ---------------------------------------------------------------------------
univar(d[,.(sex,binge,grade,racec,age,wt,weight,stratum,psu)])
## N.val N.mis Mean SD Min Med Skew Kurt P90
## sex 13571 12 0.51 0.5 0.0 1.0 0.0 -2.0 1
## binge 12352 1231 1.83 0.4 1.0 2.0 -1.7 1.0 2
## grade 13481 102 2.50 1.1 1.0 3.0 0.0 -1.4 4
## age 13506 77 5.16 1.3 1.0 5.0 -0.2 -0.9 7
## wt 12579 1004 67.91 16.9 29.9 64.4 1.2 2.3 91
## weight 13583 0 1.00 0.9 0.0 0.8 3.0 13.5 2
## stratum 13583 0 153.98 51.0 101.0 113.0 0.1 -1.9 213
## psu 13583 0 219787.55 151316.8 10970.0 181470.0 0.6 -0.7 510590
## Max
## sex 1
## binge 2
## grade 4
## age 7
## wt 181
## weight 9
## stratum 214
## psu 530330
v=Cs(wt,weight,stratum,psu)
univar(d[, v, with=F]) # because it's a data.table
## N.val N.mis Mean SD Min Med Skew Kurt P90
## wt 12579 1004 67.91 16.9 29.9 64.4 1.2 2.3 91
## weight 13583 0 1.00 0.9 0.0 0.8 3.0 13.5 2
## stratum 13583 0 153.98 51.0 101.0 113.0 0.1 -1.9 213
## psu 13583 0 219787.55 151316.8 10970.0 181470.0 0.6 -0.7 510590
## Max
## wt 181
## weight 9
## stratum 214
## psu 530330
# Setup for survey analyses -----------------------------------------------
# for comprision of sudaan, sas, stata, R, etc:
# http://web.pdx.edu/~wiedrick/STAT501_MASTER.pdf
# https://cran.r-project.org/web/packages/survey/index.html
# https://cran.r-project.org/web/packages/survey/survey.pdf
options(survey.adjust.domain.lonely=T, survey.lonely.psu="adjust");
I think this leads to slightly different SEs than Sudaan, but a different option can give identical results
Handling of strata with a single PSU that are not certainty PSUs is controlled by options(“survey.lonely.psu”). The default setting is “fail”, which gives an error. Use “remove” to ignore that PSU for variance computation, “adjust” to center the stratum at the population mean rather than the stratum mean, and “average” to replace the variance contribution of the stratum by the average variance contribution across strata. As of version 3.4-2 as.svrepdesign also uses this option.
The variance formulas for domain estimation give well-defined, positive results when a stratum contains only one PSU with observations in the domain, but are not unbiased.
If options(“survey.adjust.domain.lonely”) is TRUE and options(“survey.lonely.psu”) is “average” or “adjust” the same adjustment for lonely PSUs will be used within a domain.
http://r-survey.r-forge.r-project.org/survey/html/surveyoptions.html http://faculty.washington.edu/tlumley/old-survey/exmample-lonely.html
.des <- svydesign(id=~psu, strata=~stratum, data=d, nest=T, weights=~weight)
# marries design information to data set
# id to specify sampling units (primary sampling unit, the cluster variable)
# strata to specify strata
# weights to specify sampling weights
# nest=True to specify clusters are only numbered uniquely within a stratum
.des <- subset(.des, !is.na(sexc)) # subpop statement
# d_males <- subset(d, sexc=='males) followed by svydesign # Wrong
.des_m=subset(.des, sexc=='males')
summary(.des)
## Stratified 1 - level Cluster Sampling design (with replacement)
## With (54) clusters.
## subset(.des, !is.na(sexc))
## Probabilities:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.1083 0.8182 1.2620 2.0340 2.2370 34.9700
## Stratum Sizes:
## 101 102 103 111 112 113 201 202 203 211 212 213 214
## obs 1834 1222 1150 1049 912 1004 1593 450 826 887 789 1251 604
## design.PSU 8 4 3 5 3 3 8 2 2 6 4 4 2
## actual.PSU 8 4 3 5 3 3 8 2 2 6 4 4 2
## Data variables:
## [1] "stratum" "psu" "sex" "bmipct" "bmi" "race4" "race7"
## [8] "age" "wt" "weight" "grade" "q45" "qn43" "sexc"
## [15] "ql43" "ob" "racec" "binge" "binge1" "binge2" "binge3"
attributes(.des)
## $names
## [1] "cluster" "strata" "has.strata" "prob" "allprob"
## [6] "call" "variables" "fpc" "pps"
##
## $class
## [1] "survey.design2" "survey.design"
head(.des$var)
## stratum psu sex bmipct bmi race4 race7 age wt weight grade q45 qn43
## 1: 201 51430 1 NA NA NA NA NA NA 0.1910 NA 1 2
## 2: 103 10970 1 NA NA NA NA NA NA 1.1541 2 1 2
## 3: 103 10970 1 NA NA NA NA NA NA 1.1541 2 1 2
## 4: 211 530330 1 NA NA NA NA NA NA 0.7477 NA 1 2
## 5: 101 181770 1 NA NA NA NA NA NA 0.8093 NA NA 1
## 6: 112 170310 1 NA NA NA NA NA NA 1.0887 NA 2 1
## sexc ql43 ob racec binge binge1 binge2 binge3
## 1: females 0 NA NA 2 2 2 2
## 2: females 0 NA NA 2 2 2 2
## 3: females 0 NA NA 2 2 2 2
## 4: females 0 NA NA 2 2 2 2
## 5: females 1 NA NA NA NA NA NA
## 6: females 1 NA NA 2 2 2 2
# All the options for the survey package have names beginning with "survey".
# Three of them control standard error estimation.
# Basic Survey Functions --------------------------------------------------
mean(d$bmi,na.rm=T)
## [1] 23.64326
sqrt(var(d$bmi, na.rm=T)/length(!is.na(d$bmi))) # SE of bmi
## [1] 0.04302394
weighted.mean(d$bmi, w=d$weight, na.rm=T)
## [1] 23.56754
svymean(~bmi+bmipct+sexc, design = .des, na.rm=T)
## mean SE
## bmi 23.56754 0.0994
## bmipct 63.49078 0.5124
## sexcmales 0.50068 0.0064
## sexcfemales 0.49932 0.0064
# enter variables as formula (right side)
# stratified weighted means - data.table package
d[,.(.N,mbmi=mean(bmi,na.rm=T),mgrad=mean(grade,na.rm=T)), keyby=.(sexc)]
## sexc N mbmi mgrad
## 1: NA 12 NaN 2.800000
## 2: males 6621 23.50277 2.494986
## 3: females 6950 23.77833 2.500145
d[,.(.N,mbmi=wtd.mean(bmi,weight=weight,na.rm=T), mgrad=mean(grade,weight=weight,na.rm=T)),
keyby=.(sexc)]
## Warning in `[.data.table`(d, , .(.N, mbmi = wtd.mean(bmi, weight =
## weight, : Unable to optimize call to mean() and could be very slow. You
## must name 'na.rm' like that otherwise if you do mean(x,TRUE) the TRUE is
## taken to mean 'trim' which is the 2nd argument of mean. 'trim' is not yet
## optimized.
## sexc N mbmi mgrad
## 1: NA 12 NaN 2.800000
## 2: males 6621 23.37976 2.494986
## 3: females 6950 23.75584 2.500145
# ?svymean
x <- svymean(~bmi, design=.des, na.rm=T)
args(svymean)
## function (x, design, na.rm = FALSE, ...)
## NULL
x <- svymean(~bmi, .des, na.rm=T)
str(x)
## Class 'svystat' atomic [1:1] 23.6
## ..- attr(*, "var")= num [1, 1] 0.00987
## .. ..- attr(*, "dimnames")=List of 2
## .. .. ..$ : chr "bmi"
## .. .. ..$ : chr "bmi"
## ..- attr(*, "statistic")= chr "mean"
coef(x)
## bmi
## 23.56754
SE(x);
## bmi
## bmi 0.09936078
vcov(x)
## bmi
## bmi 0.009872565
sqrt(vcov(x))
## bmi
## bmi 0.09936078
x %>% confint
## 2.5 % 97.5 %
## bmi 23.3728 23.76229
# simple 'by' processing
svymean(~bmi, .des, na.rm=T)
## mean SE
## bmi 23.568 0.0994
svyby(~bmi, by=~sexc+racec, FUN=svymean, design=.des, na.rm=T)
## Warning in onestrat(x[index, , drop = FALSE], clusters[index], nPSU[index]
## [1], : Stratum (202) has only one PSU at stage 1
## sexc racec bmi se
## males.white males white 23.06087 0.1848074
## females.white females white 23.75419 0.1685017
## males.black males black 24.69301 0.2473160
## females.black females black 23.65762 0.1896313
## males.hispanic males hispanic 23.70175 0.1884616
## females.hispanic females hispanic 24.12637 0.2465299
## males.other males other 22.64588 0.2180620
## females.other females other 23.08456 0.2300261
# warnings about 1 psu in some strata
args(svyby)
## function (formula, by, design, ...)
## NULL
# Survey quantiles and writing to excel -------------------------------
quantile(d$bmi, prob=c(10,50,90)/100, na.rm=T )
## 10% 50% 90%
## 18.6119 22.4940 30.3117
svyquantile(~bmi, ci=T, quantiles=c(10,50,90)/100, des=.des, na.rm=T)
## $quantiles
## 0.1 0.5 0.9
## bmi 18.6055 22.4512 30.1211
##
## $CIs
## , , bmi
##
## 0.1 0.5 0.9
## (lower 18.4807 22.2768 29.77710
## upper) 18.6927 22.5375 30.65927
# write to clipboard for excel file
x <- svyby(~bmi, by=~sexc+racec, FUN=svymean, design=.des, na.rm=T)
## Warning in onestrat(x[index, , drop = FALSE], clusters[index], nPSU[index]
## [1], : Stratum (202) has only one PSU at stage 1
x
## sexc racec bmi se
## males.white males white 23.06087 0.1848074
## females.white females white 23.75419 0.1685017
## males.black males black 24.69301 0.2473160
## females.black females black 23.65762 0.1896313
## males.hispanic males hispanic 23.70175 0.1884616
## females.hispanic females hispanic 24.12637 0.2465299
## males.other males other 22.64588 0.2180620
## females.other females other 23.08456 0.2300261
x2 <- x %>% dcast(racec~sexc, value.var='bmi') # reshape from long to wide
x2
## racec males females
## 1 white 23.06087 23.75419
## 2 black 24.69301 23.65762
## 3 hispanic 23.70175 24.12637
## 4 other 22.64588 23.08456
wexcel(x2) # copy to clipboard for pasting into excel
# Examples from Emily's sudaan code ---------------------------------------
# Frequencies of binge drinking by sex; -----------------
# PROC CROSSTAB DATA=one;
# NEST stratum psu;
# WEIGHT weight;
# CLASS sex binge;
# TABLE sex * binge;
# SETENV decwidth=1;
# TITLE "binge drinking by sex";
# RUN;
table(d$sexc,d$racec, useNA='always')
##
## white black hispanic other <NA>
## males 2603 1497 1666 734 121
## females 2844 1494 1729 694 189
## <NA> 2 2 0 0 8
svytable(~sexc+binge, .des)
## binge
## sexc 1 2
## males 927.1389 5363.4408
## females 1330.5284 4827.4435
svytable(~sexc+binge, Ntotal=100, .des)
## binge
## sexc 1 2
## males 7.447765 43.084858
## females 10.688219 38.779158
svytable(~sexc+binge, .des) %>% prop.table(1)
## binge
## sexc 1 2
## males 0.1473853 0.8526147
## females 0.2160660 0.7839340
# Among male students, Frequency and CHISQ of current alcohol use --------
# PROC CROSSTAB DATA=one;
# NEST stratum psu;
# WEIGHT weight;
# CLASS race qn43;
# SUBPOPN sex=2;
# TABLE race * qn43;
# TEST chisq;
# TITLE "current alcohol by race";
# RUN;
svytable(~qn43+racec, subset(.des, sexc=='males')) %>% prop.table(2)
## racec
## qn43 white black hispanic other
## 1 0.3572792 0.3129768 0.3965331 0.3124031
## 2 0.6427208 0.6870232 0.6034669 0.6875969
100*svytable(~qn43+racec, subset(.des, sexc=='males')) %>% prop.table(2)
## racec
## qn43 white black hispanic other
## 1 35.72792 31.29768 39.65331 31.24031
## 2 64.27208 68.70232 60.34669 68.75969
svychisq(~qn43+racec, .des)
##
## Pearson's X^2: Rao & Scott adjustment
##
## data: svychisq(~qn43 + racec, .des)
## F = 4.0999, ndf = 2.5342, ddf = 103.9000, p-value = 0.01243
# Pairwise t-tests: current alcohol use within sex between race; ---------
# *i.e., among males, white vs black vs hispanic;
# PROC descript DATA=one;
# WEIGHT weight;
# nest stratum psu;
# var qn43 ;
# catLEVEL 1;
# subgroup sex race;
# levels 2 4;
# pairwise race;
# run;
pairwise.t.test(d$qn43,d$racec, p.adjust='none')
##
## Pairwise comparisons using t tests with pooled SD
##
## data: d$qn43 and d$racec
##
## white black hispanic
## black 1.6e-09 - -
## hispanic 0.083 4.5e-12 -
## other 1.0e-06 0.857 7.7e-09
##
## P value adjustment method: none
pairwise.t.test(d$qn43,d$racec, p.adjust='bonf')
##
## Pairwise comparisons using t tests with pooled SD
##
## data: d$qn43 and d$racec
##
## white black hispanic
## black 9.6e-09 - -
## hispanic 0.5 2.7e-11 -
## other 6.2e-06 1.0 4.6e-08
##
## P value adjustment method: bonferroni
# doesn't account for survey design
m <- svyby(~qn43, ~racec, FUN=svymean, .des, na.rm=T); m
## racec qn43 se
## white white 1.636890 0.01632279
## black black 1.704619 0.01666789
## hispanic hispanic 1.625063 0.02107369
## other other 1.699158 0.02014947
Can’t put ttest in a svyby. Also, no builtin function for pairwise in survey. There is a ‘contrast’ statement in R.
http://www.ats.ucla.edu/stat/r/faq/testing_contrasts.htm
Predictive margins for generalised linear models in R
https://gist.github.com/tslumley/2e74cd0ac12a671d2724
# What I would do:
.des_m=subset(.des, sexc=='males')
## update function for the design object
.des_m <- update(.des_m, qn43.01=ifelse(qn43==2,0,1))
m <- svyglm(qn43.01~ racec, .des_m) # white is reference group
summary(m);
##
## Call:
## svyglm(formula = qn43.01 ~ racec, .des_m)
##
## Survey design:
## update(.des_m, qn43.01 = ifelse(qn43 == 2, 0, 1))
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.35728 0.02089 17.099 <2e-16 ***
## racecblack -0.04430 0.03392 -1.306 0.199
## racechispanic 0.03925 0.03036 1.293 0.204
## racecother -0.04488 0.03204 -1.400 0.169
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 0.23162)
##
## Number of Fisher Scoring iterations: 2
# just change the reference group over and over --------------
svyglm(qn43.01~ relevel(racec, ref='black'), .des_m) %>% summary
##
## Call:
## svyglm(formula = qn43.01 ~ relevel(racec, ref = "black"), .des_m)
##
## Survey design:
## update(.des_m, qn43.01 = ifelse(qn43 == 2, 0, 1))
##
## Coefficients:
## Estimate Std. Error t value
## (Intercept) 0.3129768 0.0223167 14.024
## relevel(racec, ref = "black")white 0.0443024 0.0339159 1.306
## relevel(racec, ref = "black")hispanic 0.0835564 0.0370618 2.255
## relevel(racec, ref = "black")other -0.0005737 0.0396747 -0.014
## Pr(>|t|)
## (Intercept) <2e-16 ***
## relevel(racec, ref = "black")white 0.199
## relevel(racec, ref = "black")hispanic 0.030 *
## relevel(racec, ref = "black")other 0.989
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 0.23162)
##
## Number of Fisher Scoring iterations: 2
svyglm(qn43.01~ relevel(racec, ref='other'), .des_m) %>% summary
##
## Call:
## svyglm(formula = qn43.01 ~ relevel(racec, ref = "other"), .des_m)
##
## Survey design:
## update(.des_m, qn43.01 = ifelse(qn43 == 2, 0, 1))
##
## Coefficients:
## Estimate Std. Error t value
## (Intercept) 0.3124031 0.0257111 12.151
## relevel(racec, ref = "other")white 0.0448761 0.0320440 1.400
## relevel(racec, ref = "other")black 0.0005737 0.0396747 0.014
## relevel(racec, ref = "other")hispanic 0.0841301 0.0345634 2.434
## Pr(>|t|)
## (Intercept) 1.17e-14 ***
## relevel(racec, ref = "other")white 0.1695
## relevel(racec, ref = "other")black 0.9885
## relevel(racec, ref = "other")hispanic 0.0197 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 0.23162)
##
## Number of Fisher Scoring iterations: 2
# write a simple (means DF can do it) function:
x=svyttest(qn43~racec,
subset(.des_m, racec %in% Cs(white,black)), na.rm=T)
x
##
## Design-based t-test
##
## data: qn43 ~ racec
## t = 1.3062, df = 40, p-value = 0.1989
## alternative hypothesis: true difference in mean is not equal to 0
## sample estimates:
## difference in mean
## 0.04430241
str(x)
## List of 8
## $ statistic : Named num 1.31
## ..- attr(*, "names")= chr "t"
## $ parameter : Named num 40
## ..- attr(*, "names")= chr "df"
## $ estimate : Named num 0.0443
## ..- attr(*, "names")= chr "difference in mean"
## $ null.value : Named num 0
## ..- attr(*, "names")= chr "difference in mean"
## $ alternative: chr "two.sided"
## $ method : chr "Design-based t-test"
## $ data.name : chr "qn43 ~ racec"
## $ p.value : Named num 0.199
## ..- attr(*, "names")= chr "racecblack"
## - attr(*, "class")= chr "htest"
combn(6,2)
## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12] [,13]
## [1,] 1 1 1 1 1 2 2 2 2 3 3 3 4
## [2,] 2 3 4 5 6 3 4 5 6 4 5 6 5
## [,14] [,15]
## [1,] 4 5
## [2,] 6 6
combos=combn(levels(.des$var$racec),2) %>% t %>% data.frame %>%
rename(r1=X1, r2=X2) %>%
mutate(r1=as.character(r1), r2=as.character(r2))
combos
## r1 r2
## 1 white black
## 2 white hispanic
## 3 white other
## 4 black hispanic
## 5 black other
## 6 hispanic other
# define a function that will do the comparison for each row of combos
f.pairwise=function(i){
.temp=update(.des_m, r1=combos[i,1], r2=combos[i,2])
svyttest(qn43~racec, subset(.temp, racec==r1 | racec==r2),
na.rm=T)
}
out=lapply(1:6, f.pairwise); out
## [[1]]
##
## Design-based t-test
##
## data: qn43 ~ racec
## t = 1.3062, df = 40, p-value = 0.1989
## alternative hypothesis: true difference in mean is not equal to 0
## sample estimates:
## difference in mean
## 0.04430241
##
##
## [[2]]
##
## Design-based t-test
##
## data: qn43 ~ racec
## t = -1.2931, df = 40, p-value = 0.2034
## alternative hypothesis: true difference in mean is not equal to 0
## sample estimates:
## difference in mean
## -0.03925396
##
##
## [[3]]
##
## Design-based t-test
##
## data: qn43 ~ racec
## t = 1.4005, df = 40, p-value = 0.1691
## alternative hypothesis: true difference in mean is not equal to 0
## sample estimates:
## difference in mean
## 0.04487611
##
##
## [[4]]
##
## Design-based t-test
##
## data: qn43 ~ racec
## t = -2.2545, df = 40, p-value = 0.02971
## alternative hypothesis: true difference in mean is not equal to 0
## sample estimates:
## difference in mean
## -0.08355636
##
##
## [[5]]
##
## Design-based t-test
##
## data: qn43 ~ racec
## t = 0.01446, df = 40, p-value = 0.9885
## alternative hypothesis: true difference in mean is not equal to 0
## sample estimates:
## difference in mean
## 0.0005737011
##
##
## [[6]]
##
## Design-based t-test
##
## data: qn43 ~ racec
## t = 2.4341, df = 40, p-value = 0.01949
## alternative hypothesis: true difference in mean is not equal to 0
## sample estimates:
## difference in mean
## 0.08413006
names(out) <- paste(combos$r1,combos$r2, sep='_')
names(out)
## [1] "white_black" "white_hispanic" "white_other" "black_hispanic"
## [5] "black_other" "hispanic_other"
out2 <- sapply(out,'[','p.value') %>%
ldply %>% setnames(.,Cs(contrast,p.value))
out2
## contrast p.value
## 1 white_black.p.value 0.19893185
## 2 white_hispanic.p.value 0.20339570
## 3 white_other.p.value 0.16908779
## 4 black_hispanic.p.value 0.02970917
## 5 black_other.p.value 0.98853478
## 6 hispanic_other.p.value 0.01948833
# multicomp package --------------
# install.packages('multicomp) # multiple comparisons
library(multcomp)
pw <- summary(glht(m, mcp(racec="Tukey"))) # Adjusted p-values and SEs
# might give a warning
summary(m) # black vs white, p=0.20 in regression model
##
## Call:
## svyglm(formula = qn43.01 ~ racec, .des_m)
##
## Survey design:
## update(.des_m, qn43.01 = ifelse(qn43 == 2, 0, 1))
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.35728 0.02089 17.099 <2e-16 ***
## racecblack -0.04430 0.03392 -1.306 0.199
## racechispanic 0.03925 0.03036 1.293 0.204
## racecother -0.04488 0.03204 -1.400 0.169
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 0.23162)
##
## Number of Fisher Scoring iterations: 2
regTermTest(m,'racec')
## Wald test for racec
## in svyglm(formula = qn43.01 ~ racec, .des_m)
## F = 2.754677 on 3 and 38 df: p= 0.055705
pw # black vs white, p=0.56 from multicomp
##
## Simultaneous Tests for General Linear Hypotheses
##
## Multiple Comparisons of Means: Tukey Contrasts
##
##
## Fit: svyglm(formula = qn43.01 ~ racec, .des_m)
##
## Linear Hypotheses:
## Estimate Std. Error z value Pr(>|z|)
## black - white == 0 -0.0443024 0.0339159 -1.306 0.5556
## hispanic - white == 0 0.0392540 0.0303565 1.293 0.5641
## other - white == 0 -0.0448761 0.0320440 -1.400 0.4957
## hispanic - black == 0 0.0835564 0.0370618 2.255 0.1077
## other - black == 0 -0.0005737 0.0396747 -0.014 1.0000
## other - hispanic == 0 -0.0841301 0.0345634 -2.434 0.0701 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## (Adjusted p values reported -- single-step method)
par(mar=c(5,8,5,0.5))
plot(pw)
# Regression --------------------------------------------------------------
# replace glm with svyglm
# replace data='data' with design='design'
# base functions
glm(bmi~sex + racec + age, data=d) %>% summary
##
## Call:
## glm(formula = bmi ~ sex + racec + age, data = d)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -12.161 -3.328 -1.137 2.126 30.822
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 20.57174 0.20184 101.923 < 2e-16 ***
## sex 0.22397 0.08915 2.512 0.012 *
## racecblack 0.86943 0.11742 7.405 1.40e-13 ***
## racechispanic 0.52597 0.11193 4.699 2.64e-06 ***
## racecother -0.24337 0.15253 -1.596 0.111
## age 0.51462 0.03570 14.416 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 24.51479)
##
## Null deviance: 310397 on 12369 degrees of freedom
## Residual deviance: 303101 on 12364 degrees of freedom
## (1213 observations deleted due to missingness)
## AIC: 74688
##
## Number of Fisher Scoring iterations: 2
glm(bmi~sex * racec + age, data=d) %>% summary # same as next line
##
## Call:
## glm(formula = bmi ~ sex * racec + age, data = d)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -11.596 -3.308 -1.098 2.104 30.926
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 20.38394 0.20762 98.180 < 2e-16 ***
## sex 0.62313 0.13726 4.540 5.69e-06 ***
## racecblack 1.63125 0.16672 9.784 < 2e-16 ***
## racechispanic 0.63148 0.16021 3.942 8.14e-05 ***
## racecother -0.15035 0.21281 -0.707 0.480
## age 0.51086 0.03565 14.331 < 2e-16 ***
## sex:racecblack -1.51211 0.23451 -6.448 1.18e-10 ***
## sex:racechispanic -0.20019 0.22352 -0.896 0.370
## sex:racecother -0.16228 0.30457 -0.533 0.594
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 24.43231)
##
## Null deviance: 310397 on 12369 degrees of freedom
## Residual deviance: 302008 on 12361 degrees of freedom
## (1213 observations deleted due to missingness)
## AIC: 74649
##
## Number of Fisher Scoring iterations: 2
glm(bmi~sex + racec + sex:racec + age, data=d) %>% summary
##
## Call:
## glm(formula = bmi ~ sex + racec + sex:racec + age, data = d)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -11.596 -3.308 -1.098 2.104 30.926
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 20.38394 0.20762 98.180 < 2e-16 ***
## sex 0.62313 0.13726 4.540 5.69e-06 ***
## racecblack 1.63125 0.16672 9.784 < 2e-16 ***
## racechispanic 0.63148 0.16021 3.942 8.14e-05 ***
## racecother -0.15035 0.21281 -0.707 0.480
## age 0.51086 0.03565 14.331 < 2e-16 ***
## sex:racecblack -1.51211 0.23451 -6.448 1.18e-10 ***
## sex:racechispanic -0.20019 0.22352 -0.896 0.370
## sex:racecother -0.16228 0.30457 -0.533 0.594
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 24.43231)
##
## Null deviance: 310397 on 12369 degrees of freedom
## Residual deviance: 302008 on 12361 degrees of freedom
## (1213 observations deleted due to missingness)
## AIC: 74649
##
## Number of Fisher Scoring iterations: 2
# main effects and interaction
glm(bmi~sex * racec * age, data=d) %>% summary
##
## Call:
## glm(formula = bmi ~ sex * racec * age, data = d)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -11.876 -3.309 -1.107 2.107 30.875
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 20.49151 0.42431 48.294 < 2e-16 ***
## sex 0.16332 0.59938 0.272 0.785258
## racecblack 2.45186 0.71482 3.430 0.000605 ***
## racechispanic 0.23788 0.67050 0.355 0.722758
## racecother 0.91216 0.90226 1.011 0.312049
## age 0.48986 0.08056 6.081 1.23e-09 ***
## sex:racecblack -2.81257 1.00674 -2.794 0.005218 **
## sex:racechispanic 0.43379 0.94707 0.458 0.646938
## sex:racecother -0.30356 1.28178 -0.237 0.812797
## sex:age 0.08759 0.11213 0.781 0.434722
## racecblack:age -0.15750 0.13440 -1.172 0.241263
## racechispanic:age 0.07722 0.12763 0.605 0.545160
## racecother:age -0.21066 0.17307 -1.217 0.223546
## sex:racecblack:age 0.24978 0.18780 1.330 0.183540
## sex:racechispanic:age -0.12216 0.17845 -0.685 0.493649
## sex:racecother:age 0.03182 0.24458 0.130 0.896476
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 24.42834)
##
## Null deviance: 310397 on 12369 degrees of freedom
## Residual deviance: 301788 on 12354 degrees of freedom
## (1213 observations deleted due to missingness)
## AIC: 74654
##
## Number of Fisher Scoring iterations: 2
# weighted regression
glm(bmi~sex*racec+age, weight=weight, data=d) %>% summary
##
## Call:
## glm(formula = bmi ~ sex * racec + age, data = d, weights = weight)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -18.189 -2.866 -0.898 1.782 54.239
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 20.14851 0.20115 100.165 < 2e-16 ***
## sex 0.64243 0.11673 5.503 3.80e-08 ***
## racecblack 1.61874 0.18517 8.742 < 2e-16 ***
## racechispanic 0.70072 0.15917 4.402 1.08e-05 ***
## racecother -0.31687 0.22183 -1.428 0.153
## age 0.56909 0.03583 15.883 < 2e-16 ***
## sex:racecblack -1.67622 0.26445 -6.339 2.40e-10 ***
## sex:racechispanic -0.26861 0.22539 -1.192 0.233
## sex:racecother -0.22503 0.31929 -0.705 0.481
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 24.04883)
##
## Null deviance: 306507 on 12369 degrees of freedom
## Residual deviance: 297268 on 12361 degrees of freedom
## (1213 observations deleted due to missingness)
## AIC: 78038
##
## Number of Fisher Scoring iterations: 2
m <- svyglm(bmi~sex*racec + age, design=.des)
summary(m)
##
## Call:
## svyglm(formula = bmi ~ sex * racec + age, design = .des)
##
## Survey design:
## subset(.des, !is.na(sexc))
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 20.14851 0.34480 58.435 < 2e-16 ***
## sex 0.64243 0.18371 3.497 0.00137 **
## racecblack 1.61874 0.30757 5.263 8.49e-06 ***
## racechispanic 0.70072 0.25699 2.727 0.01017 *
## racecother -0.31687 0.26950 -1.176 0.24810
## age 0.56909 0.04612 12.340 6.54e-14 ***
## sex:racecblack -1.67622 0.32094 -5.223 9.56e-06 ***
## sex:racechispanic -0.26861 0.27224 -0.987 0.33097
## sex:racecother -0.22503 0.33284 -0.676 0.50369
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 24.03406)
##
## Number of Fisher Scoring iterations: 2
regTermTest(m, 'sex:racec') # Chunk test
## Wald test for sex:racec
## in svyglm(formula = bmi ~ sex * racec + age, design = .des)
## F = 9.105241 on 3 and 33 df: p= 0.00015543
# Logistic regression -----------------------------------------------------
# *Simple Logistic regression;
# proc rlogist data=one;
# NEST stratum psu / missunit;
# weight weight;
# class sex;
# model ql43 = sex ;
# run;
svytable(~sexc+ql43, Ntotal=100, design=.des)
## ql43
## sexc 0 1
## males 32.40422 17.80102
## females 32.68711 17.10764
m <- svyglm(ql43~sexc, family=quasibinomial, design=.des);
summary(m)
##
## Call:
## svyglm(formula = ql43 ~ sexc, family = quasibinomial, design = .des)
##
## Survey design:
## subset(.des, !is.na(sexc))
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.59903 0.06089 -9.838 3.09e-12 ***
## sexcfemales -0.04842 0.07084 -0.684 0.498
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for quasibinomial family taken to be 1.012319)
##
## Number of Fisher Scoring iterations: 4
exp(coef(m))
## (Intercept) sexcfemales
## 0.5493426 0.9527310
# *Multiple logistic regression;
# proc rlogist data=one;
# NEST stratum psu / missunit;
# weight weight;
# class sex grade race;
# model ql43 = sex grade race ;
# run;
m <- svyglm(ql43~sexc+factor(grade)+racec, family=quasibinomial, design=.des);
exp(coef(m)) %>% round(2)
## (Intercept) sexcfemales factor(grade)2 factor(grade)3 factor(grade)4
## 0.34 0.97 1.39 2.03 2.72
## racecblack racechispanic racecother
## 0.75 1.10 0.77
# Relative Risk Regression - use Poisson regression -----------------------------------------
# *if possible, predicted marginals/Risk ratios from logistic regression;
# proc rlogist data=one;
# NEST stratum psu / missunit;
# weight weight;
# class sex grade race;
# model ql43 = sex grade race ;
# PREDMARG race (1) / ADJRR;
# run;
library(epitools)
(.table=svytable(~racec+ql43, .des) %>% round)
## ql43
## racec 0 1
## white 4421 2520
## black 1166 489
## hispanic 1569 941
## other 740 319
or <- oddsratio(.table, method='wald')
or; names(or)
## $data
## ql43
## racec 0 1 Total
## white 4421 2520 6941
## black 1166 489 1655
## hispanic 1569 941 2510
## other 740 319 1059
## Total 7896 4269 12165
##
## $measure
## odds ratio with 95% C.I.
## racec estimate lower upper
## white 1.0000000 NA NA
## black 0.7357500 0.6549202 0.8265558
## hispanic 1.0521718 0.9573269 1.1564132
## other 0.7562736 0.6574101 0.8700045
##
## $p.value
## two-sided
## racec midp.exact fisher.exact chi.square
## white NA NA NA
## black 1.694601e-07 1.763436e-07 2.215733e-07
## hispanic 2.915234e-01 2.986148e-01 2.913325e-01
## other 7.664817e-05 8.375555e-05 8.959863e-05
##
## $correction
## [1] FALSE
##
## attr(,"method")
## [1] "Unconditional MLE & normal approximation (Wald) CI"
## [1] "data" "measure" "p.value" "correction"
oddsratio(.table, method='wald')$measure
## odds ratio with 95% C.I.
## racec estimate lower upper
## white 1.0000000 NA NA
## black 0.7357500 0.6549202 0.8265558
## hispanic 1.0521718 0.9573269 1.1564132
## other 0.7562736 0.6574101 0.8700045
m <- svyglm(ql43~racec, family=quasibinomial, design=.des); exp(coef(m))
## (Intercept) racecblack racechispanic racecother
## 0.5701295 0.7352843 1.0521088 0.7547253
# m <- svyglm(I(ql43==1)~racec, family=quasibinomial, design=.des); exp(coef(m))
summary(m)
##
## Call:
## svyglm(formula = ql43 ~ racec, family = quasibinomial, design = .des)
##
## Survey design:
## subset(.des, !is.na(sexc))
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.56189 0.07058 -7.961 1.28e-09 ***
## racecblack -0.30750 0.11715 -2.625 0.0124 *
## racechispanic 0.05080 0.11216 0.453 0.6532
## racecother -0.28140 0.11140 -2.526 0.0158 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for quasibinomial family taken to be 1.013566)
##
## Number of Fisher Scoring iterations: 4
attributes(m)
## $names
## [1] "coefficients" "residuals" "fitted.values"
## [4] "effects" "R" "rank"
## [7] "qr" "family" "linear.predictors"
## [10] "deviance" "aic" "null.deviance"
## [13] "iter" "weights" "prior.weights"
## [16] "df.residual" "df.null" "y"
## [19] "converged" "boundary" "model"
## [22] "na.action" "call" "formula"
## [25] "terms" "data" "offset"
## [28] "control" "method" "contrasts"
## [31] "xlevels" "naive.cov" "cov.unscaled"
## [34] "survey.design"
##
## $class
## [1] "svyglm" "glm" "lm"
# poisson regression
# http://biostats.bepress.com/uwbiostat/paper293/
# PMID 12746247
.rr <- riskratio(.table, method='wald')$measure; .rr
## risk ratio with 95% C.I.
## racec estimate lower upper
## white 1.0000000 NA NA
## black 0.8138275 0.7507640 0.8821883
## hispanic 1.0326126 0.9731071 1.0957568
## other 0.8296907 0.7530813 0.9140934
m <- svyglm(ql43~racec, family=quasipoisson, design=.des); m
## Stratified 1 - level Cluster Sampling design (with replacement)
## With (54) clusters.
## subset(.des, !is.na(sexc))
##
## Call: svyglm(formula = ql43 ~ racec, family = quasipoisson, design = .des)
##
## Coefficients:
## (Intercept) racecblack racechispanic racecother
## -1.01305 -0.20644 0.03205 -0.18812
##
## Degrees of Freedom: 12003 Total (i.e. Null); 38 Residual
## (1567 observations deleted due to missingness)
## Null Deviance: 8941
## Residual Deviance: 8911 AIC: NA
exp(coef(m)); confint(m)
## (Intercept) racecblack racechispanic racecother
## 0.3631099 0.8134764 1.0325713 0.8285142
## 2.5 % 97.5 %
## (Intercept) -1.1011556 -0.92494408
## racecblack -0.3613844 -0.05149239
## racechispanic -0.1063446 0.17044885
## racecother -0.3364529 -0.03978978
x <- cbind(epitools=.rr[,1], quasipoisson=exp(coef(m))); x
## epitools quasipoisson
## white 1.0000000 0.3631099
## black 0.8138275 0.8134764
## hispanic 1.0326126 1.0325713
## other 0.8296907 0.8285142
x[1,2] <- NA; x
## epitools quasipoisson
## white 1.0000000 NA
## black 0.8138275 0.8134764
## hispanic 1.0326126 1.0325713
## other 0.8296907 0.8285142
# I think it estimates the incidence density ratio rather than the relative risk
mOR <- svyglm(ql43~racec + sexc + grade, family=quasibinomial, design=.des);
mRR <- update(mOR, family =quasipoisson)
mOR
## Stratified 1 - level Cluster Sampling design (with replacement)
## With (54) clusters.
## subset(.des, !is.na(sexc))
##
## Call: svyglm(formula = ql43 ~ racec + sexc + grade, family = quasibinomial,
## design = .des)
##
## Coefficients:
## (Intercept) racecblack racechispanic racecother sexcfemales
## -1.40661 -0.28889 0.09085 -0.26130 -0.02899
## grade
## 0.33820
##
## Degrees of Freedom: 11974 Total (i.e. Null); 36 Residual
## (1596 observations deleted due to missingness)
## Null Deviance: 15730
## Residual Deviance: 15300 AIC: NA
mRR
## Stratified 1 - level Cluster Sampling design (with replacement)
## With (54) clusters.
## subset(.des, !is.na(sexc))
##
## Call: svyglm(formula = ql43 ~ racec + sexc + grade, family = quasipoisson,
## design = .des)
##
## Coefficients:
## (Intercept) racecblack racechispanic racecother sexcfemales
## -1.56994 -0.18893 0.05626 -0.16975 -0.01819
## grade
## 0.21619
##
## Degrees of Freedom: 11974 Total (i.e. Null); 36 Residual
## (1596 observations deleted due to missingness)
## Null Deviance: 8923
## Residual Deviance: 8646 AIC: NA
cbind(OR=coef(mOR), RR=coef(mRR)) %>% exp
## OR RR
## (Intercept) 0.2449728 0.2080582
## racecblack 0.7490951 0.8278423
## racechispanic 1.0951047 1.0578743
## racecother 0.7700504 0.8438794
## sexcfemales 0.9714298 0.9819695
## grade 1.4024179 1.2413328
https://gist.github.com/tslumley/2e74cd0ac12a671d2724
Second, run these marginals through the svypredmeans function written by Dr. Thomas Lumley. “For any archaeology fans out there, this function emulates the PREDMARG statement in the ancient language of SUDAAN.”
HELP http://r4stats.com/books/r4sas-spss/R for SAS and SPSS Users 2011
Complex Surveys: A Guide to Analysis Using R (Lumley) http://goo.gl/x255s2
http://r.789695.n4.nabble.com/ http://stackoverflow.com/questions/tagged/r