David Freedman R Survey Presentation

knitr::opts_chunk$set(echo = TRUE)

Preliminaries - set directory, install packages, load helper functions ————

# install.packages('Hmisc'); require(Hmisc)
# install.packages(Cs(haven, plyr, magrittr, dplyr, data.table, survey, epitools, mitools))
# install.packages(Cs(readr, foreign, SAScii, multcomp))

# datasets and scripts needed for the program are
# dash r program, dash html output
# basic functions.r
# yrbs.rds
# yrbs_2013_subset.rds
# 2013_SADC_SAS_Input_Program.sas.txt

library(SAScii)
library(multcomp)
library(haven); library(plyr); library(magrittr); library(dplyr)
library(data.table)
library(survey); 
library(Hmisc)
library(epitools)
library(SAScii)


# .loc is "c:/users/dxf1/dropbox/R/" (windows)
#  in windows, I set this in .Rprofile of C:\Users\dxf1\R-libs-pgrms\R-3.2.3\etc
# .loc on mac is '~/Dropbox/R/'

# (dir_dash <- "//cdc.gov/project/CDC_ATSDR_RUG/R_Demonstrations/YRBSS/")
# setwd(dir_dash)

getwd()
source('basic functions.r')

.loc

## [1] "~/Dropbox/R/"

# .loc is "c:/users/dxf1/dropbox/R/" (windows)
# in windows, I set this in .Rprofile of C:\Users\dxf1\R-libs-pgrms\R-3.2.3\etc
# .loc on mac is '~/Dropbox/R/'


# (dir_dash <- "//cdc.gov/project/CDC_ATSDR_RUG/R_Demonstrations/YRBSS/")
# setwd(dir_dash)

(dir_dash <- paste0(.loc,'Statistics/Survey/2016_Dash_presentation/'));

## [1] "~/Dropbox/R/Statistics/Survey/2016_Dash_presentation/"

setwd(dir_dash)
source('basic functions.r')

#----------------------------
# I used haven in the readr package to read the sas dataset  ----------------

# dorig <- read_sas('sadc_2013_national.sas7bdat') %>% setDT# 7s home
# saveRDS(dorig, 'yrbs.rds')
#-----------------------------


dorig <- readRDS('yrbs.rds')
dim(dorig)

## [1] 173274    276

table(dorig$year)

## 
##  1991  1993  1995  1997  1999  2001  2003  2005  2007  2009  2011  2013 
## 12272 16296 10904 16262 15349 13601 15214 13917 14041 16410 15425 13583

d2013 <- subset(dorig, year==2013) # base
dim(d2013) # 13,500

## [1] 13583   276

# saveRDS(dsub, 'yrbs_2013_subset.rds')

d2013 <- readRDS('yrbs_2013_subset.rds')

Read non-SAS files ————————————————–

Load Packages for reading external files library(readr) library(SAScii) library(foreign)

DAT file the input statement is the first line begin with ‘infile datain lrecl=800;’ end with ‘;’ on separate line

.dic <- parse.SAScii('2013_SADC_SAS_Input_Program.sas.txt') 
setDT(.dic); .dic # convert to data.table

##            varname width  char divisor
##   1:      SITECODE     5  TRUE       1
##   2:      SITENAME    50  TRUE       1
##   3:      SITETYPE    50  TRUE       1
##   4:   SITETYPENUM     8 FALSE       1
##   5:          YEAR     8 FALSE       1
##  ---                                  
## 272: QNSPORTSDRINK     3 FALSE       1
## 273: QNENERGYDRINK     3 FALSE       1
## 274:  QNSUGARDRINK     3 FALSE       1
## 275:       QNWATER     3 FALSE       1
## 276:    QNFASTFOOD     3 FALSE       1

# test <- read_fwf('sadc_2013_national.dat',
#             fwf_widths(dput(.dic[,width]), col_names=(dput(.dic[,varname]))),
#             progress = interactive()
#             )

readr package also has read_delim data.table package has fread for large csv files (very fast) readxl for excel files

foreign package has read.dta for stata files read.xpt for sas transport files

# saveRDS(dsub,'yrbs.rds') # save it in R format


# Check dataset and data management ---------------------------------------
names(d2013) %>% head(30)

##  [1] "sitecode"    "sitename"    "sitetype"    "sitetypenum" "year"       
##  [6] "survyear"    "weight"      "stratum"     "PSU"         "record"     
## [11] "age"         "sex"         "grade"       "race4"       "race7"      
## [16] "stheight"    "stweight"    "bmi"         "bmipct"      "qnobese"    
## [21] "qnowt"       "qsexid"      "qsexpart"    "sexid"       "sexid2"     
## [26] "sexpart"     "sexpart2"    "q8"          "q9"          "q10"

names(d2013) <- tolower(names(d2013));  

# helper 'Cs' function in Hmisc
vars <- Cs(stratum,psu,sex, bmipct,bmi,sex, race4, race7, age,stweight, weight,
            grade, q45, qn43)
vars <- unique(vars); vars

##  [1] "stratum"  "psu"      "sex"      "bmipct"   "bmi"      "race4"   
##  [7] "race7"    "age"      "stweight" "weight"   "grade"    "q45"     
## [13] "qn43"

vars[vars %nin% names(d2013)] # are any variables NOT in d2013 ?

## character(0)

d <- subset(d2013, select=vars) 
dim(d2013); ncol(d)

## [1] 13583   276

## [1] 13

setDT(d) # make the data.frame into a (souped up) data.table
# https://cran.r-project.org/web/packages/data.table/vignettes/datatable-intro-vignette.html

glimpse(d)

## Observations: 13,583
## Variables: 13
## $ stratum  (dbl) 103, 112, 201, 212, 212, 112, 201, 213, 214, 214, 201...
## $ psu      (dbl) 280490, 171630, 180270, 120570, 120570, 171630, 90010...
## $ sex      (dbl) NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 2, 2,...
## $ bmipct   (dbl) NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ bmi      (dbl) NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ race4    (dbl) NA, NA, NA, NA, NA, NA, NA, 2, 1, 2, 1, NA, NA, NA, N...
## $ race7    (dbl) NA, NA, NA, NA, NA, NA, NA, 3, 6, 3, 6, NA, NA, NA, N...
## $ age      (dbl) NA, NA, NA, NA, NA, NA, NA, 4, 5, 6, 6, 7, NA, NA, NA...
## $ stweight (dbl) NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ weight   (dbl) 0.3906, 0.6403, 0.8803, 1.5564, 1.1280, 0.3536, 2.086...
## $ grade    (dbl) NA, NA, NA, NA, NA, NA, NA, 2, 2, 3, 3, 4, NA, 2, 2, ...
## $ q45      (chr) "1", "1", "1", "4", "1", "8", "1", "2", "3", "7", "1"...
## $ qn43     (dbl) 2, 2, 2, 1, 2, 1, 2, NA, 1, 1, 2, NA, 2, 2, 2, 2, 1, ...

str(d)

## Classes 'data.table' and 'data.frame':   13583 obs. of  13 variables:
##  $ stratum : num  103 112 201 212 212 112 201 213 214 214 ...
##  $ psu     : num  280490 171630 180270 120570 120570 ...
##  $ sex     : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ bmipct  : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ bmi     : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ race4   : num  NA NA NA NA NA NA NA 2 1 2 ...
##  $ race7   : num  NA NA NA NA NA NA NA 3 6 3 ...
##  $ age     : num  NA NA NA NA NA NA NA 4 5 6 ...
##  $ stweight: num  NA NA NA NA NA NA NA NA NA NA ...
##  $ weight  : num  0.391 0.64 0.88 1.556 1.128 ...
##  $ grade   : num  NA NA NA NA NA NA NA 2 2 3 ...
##  $ q45     : chr  "1" "1" "1" "4" ...
##  $ qn43    : num  2 2 2 1 2 1 2 NA 1 1 ...
##  - attr(*, ".internal.selfref")=<externalptr>

summary(d)

##     stratum         psu              sex            bmipct     
##  Min.   :101   Min.   : 10970   Min.   :1.000   Min.   : 0.00  
##  1st Qu.:103   1st Qu.: 60890   1st Qu.:1.000   1st Qu.:42.53  
##  Median :113   Median :181470   Median :2.000   Median :69.16  
##  Mean   :154   Mean   :219788   Mean   :1.512   Mean   :63.51  
##  3rd Qu.:211   3rd Qu.:290770   3rd Qu.:2.000   3rd Qu.:88.97  
##  Max.   :214   Max.   :530330   Max.   :2.000   Max.   :99.95  
##                                 NA's   :12      NA's   :1003   
##       bmi            race4           race7            age       
##  Min.   :13.11   Min.   :1.000   Min.   :1.000   Min.   :1.000  
##  1st Qu.:20.30   1st Qu.:1.000   1st Qu.:3.000   1st Qu.:4.000  
##  Median :22.49   Median :2.000   Median :4.000   Median :5.000  
##  Mean   :23.64   Mean   :2.061   Mean   :4.659   Mean   :5.157  
##  3rd Qu.:25.77   3rd Qu.:3.000   3rd Qu.:6.000   3rd Qu.:6.000  
##  Max.   :55.01   Max.   :4.000   Max.   :7.000   Max.   :7.000  
##  NA's   :1003    NA's   :318     NA's   :318     NA's   :77     
##     stweight          weight           grade           q45           
##  Min.   : 29.94   Min.   :0.0286   Min.   :1.000   Length:13583      
##  1st Qu.: 56.25   1st Qu.:0.4470   1st Qu.:1.000   Class :character  
##  Median : 64.41   Median :0.7927   Median :3.000   Mode  :character  
##  Mean   : 67.91   Mean   :1.0000   Mean   :2.498                     
##  3rd Qu.: 76.20   3rd Qu.:1.2222   3rd Qu.:4.000                     
##  Max.   :180.99   Max.   :9.2349   Max.   :4.000                     
##  NA's   :1004                      NA's   :102                       
##       qn43      
##  Min.   :1.000  
##  1st Qu.:1.000  
##  Median :2.000  
##  Mean   :1.644  
##  3rd Qu.:2.000  
##  Max.   :2.000  
##  NA's   :1295

describe(d[,.(sex,qn43,q45,grade,stweight,age)]) # Hmisc package

## d[, .(sex, qn43, q45, grade, stweight, age)] 
## 
##  6  Variables      13583  Observations
## ---------------------------------------------------------------------------
## sex 
##       n missing  unique    Info    Mean 
##   13571      12       2    0.75   1.512 
## 
## 1 (6621, 49%), 2 (6950, 51%) 
## ---------------------------------------------------------------------------
## qn43 
##       n missing  unique    Info    Mean 
##   12288    1295       2    0.69   1.644 
## 
## 1 (4373, 36%), 2 (7915, 64%) 
## ---------------------------------------------------------------------------
## q45 
##       n missing  unique 
##   12363    1220       8 
## 
##              1    2   3   4   5   6   7   8
## Frequency 8028 1560 475 338 433 538 316 675
## %           65   13   4   3   4   4   3   5
## ---------------------------------------------------------------------------
## grade 
##       n missing  unique    Info    Mean 
##   13481     102       4    0.94   2.498 
## 
## 1 (3588, 27%), 2 (3152, 23%), 3 (3184, 24%), 4 (3557, 26%) 
## ---------------------------------------------------------------------------
## stweight 
##       n missing  unique    Info    Mean     .05     .10     .25     .50 
##   12579    1004     239       1   67.91   46.72   49.90   56.25   64.41 
##     .75     .90     .95 
##   76.20   90.72  100.25 
## 
## lowest :  29.94  31.75  33.11  34.02  34.93
## highest: 156.95 158.76 160.12 163.30 180.99 
## ---------------------------------------------------------------------------
## age 
##       n missing  unique    Info    Mean 
##   13506      77       7    0.95   5.157 
## 
##            1  2    3    4    5    6    7
## Frequency 26 18 1368 3098 3203 3473 2320
## %          0  0   10   23   24   26   17
## ---------------------------------------------------------------------------

univar(d) # my function with lots of help

##          N.val N.mis      Mean       SD     Min      Med Skew Kurt    P90
## stratum  13583     0    153.98     51.0   101.0    113.0  0.1 -1.9    213
## psu      13583     0 219787.55 151316.8 10970.0 181470.0  0.6 -0.7 510590
## sex      13571    12      1.51      0.5     1.0      2.0  0.0 -2.0      2
## bmipct   12580  1003     63.51     28.6     0.0     69.2 -0.6 -0.8     97
## bmi      12580  1003     23.64      5.0    13.1     22.5  1.5  3.3     30
## race4    13265   318      2.06      1.0     1.0      2.0  0.4 -1.1      4
## race7    13265   318      4.66      1.5     1.0      4.0 -0.2 -1.2      6
## age      13506    77      5.16      1.3     1.0      5.0 -0.2 -0.9      7
## stweight 12579  1004     67.91     16.9    29.9     64.4  1.2  2.3     91
## weight   13583     0      1.00      0.9     0.0      0.8  3.0 13.5      2
## grade    13481   102      2.50      1.1     1.0      3.0  0.0 -1.4      4
## qn43     12288  1295      1.64      0.5     1.0      2.0 -0.6 -1.6      2
##             Max
## stratum     214
## psu      530330
## sex           2
## bmipct      100
## bmi          55
## race4         4
## race7         7
## age           7
## stweight    181
## weight        9
## grade         4
## qn43          2

# check survey design
count(d,stratum)

## Source: local data table [13 x 2]
## 
##    stratum     n
##      (dbl) (int)
## 1      103  1151
## 2      112   914
## 3      201  1597
## 4      212   791
## 5      213  1252
## 6      214   606
## 7      211   887
## 8      101  1834
## 9      111  1049
## 10     113  1004
## 11     102  1222
## 12     202   450
## 13     203   826

count(d,psu);

## Source: local data table [54 x 2]
## 
##       psu     n
##     (dbl) (int)
## 1  280490   428
## 2  171630   327
## 3  180270   258
## 4  120570   247
## 5   90010    73
## 6   60730   607
## 7   60372   207
## 8  120860   399
## 9   51430   371
## 10  10970   337
## ..    ...   ...

count(d,stratum,psu, sort=T)

## Source: local data table [54 x 3]
## Groups: stratum
## 
##    stratum    psu     n
##      (dbl)  (dbl) (int)
## 1      101 210190   285
## 2      101 210530   279
## 3      101 290770   276
## 4      101 511610   266
## 5      101 181770   262
## 6      101 260770   230
## 7      101 390170   162
## 8      101 340090    74
## 9      102 510870   420
## 10     102 516800   409
## ..     ...    ...   ...

# data.table
d[,.N,keyby=.(stratum,psu)][1:20]

##     stratum    psu   N
##  1:     101 181770 262
##  2:     101 210190 285
##  3:     101 210530 279
##  4:     101 260770 230
##  5:     101 290770 276
##  6:     101 340090  74
##  7:     101 390170 162
##  8:     101 511610 266
##  9:     102 121110 187
## 10:     102 281210 206
## 11:     102 510870 420
## 12:     102 516800 409
## 13:     103  10970 337
## 14:     103 280490 428
## 15:     103 370510 386
## 16:     111 130130 196
## 17:     111 180630 132
## 18:     111 211170 114
## 19:     111 271630 305
## 20:     111 340390 302

d[,.N,keyby=.(psu,stratum)][1:20]

##        psu stratum   N
##  1:  10970     103 337
##  2:  40131     212 187
##  3:  51430     201 371
##  4:  60010     212 242
##  5:  60290     203 585
##  6:  60372     214 207
##  7:  60530     203 241
##  8:  60670     212 115
##  9:  60710     213 379
## 10:  60730     213 607
## 11:  60890     201 175
## 12:  80750     202 316
## 13:  90010     201  73
## 14: 120570     212 247
## 15: 120690     211 194
## 16: 120860     214 399
## 17: 121110     102 187
## 18: 130130     111 196
## 19: 131510     113 185
## 20: 160510     201 259

# the PSUs have different names across the strata (unlike NHANES)


# Create new variables ----------------------------------------------------

# *current alcohol use for logistic;
# if qn43=1 then ql43=1;
# else if qn43=2 then ql43=0;
 
# label sex="1=m 2=f"
# binge="4+ for fem, 5+ for male";

# if qn43=1 then ql43=1;
# else if qn43=2 then ql43=0; 

sapply(d,class)

##     stratum         psu         sex      bmipct         bmi       race4 
##   "numeric"   "numeric"   "numeric"   "numeric"   "numeric"   "numeric" 
##       race7         age    stweight      weight       grade         q45 
##   "numeric"   "numeric"   "numeric"   "numeric"   "numeric" "character" 
##        qn43 
##   "numeric"

d <- upData(d) # converts nummeric (length=8) to integer

## Input object size:    1416144 bytes;  13 variables    13583 observations
## New object size:  981520 bytes;   13 variables    13583 observations

sapply(d,class)

##     stratum         psu         sex      bmipct         bmi       race4 
##   "integer"   "integer"   "integer"   "numeric"   "numeric"   "integer" 
##       race7         age    stweight      weight       grade         q45 
##   "integer"   "integer"   "numeric"   "numeric"   "integer" "character" 
##        qn43 
##   "integer"

ds <- d
d <- mutate(ds, q45=as.integer(q45), sex=as.integer(sex)-1, # not needed if 'upData' is used
            sexc=factor(sex, labels=c('males','females')),
            ql43=ifelse(qn43==2,0, qn43),
            ob=ifelse(bmipct>=95, 1, 
                  ifelse(!is.na(bmipct),0,NA) ),
            racec=factor(race4,levels=c(1:4),
                   c('white','black','hispanic','other')) ) %>% 
     rename(wt=stweight)

# I took a guess at how race4 was coded !!

with(d,table(racec,race7, useNA='always'))

##           race7
## racec         1    2    3    4    5    6    7 <NA>
##   white       0    0    0    0    0 5449    0    0
##   black       0    0 2993    0    0    0    0    0
##   hispanic    0    0    0 3395    0    0    0    0
##   other     121  491    0    0  135    0  681    0
##   <NA>        0    0    0    0    0    0    0  318

count(d,racec,race4,race7, sort=T)

## Source: local data table [8 x 4]
## Groups: racec, race4
## 
##      racec race4 race7     n
##     (fctr) (int) (int) (int)
## 1    white     1     6  5449
## 2    black     2     3  2993
## 3 hispanic     3     4  3395
## 4    other     4     7   681
## 5    other     4     2   491
## 6    other     4     5   135
## 7    other     4     1   121
## 8       NA    NA    NA   318

count(d,qn43,ql43)

## Source: local data table [3 x 3]
## Groups: qn43
## 
##    qn43  ql43     n
##   (int) (dbl) (int)
## 1     2     0  7915
## 2     1     1  4373
## 3    NA    NA  1295

# Ifthen (SAS) vs ifelse (R) statements -------------------------------------------------


# *create two-level binge drinking by sex;
#       *for females, binge drinking is 4+ drinks
#       *for males, binge drinking is 5+ drinks;
# if sex=1 then do; *female
#       if q45 in ('1','2','3') then binge=2; *no binging;
#       else if q45 in ('4','5','6','7','8') then binge=1;
#       end;
# else sex=1 then do; * male
#             if q45 in ('1','2','3','4',) then binge=2; *no binging;
#       else if q45 in ('5','6','7','8') then binge=1;
#       end;
#      

# 'base R':
summary(d[,.(sexc,q45)])

##       sexc           q45       
##  males  :6621   Min.   :1.000  
##  females:6950   1st Qu.:1.000  
##  NA's   :  12   Median :1.000  
##                 Mean   :2.178  
##                 3rd Qu.:2.000  
##                 Max.   :8.000  
##                 NA's   :1220

ds <- d
d <- mutate(ds, binge = ifelse(sexc=='females' & q45 %in% 1:3, 2, 
                         ifelse(sexc=='females' & !is.na(q45), 1,
                           ifelse(sexc=='males' & q45 %in% 1:4, 2,
                             ifelse(sexc=='males' & !is.na(q45), 1, NA) 
                            ))), 
      binge1=ifelse(sexc=='males' & q45 < 5 | sexc=='females' & q45 < 4, 2,
                ifelse(!is.na(sexc) & !is.na(q45), 1, NA)
      )
)


# data table approaches
# this may be closest to SAS if-then-do statement
# but note the use of ':='

d[sexc=='females' & !is.na(q45), binge2 := ifelse(q45 < 4, 2, 1)]
d[sexc=='males' & !is.na(q45), binge2 := ifelse(q45 < 5, 2, 1)]

d[sexc=='females' & q45 < 4 | sexc=='males' & q45 <5, binge3 := 2] 
d[binge2 != 2 & !is.na(q45) & !is.na(sexc), binge3:=1]

count(d,binge,binge1,binge2,binge3)

## Source: local data table [3 x 5]
## Groups: binge, binge1, binge2
## 
##   binge binge1 binge2 binge3     n
##   (dbl)  (dbl)  (dbl)  (dbl) (int)
## 1    NA     NA     NA     NA  1231
## 2     2      2      2      2 10231
## 3     1      1      1      1  2121

d[,.(N=.N, minq=min(q45), maxq=max(q45)), keyby=.(sexc,binge)] # data.table

##       sexc binge    N minq maxq
## 1:      NA    NA   12   NA   NA
## 2:   males    NA  542   NA   NA
## 3:   males     1  805    5    8
## 4:   males     2 5274    1    4
## 5: females    NA  677   NA   NA
## 6: females     1 1316    4    8
## 7: females     2 4957    1    3

summary(d$bmipct)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    0.00   42.53   69.16   63.51   88.97   99.95    1003

mean(d$bmipct)

## [1] NA

mean(d$bmipct, na.rm=T)

## [1] 63.50862

# Missing values, R vs SAS ------------------------------------------------
# Be careful - missing data in SAS are often treated as very small numerbers
# in R, missing data are truly unknown
d[bmipct>=95, ob:=1]
d[bmipct<95, ob:=0]
d[,.(.N,min=min(bmipct),max=max(bmipct)),keyby=.(ob)]

##    ob     N min   max
## 1: NA  1003  NA    NA
## 2:  0 10792   0 94.99
## 3:  1  1788  95 99.95

# this would give the wrong answer in SAS



# Summarizing data frames -------------------------------------------------
summary(d[,.(sex,binge,grade,ob,racec,age,wt,weight,stratum,psu)])

##       sex             binge           grade             ob        
##  Min.   :0.0000   Min.   :1.000   Min.   :1.000   Min.   :0.0000  
##  1st Qu.:0.0000   1st Qu.:2.000   1st Qu.:1.000   1st Qu.:0.0000  
##  Median :1.0000   Median :2.000   Median :3.000   Median :0.0000  
##  Mean   :0.5121   Mean   :1.828   Mean   :2.498   Mean   :0.1421  
##  3rd Qu.:1.0000   3rd Qu.:2.000   3rd Qu.:4.000   3rd Qu.:0.0000  
##  Max.   :1.0000   Max.   :2.000   Max.   :4.000   Max.   :1.0000  
##  NA's   :12       NA's   :1231    NA's   :102     NA's   :1003    
##       racec           age              wt             weight      
##  white   :5449   Min.   :1.000   Min.   : 29.94   Min.   :0.0286  
##  black   :2993   1st Qu.:4.000   1st Qu.: 56.25   1st Qu.:0.4470  
##  hispanic:3395   Median :5.000   Median : 64.41   Median :0.7927  
##  other   :1428   Mean   :5.157   Mean   : 67.91   Mean   :1.0000  
##  NA's    : 318   3rd Qu.:6.000   3rd Qu.: 76.20   3rd Qu.:1.2222  
##                  Max.   :7.000   Max.   :180.99   Max.   :9.2349  
##                  NA's   :77      NA's   :1004                     
##     stratum         psu        
##  Min.   :101   Min.   : 10970  
##  1st Qu.:103   1st Qu.: 60890  
##  Median :113   Median :181470  
##  Mean   :154   Mean   :219788  
##  3rd Qu.:211   3rd Qu.:290770  
##  Max.   :214   Max.   :530330  
##

describe(d[,.(sex,binge,grade,racec,age,wt,weight,stratum,psu)])

## d[, .(sex, binge, grade, racec, age, wt, weight, stratum, psu)] 
## 
##  9  Variables      13583  Observations
## ---------------------------------------------------------------------------
## sex 
##       n missing  unique    Info     Sum    Mean 
##   13571      12       2    0.75    6950  0.5121 
## ---------------------------------------------------------------------------
## binge 
##       n missing  unique    Info    Mean 
##   12352    1231       2    0.43   1.828 
## 
## 1 (2121, 17%), 2 (10231, 83%) 
## ---------------------------------------------------------------------------
## grade 
##       n missing  unique    Info    Mean 
##   13481     102       4    0.94   2.498 
## 
## 1 (3588, 27%), 2 (3152, 23%), 3 (3184, 24%), 4 (3557, 26%) 
## ---------------------------------------------------------------------------
## racec 
##       n missing  unique 
##   13265     318       4 
## 
## white (5449, 41%), black (2993, 23%) 
## hispanic (3395, 26%), other (1428, 11%) 
## ---------------------------------------------------------------------------
## age 
##       n missing  unique    Info    Mean 
##   13506      77       7    0.95   5.157 
## 
##            1  2    3    4    5    6    7
## Frequency 26 18 1368 3098 3203 3473 2320
## %          0  0   10   23   24   26   17
## ---------------------------------------------------------------------------
## wt 
##       n missing  unique    Info    Mean     .05     .10     .25     .50 
##   12579    1004     239       1   67.91   46.72   49.90   56.25   64.41 
##     .75     .90     .95 
##   76.20   90.72  100.25 
## 
## lowest :  29.94  31.75  33.11  34.02  34.93
## highest: 156.95 158.76 160.12 163.30 180.99 
## ---------------------------------------------------------------------------
## weight 
##       n missing  unique    Info    Mean     .05     .10     .25     .50 
##   13583       0    2348       1       1  0.2423  0.2860  0.4470  0.7927 
##     .75     .90     .95 
##  1.2222  1.8472  2.6882 
## 
## lowest : 0.0286 0.0297 0.0309 0.0313 0.0316
## highest: 7.5711 7.9843 8.4132 9.0668 9.2349 
## ---------------------------------------------------------------------------
## stratum 
##       n missing  unique    Info    Mean     .05     .10     .25     .50 
##   13583       0      13    0.99     154     101     101     103     113 
##     .75     .90     .95 
##     211     213     213 
## 
##            101  102  103  111 112  113  201 202 203 211 212  213 214
## Frequency 1834 1222 1151 1049 914 1004 1597 450 826 887 791 1252 606
## %           14    9    8    8   7    7   12   3   6   7   6    9   4
## ---------------------------------------------------------------------------
## psu 
##       n missing  unique    Info    Mean     .05     .10     .25     .50 
##   13583       0      54       1  219788   51430   60290   60890  181470 
##     .75     .90     .95 
##  290770  510590  511610 
## 
## lowest :  10970  40131  51430  60010  60290
## highest: 510590 510870 511610 516800 530330 
## ---------------------------------------------------------------------------

univar(d[,.(sex,binge,grade,racec,age,wt,weight,stratum,psu)])

##         N.val N.mis      Mean       SD     Min      Med Skew Kurt    P90
## sex     13571    12      0.51      0.5     0.0      1.0  0.0 -2.0      1
## binge   12352  1231      1.83      0.4     1.0      2.0 -1.7  1.0      2
## grade   13481   102      2.50      1.1     1.0      3.0  0.0 -1.4      4
## age     13506    77      5.16      1.3     1.0      5.0 -0.2 -0.9      7
## wt      12579  1004     67.91     16.9    29.9     64.4  1.2  2.3     91
## weight  13583     0      1.00      0.9     0.0      0.8  3.0 13.5      2
## stratum 13583     0    153.98     51.0   101.0    113.0  0.1 -1.9    213
## psu     13583     0 219787.55 151316.8 10970.0 181470.0  0.6 -0.7 510590
##            Max
## sex          1
## binge        2
## grade        4
## age          7
## wt         181
## weight       9
## stratum    214
## psu     530330

v=Cs(wt,weight,stratum,psu)
univar(d[, v, with=F]) # because it's a data.table

##         N.val N.mis      Mean       SD     Min      Med Skew Kurt    P90
## wt      12579  1004     67.91     16.9    29.9     64.4  1.2  2.3     91
## weight  13583     0      1.00      0.9     0.0      0.8  3.0 13.5      2
## stratum 13583     0    153.98     51.0   101.0    113.0  0.1 -1.9    213
## psu     13583     0 219787.55 151316.8 10970.0 181470.0  0.6 -0.7 510590
##            Max
## wt         181
## weight       9
## stratum    214
## psu     530330

# Setup for survey analyses -----------------------------------------------
# for comprision of sudaan, sas, stata, R, etc:
# http://web.pdx.edu/~wiedrick/STAT501_MASTER.pdf

# https://cran.r-project.org/web/packages/survey/index.html
# https://cran.r-project.org/web/packages/survey/survey.pdf

options(survey.adjust.domain.lonely=T, survey.lonely.psu="adjust");

I think this leads to slightly different SEs than Sudaan, but a different option can give identical results

Handling of strata with a single PSU that are not certainty PSUs is controlled by options(“survey.lonely.psu”). The default setting is “fail”, which gives an error. Use “remove” to ignore that PSU for variance computation, “adjust” to center the stratum at the population mean rather than the stratum mean, and “average” to replace the variance contribution of the stratum by the average variance contribution across strata. As of version 3.4-2 as.svrepdesign also uses this option.

The variance formulas for domain estimation give well-defined, positive results when a stratum contains only one PSU with observations in the domain, but are not unbiased.

If options(“survey.adjust.domain.lonely”) is TRUE and options(“survey.lonely.psu”) is “average” or “adjust” the same adjustment for lonely PSUs will be used within a domain.

http://r-survey.r-forge.r-project.org/survey/html/surveyoptions.html http://faculty.washington.edu/tlumley/old-survey/exmample-lonely.html

.des <- svydesign(id=~psu, strata=~stratum, data=d, nest=T, weights=~weight)
# marries design information to data set

# id to specify sampling units (primary sampling unit, the cluster variable)
# strata to specify strata
# weights to specify sampling weights
# nest=True to specify clusters are only numbered uniquely within a stratum

.des <- subset(.des, !is.na(sexc)) # subpop statement

# d_males <- subset(d, sexc=='males) followed by svydesign # Wrong
.des_m=subset(.des, sexc=='males')

summary(.des)

## Stratified 1 - level Cluster Sampling design (with replacement)
## With (54) clusters.
## subset(.des, !is.na(sexc))
## Probabilities:
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.1083  0.8182  1.2620  2.0340  2.2370 34.9700 
## Stratum Sizes: 
##             101  102  103  111 112  113  201 202 203 211 212  213 214
## obs        1834 1222 1150 1049 912 1004 1593 450 826 887 789 1251 604
## design.PSU    8    4    3    5   3    3    8   2   2   6   4    4   2
## actual.PSU    8    4    3    5   3    3    8   2   2   6   4    4   2
## Data variables:
##  [1] "stratum" "psu"     "sex"     "bmipct"  "bmi"     "race4"   "race7"  
##  [8] "age"     "wt"      "weight"  "grade"   "q45"     "qn43"    "sexc"   
## [15] "ql43"    "ob"      "racec"   "binge"   "binge1"  "binge2"  "binge3"

attributes(.des)

## $names
## [1] "cluster"    "strata"     "has.strata" "prob"       "allprob"   
## [6] "call"       "variables"  "fpc"        "pps"       
## 
## $class
## [1] "survey.design2" "survey.design"

head(.des$var)

##    stratum    psu sex bmipct bmi race4 race7 age wt weight grade q45 qn43
## 1:     201  51430   1     NA  NA    NA    NA  NA NA 0.1910    NA   1    2
## 2:     103  10970   1     NA  NA    NA    NA  NA NA 1.1541     2   1    2
## 3:     103  10970   1     NA  NA    NA    NA  NA NA 1.1541     2   1    2
## 4:     211 530330   1     NA  NA    NA    NA  NA NA 0.7477    NA   1    2
## 5:     101 181770   1     NA  NA    NA    NA  NA NA 0.8093    NA  NA    1
## 6:     112 170310   1     NA  NA    NA    NA  NA NA 1.0887    NA   2    1
##       sexc ql43 ob racec binge binge1 binge2 binge3
## 1: females    0 NA    NA     2      2      2      2
## 2: females    0 NA    NA     2      2      2      2
## 3: females    0 NA    NA     2      2      2      2
## 4: females    0 NA    NA     2      2      2      2
## 5: females    1 NA    NA    NA     NA     NA     NA
## 6: females    1 NA    NA     2      2      2      2

# All the options for the survey package have names beginning with "survey". 
# Three of them control standard error estimation.

# Basic Survey Functions --------------------------------------------------
mean(d$bmi,na.rm=T)

## [1] 23.64326

sqrt(var(d$bmi, na.rm=T)/length(!is.na(d$bmi))) # SE of bmi

## [1] 0.04302394

weighted.mean(d$bmi, w=d$weight, na.rm=T)

## [1] 23.56754

svymean(~bmi+bmipct+sexc, design = .des, na.rm=T)

##                 mean     SE
## bmi         23.56754 0.0994
## bmipct      63.49078 0.5124
## sexcmales    0.50068 0.0064
## sexcfemales  0.49932 0.0064

# enter variables as formula (right side)

# stratified weighted means - data.table package
d[,.(.N,mbmi=mean(bmi,na.rm=T),mgrad=mean(grade,na.rm=T)), keyby=.(sexc)]

##       sexc    N     mbmi    mgrad
## 1:      NA   12      NaN 2.800000
## 2:   males 6621 23.50277 2.494986
## 3: females 6950 23.77833 2.500145

d[,.(.N,mbmi=wtd.mean(bmi,weight=weight,na.rm=T), mgrad=mean(grade,weight=weight,na.rm=T)), 
  keyby=.(sexc)]

## Warning in `[.data.table`(d, , .(.N, mbmi = wtd.mean(bmi, weight =
## weight, : Unable to optimize call to mean() and could be very slow. You
## must name 'na.rm' like that otherwise if you do mean(x,TRUE) the TRUE is
## taken to mean 'trim' which is the 2nd argument of mean. 'trim' is not yet
## optimized.

##       sexc    N     mbmi    mgrad
## 1:      NA   12      NaN 2.800000
## 2:   males 6621 23.37976 2.494986
## 3: females 6950 23.75584 2.500145

# ?svymean
x <- svymean(~bmi, design=.des, na.rm=T)
args(svymean)

## function (x, design, na.rm = FALSE, ...) 
## NULL

x <- svymean(~bmi, .des, na.rm=T)

str(x)

## Class 'svystat'  atomic [1:1] 23.6
##   ..- attr(*, "var")= num [1, 1] 0.00987
##   .. ..- attr(*, "dimnames")=List of 2
##   .. .. ..$ : chr "bmi"
##   .. .. ..$ : chr "bmi"
##   ..- attr(*, "statistic")= chr "mean"

coef(x)

##      bmi 
## 23.56754

SE(x);

##            bmi
## bmi 0.09936078

vcov(x)

##             bmi
## bmi 0.009872565

sqrt(vcov(x))

##            bmi
## bmi 0.09936078

x %>% confint

##       2.5 %   97.5 %
## bmi 23.3728 23.76229

# simple 'by' processing
svymean(~bmi, .des, na.rm=T)

##       mean     SE
## bmi 23.568 0.0994

svyby(~bmi, by=~sexc+racec, FUN=svymean, design=.des, na.rm=T)

## Warning in onestrat(x[index, , drop = FALSE], clusters[index], nPSU[index]
## [1], : Stratum (202) has only one PSU at stage 1

##                     sexc    racec      bmi        se
## males.white        males    white 23.06087 0.1848074
## females.white    females    white 23.75419 0.1685017
## males.black        males    black 24.69301 0.2473160
## females.black    females    black 23.65762 0.1896313
## males.hispanic     males hispanic 23.70175 0.1884616
## females.hispanic females hispanic 24.12637 0.2465299
## males.other        males    other 22.64588 0.2180620
## females.other    females    other 23.08456 0.2300261

# warnings about 1 psu in some strata
args(svyby)

## function (formula, by, design, ...) 
## NULL

# Survey quantiles and writing to excel -------------------------------

quantile(d$bmi, prob=c(10,50,90)/100, na.rm=T )

##     10%     50%     90% 
## 18.6119 22.4940 30.3117

svyquantile(~bmi, ci=T, quantiles=c(10,50,90)/100, des=.des, na.rm=T)

## $quantiles
##         0.1     0.5     0.9
## bmi 18.6055 22.4512 30.1211
## 
## $CIs
## , , bmi
## 
##            0.1     0.5      0.9
## (lower 18.4807 22.2768 29.77710
## upper) 18.6927 22.5375 30.65927

# write to clipboard for excel file
x <- svyby(~bmi, by=~sexc+racec, FUN=svymean, design=.des, na.rm=T)

## Warning in onestrat(x[index, , drop = FALSE], clusters[index], nPSU[index]
## [1], : Stratum (202) has only one PSU at stage 1

##                     sexc    racec      bmi        se
## males.white        males    white 23.06087 0.1848074
## females.white    females    white 23.75419 0.1685017
## males.black        males    black 24.69301 0.2473160
## females.black    females    black 23.65762 0.1896313
## males.hispanic     males hispanic 23.70175 0.1884616
## females.hispanic females hispanic 24.12637 0.2465299
## males.other        males    other 22.64588 0.2180620
## females.other    females    other 23.08456 0.2300261

x2 <- x %>% dcast(racec~sexc, value.var='bmi') # reshape from long to wide
x2

##      racec    males  females
## 1    white 23.06087 23.75419
## 2    black 24.69301 23.65762
## 3 hispanic 23.70175 24.12637
## 4    other 22.64588 23.08456

wexcel(x2) # copy to clipboard for pasting into excel



# Examples from Emily's sudaan code ---------------------------------------

# Frequencies of binge drinking by sex; -----------------
# PROC CROSSTAB DATA=one;
# NEST stratum psu;
# WEIGHT weight;
# CLASS sex binge;
# TABLE sex * binge;
# SETENV decwidth=1;
# TITLE "binge drinking by sex";
# RUN;

table(d$sexc,d$racec, useNA='always')

##          
##           white black hispanic other <NA>
##   males    2603  1497     1666   734  121
##   females  2844  1494     1729   694  189
##   <NA>        2     2        0     0    8

svytable(~sexc+binge, .des)

##          binge
## sexc              1         2
##   males    927.1389 5363.4408
##   females 1330.5284 4827.4435

svytable(~sexc+binge, Ntotal=100, .des)

##          binge
## sexc              1         2
##   males    7.447765 43.084858
##   females 10.688219 38.779158

svytable(~sexc+binge, .des) %>% prop.table(1)

##          binge
## sexc              1         2
##   males   0.1473853 0.8526147
##   females 0.2160660 0.7839340

# Among male students, Frequency and CHISQ of current alcohol use --------
# PROC CROSSTAB DATA=one;
# NEST stratum psu;
# WEIGHT weight;
# CLASS race qn43;
# SUBPOPN sex=2;
# TABLE race * qn43;
# TEST chisq;
# TITLE "current alcohol by race";
# RUN;

svytable(~qn43+racec, subset(.des, sexc=='males')) %>% prop.table(2)

##     racec
## qn43     white     black  hispanic     other
##    1 0.3572792 0.3129768 0.3965331 0.3124031
##    2 0.6427208 0.6870232 0.6034669 0.6875969

100*svytable(~qn43+racec, subset(.des, sexc=='males')) %>% prop.table(2)

##     racec
## qn43    white    black hispanic    other
##    1 35.72792 31.29768 39.65331 31.24031
##    2 64.27208 68.70232 60.34669 68.75969

svychisq(~qn43+racec, .des)

## 
##  Pearson's X^2: Rao & Scott adjustment
## 
## data:  svychisq(~qn43 + racec, .des)
## F = 4.0999, ndf = 2.5342, ddf = 103.9000, p-value = 0.01243

# Pairwise t-tests: current alcohol use within sex between race; ---------
# *i.e., among males, white vs black vs hispanic;

# PROC descript DATA=one;
# WEIGHT weight;
# nest stratum psu;
# var  qn43 ;
# catLEVEL   1; 
# subgroup  sex race;
# levels  2 4;
# pairwise race;
# run;

pairwise.t.test(d$qn43,d$racec, p.adjust='none')

## 
##  Pairwise comparisons using t tests with pooled SD 
## 
## data:  d$qn43 and d$racec 
## 
##          white   black   hispanic
## black    1.6e-09 -       -       
## hispanic 0.083   4.5e-12 -       
## other    1.0e-06 0.857   7.7e-09 
## 
## P value adjustment method: none

pairwise.t.test(d$qn43,d$racec, p.adjust='bonf')

## 
##  Pairwise comparisons using t tests with pooled SD 
## 
## data:  d$qn43 and d$racec 
## 
##          white   black   hispanic
## black    9.6e-09 -       -       
## hispanic 0.5     2.7e-11 -       
## other    6.2e-06 1.0     4.6e-08 
## 
## P value adjustment method: bonferroni

# doesn't account for survey design

m <- svyby(~qn43, ~racec, FUN=svymean, .des, na.rm=T); m

##             racec     qn43         se
## white       white 1.636890 0.01632279
## black       black 1.704619 0.01666789
## hispanic hispanic 1.625063 0.02107369
## other       other 1.699158 0.02014947

Can’t put ttest in a svyby. Also, no builtin function for pairwise in survey. There is a ‘contrast’ statement in R.

http://www.ats.ucla.edu/stat/r/faq/testing_contrasts.htm

Predictive margins for generalised linear models in R

https://gist.github.com/tslumley/2e74cd0ac12a671d2724

# What I would do:
.des_m=subset(.des, sexc=='males')

## update function for the design object
.des_m <- update(.des_m, qn43.01=ifelse(qn43==2,0,1))

m <- svyglm(qn43.01~ racec, .des_m) # white is reference group
summary(m);

## 
## Call:
## svyglm(formula = qn43.01 ~ racec, .des_m)
## 
## Survey design:
## update(.des_m, qn43.01 = ifelse(qn43 == 2, 0, 1))
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    0.35728    0.02089  17.099   <2e-16 ***
## racecblack    -0.04430    0.03392  -1.306    0.199    
## racechispanic  0.03925    0.03036   1.293    0.204    
## racecother    -0.04488    0.03204  -1.400    0.169    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 0.23162)
## 
## Number of Fisher Scoring iterations: 2

# just change the reference group over and over --------------
svyglm(qn43.01~ relevel(racec, ref='black'), .des_m) %>% summary

## 
## Call:
## svyglm(formula = qn43.01 ~ relevel(racec, ref = "black"), .des_m)
## 
## Survey design:
## update(.des_m, qn43.01 = ifelse(qn43 == 2, 0, 1))
## 
## Coefficients:
##                                         Estimate Std. Error t value
## (Intercept)                            0.3129768  0.0223167  14.024
## relevel(racec, ref = "black")white     0.0443024  0.0339159   1.306
## relevel(racec, ref = "black")hispanic  0.0835564  0.0370618   2.255
## relevel(racec, ref = "black")other    -0.0005737  0.0396747  -0.014
##                                       Pr(>|t|)    
## (Intercept)                             <2e-16 ***
## relevel(racec, ref = "black")white       0.199    
## relevel(racec, ref = "black")hispanic    0.030 *  
## relevel(racec, ref = "black")other       0.989    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 0.23162)
## 
## Number of Fisher Scoring iterations: 2

svyglm(qn43.01~ relevel(racec, ref='other'), .des_m) %>% summary

## 
## Call:
## svyglm(formula = qn43.01 ~ relevel(racec, ref = "other"), .des_m)
## 
## Survey design:
## update(.des_m, qn43.01 = ifelse(qn43 == 2, 0, 1))
## 
## Coefficients:
##                                        Estimate Std. Error t value
## (Intercept)                           0.3124031  0.0257111  12.151
## relevel(racec, ref = "other")white    0.0448761  0.0320440   1.400
## relevel(racec, ref = "other")black    0.0005737  0.0396747   0.014
## relevel(racec, ref = "other")hispanic 0.0841301  0.0345634   2.434
##                                       Pr(>|t|)    
## (Intercept)                           1.17e-14 ***
## relevel(racec, ref = "other")white      0.1695    
## relevel(racec, ref = "other")black      0.9885    
## relevel(racec, ref = "other")hispanic   0.0197 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 0.23162)
## 
## Number of Fisher Scoring iterations: 2

# write a simple (means DF can do it) function:
x=svyttest(qn43~racec, 
         subset(.des_m, racec %in% Cs(white,black)), na.rm=T)
x

## 
##  Design-based t-test
## 
## data:  qn43 ~ racec
## t = 1.3062, df = 40, p-value = 0.1989
## alternative hypothesis: true difference in mean is not equal to 0
## sample estimates:
## difference in mean 
##         0.04430241

str(x)

## List of 8
##  $ statistic  : Named num 1.31
##   ..- attr(*, "names")= chr "t"
##  $ parameter  : Named num 40
##   ..- attr(*, "names")= chr "df"
##  $ estimate   : Named num 0.0443
##   ..- attr(*, "names")= chr "difference in mean"
##  $ null.value : Named num 0
##   ..- attr(*, "names")= chr "difference in mean"
##  $ alternative: chr "two.sided"
##  $ method     : chr "Design-based t-test"
##  $ data.name  : chr "qn43 ~ racec"
##  $ p.value    : Named num 0.199
##   ..- attr(*, "names")= chr "racecblack"
##  - attr(*, "class")= chr "htest"

combn(6,2)

##      [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12] [,13]
## [1,]    1    1    1    1    1    2    2    2    2     3     3     3     4
## [2,]    2    3    4    5    6    3    4    5    6     4     5     6     5
##      [,14] [,15]
## [1,]     4     5
## [2,]     6     6

combos=combn(levels(.des$var$racec),2) %>% t %>% data.frame %>% 
   rename(r1=X1, r2=X2) %>% 
   mutate(r1=as.character(r1), r2=as.character(r2))
combos

##         r1       r2
## 1    white    black
## 2    white hispanic
## 3    white    other
## 4    black hispanic
## 5    black    other
## 6 hispanic    other

# define a function that will do the comparison for each row of combos
f.pairwise=function(i){
   .temp=update(.des_m, r1=combos[i,1], r2=combos[i,2])
   svyttest(qn43~racec, subset(.temp, racec==r1 | racec==r2), 
            na.rm=T)
}


out=lapply(1:6, f.pairwise); out

## [[1]]
## 
##  Design-based t-test
## 
## data:  qn43 ~ racec
## t = 1.3062, df = 40, p-value = 0.1989
## alternative hypothesis: true difference in mean is not equal to 0
## sample estimates:
## difference in mean 
##         0.04430241 
## 
## 
## [[2]]
## 
##  Design-based t-test
## 
## data:  qn43 ~ racec
## t = -1.2931, df = 40, p-value = 0.2034
## alternative hypothesis: true difference in mean is not equal to 0
## sample estimates:
## difference in mean 
##        -0.03925396 
## 
## 
## [[3]]
## 
##  Design-based t-test
## 
## data:  qn43 ~ racec
## t = 1.4005, df = 40, p-value = 0.1691
## alternative hypothesis: true difference in mean is not equal to 0
## sample estimates:
## difference in mean 
##         0.04487611 
## 
## 
## [[4]]
## 
##  Design-based t-test
## 
## data:  qn43 ~ racec
## t = -2.2545, df = 40, p-value = 0.02971
## alternative hypothesis: true difference in mean is not equal to 0
## sample estimates:
## difference in mean 
##        -0.08355636 
## 
## 
## [[5]]
## 
##  Design-based t-test
## 
## data:  qn43 ~ racec
## t = 0.01446, df = 40, p-value = 0.9885
## alternative hypothesis: true difference in mean is not equal to 0
## sample estimates:
## difference in mean 
##       0.0005737011 
## 
## 
## [[6]]
## 
##  Design-based t-test
## 
## data:  qn43 ~ racec
## t = 2.4341, df = 40, p-value = 0.01949
## alternative hypothesis: true difference in mean is not equal to 0
## sample estimates:
## difference in mean 
##         0.08413006

names(out) <- paste(combos$r1,combos$r2, sep='_')
names(out)

## [1] "white_black"    "white_hispanic" "white_other"    "black_hispanic"
## [5] "black_other"    "hispanic_other"

out2 <- sapply(out,'[','p.value') %>% 
  ldply %>% setnames(.,Cs(contrast,p.value))
out2

##                 contrast    p.value
## 1    white_black.p.value 0.19893185
## 2 white_hispanic.p.value 0.20339570
## 3    white_other.p.value 0.16908779
## 4 black_hispanic.p.value 0.02970917
## 5    black_other.p.value 0.98853478
## 6 hispanic_other.p.value 0.01948833

#  multicomp package --------------
# install.packages('multicomp) # multiple comparisons
library(multcomp)
pw <- summary(glht(m, mcp(racec="Tukey")))  # Adjusted p-values and SEs
# might give a warning

summary(m) # black vs white, p=0.20 in regression model

## 
## Call:
## svyglm(formula = qn43.01 ~ racec, .des_m)
## 
## Survey design:
## update(.des_m, qn43.01 = ifelse(qn43 == 2, 0, 1))
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    0.35728    0.02089  17.099   <2e-16 ***
## racecblack    -0.04430    0.03392  -1.306    0.199    
## racechispanic  0.03925    0.03036   1.293    0.204    
## racecother    -0.04488    0.03204  -1.400    0.169    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 0.23162)
## 
## Number of Fisher Scoring iterations: 2

regTermTest(m,'racec')

## Wald test for racec
##  in svyglm(formula = qn43.01 ~ racec, .des_m)
## F =  2.754677  on  3  and  38  df: p= 0.055705

pw # black vs white, p=0.56 from multicomp

## 
##   Simultaneous Tests for General Linear Hypotheses
## 
## Multiple Comparisons of Means: Tukey Contrasts
## 
## 
## Fit: svyglm(formula = qn43.01 ~ racec, .des_m)
## 
## Linear Hypotheses:
##                         Estimate Std. Error z value Pr(>|z|)  
## black - white == 0    -0.0443024  0.0339159  -1.306   0.5556  
## hispanic - white == 0  0.0392540  0.0303565   1.293   0.5641  
## other - white == 0    -0.0448761  0.0320440  -1.400   0.4957  
## hispanic - black == 0  0.0835564  0.0370618   2.255   0.1077  
## other - black == 0    -0.0005737  0.0396747  -0.014   1.0000  
## other - hispanic == 0 -0.0841301  0.0345634  -2.434   0.0701 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## (Adjusted p values reported -- single-step method)

par(mar=c(5,8,5,0.5)) 
plot(pw)

# Regression --------------------------------------------------------------
# replace glm with svyglm
# replace data='data' with design='design'

# base functions
glm(bmi~sex + racec + age, data=d) %>% summary

## 
## Call:
## glm(formula = bmi ~ sex + racec + age, data = d)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -12.161   -3.328   -1.137    2.126   30.822  
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   20.57174    0.20184 101.923  < 2e-16 ***
## sex            0.22397    0.08915   2.512    0.012 *  
## racecblack     0.86943    0.11742   7.405 1.40e-13 ***
## racechispanic  0.52597    0.11193   4.699 2.64e-06 ***
## racecother    -0.24337    0.15253  -1.596    0.111    
## age            0.51462    0.03570  14.416  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 24.51479)
## 
##     Null deviance: 310397  on 12369  degrees of freedom
## Residual deviance: 303101  on 12364  degrees of freedom
##   (1213 observations deleted due to missingness)
## AIC: 74688
## 
## Number of Fisher Scoring iterations: 2

glm(bmi~sex * racec + age, data=d) %>% summary # same as next line

## 
## Call:
## glm(formula = bmi ~ sex * racec + age, data = d)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -11.596   -3.308   -1.098    2.104   30.926  
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       20.38394    0.20762  98.180  < 2e-16 ***
## sex                0.62313    0.13726   4.540 5.69e-06 ***
## racecblack         1.63125    0.16672   9.784  < 2e-16 ***
## racechispanic      0.63148    0.16021   3.942 8.14e-05 ***
## racecother        -0.15035    0.21281  -0.707    0.480    
## age                0.51086    0.03565  14.331  < 2e-16 ***
## sex:racecblack    -1.51211    0.23451  -6.448 1.18e-10 ***
## sex:racechispanic -0.20019    0.22352  -0.896    0.370    
## sex:racecother    -0.16228    0.30457  -0.533    0.594    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 24.43231)
## 
##     Null deviance: 310397  on 12369  degrees of freedom
## Residual deviance: 302008  on 12361  degrees of freedom
##   (1213 observations deleted due to missingness)
## AIC: 74649
## 
## Number of Fisher Scoring iterations: 2

glm(bmi~sex + racec + sex:racec + age, data=d) %>% summary

## 
## Call:
## glm(formula = bmi ~ sex + racec + sex:racec + age, data = d)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -11.596   -3.308   -1.098    2.104   30.926  
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       20.38394    0.20762  98.180  < 2e-16 ***
## sex                0.62313    0.13726   4.540 5.69e-06 ***
## racecblack         1.63125    0.16672   9.784  < 2e-16 ***
## racechispanic      0.63148    0.16021   3.942 8.14e-05 ***
## racecother        -0.15035    0.21281  -0.707    0.480    
## age                0.51086    0.03565  14.331  < 2e-16 ***
## sex:racecblack    -1.51211    0.23451  -6.448 1.18e-10 ***
## sex:racechispanic -0.20019    0.22352  -0.896    0.370    
## sex:racecother    -0.16228    0.30457  -0.533    0.594    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 24.43231)
## 
##     Null deviance: 310397  on 12369  degrees of freedom
## Residual deviance: 302008  on 12361  degrees of freedom
##   (1213 observations deleted due to missingness)
## AIC: 74649
## 
## Number of Fisher Scoring iterations: 2

# main effects and interaction

glm(bmi~sex * racec * age, data=d) %>% summary

## 
## Call:
## glm(formula = bmi ~ sex * racec * age, data = d)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -11.876   -3.309   -1.107    2.107   30.875  
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           20.49151    0.42431  48.294  < 2e-16 ***
## sex                    0.16332    0.59938   0.272 0.785258    
## racecblack             2.45186    0.71482   3.430 0.000605 ***
## racechispanic          0.23788    0.67050   0.355 0.722758    
## racecother             0.91216    0.90226   1.011 0.312049    
## age                    0.48986    0.08056   6.081 1.23e-09 ***
## sex:racecblack        -2.81257    1.00674  -2.794 0.005218 ** 
## sex:racechispanic      0.43379    0.94707   0.458 0.646938    
## sex:racecother        -0.30356    1.28178  -0.237 0.812797    
## sex:age                0.08759    0.11213   0.781 0.434722    
## racecblack:age        -0.15750    0.13440  -1.172 0.241263    
## racechispanic:age      0.07722    0.12763   0.605 0.545160    
## racecother:age        -0.21066    0.17307  -1.217 0.223546    
## sex:racecblack:age     0.24978    0.18780   1.330 0.183540    
## sex:racechispanic:age -0.12216    0.17845  -0.685 0.493649    
## sex:racecother:age     0.03182    0.24458   0.130 0.896476    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 24.42834)
## 
##     Null deviance: 310397  on 12369  degrees of freedom
## Residual deviance: 301788  on 12354  degrees of freedom
##   (1213 observations deleted due to missingness)
## AIC: 74654
## 
## Number of Fisher Scoring iterations: 2

# weighted regression
glm(bmi~sex*racec+age, weight=weight, data=d) %>% summary

## 
## Call:
## glm(formula = bmi ~ sex * racec + age, data = d, weights = weight)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -18.189   -2.866   -0.898    1.782   54.239  
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       20.14851    0.20115 100.165  < 2e-16 ***
## sex                0.64243    0.11673   5.503 3.80e-08 ***
## racecblack         1.61874    0.18517   8.742  < 2e-16 ***
## racechispanic      0.70072    0.15917   4.402 1.08e-05 ***
## racecother        -0.31687    0.22183  -1.428    0.153    
## age                0.56909    0.03583  15.883  < 2e-16 ***
## sex:racecblack    -1.67622    0.26445  -6.339 2.40e-10 ***
## sex:racechispanic -0.26861    0.22539  -1.192    0.233    
## sex:racecother    -0.22503    0.31929  -0.705    0.481    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 24.04883)
## 
##     Null deviance: 306507  on 12369  degrees of freedom
## Residual deviance: 297268  on 12361  degrees of freedom
##   (1213 observations deleted due to missingness)
## AIC: 78038
## 
## Number of Fisher Scoring iterations: 2

m <- svyglm(bmi~sex*racec + age, design=.des)
summary(m)

## 
## Call:
## svyglm(formula = bmi ~ sex * racec + age, design = .des)
## 
## Survey design:
## subset(.des, !is.na(sexc))
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       20.14851    0.34480  58.435  < 2e-16 ***
## sex                0.64243    0.18371   3.497  0.00137 ** 
## racecblack         1.61874    0.30757   5.263 8.49e-06 ***
## racechispanic      0.70072    0.25699   2.727  0.01017 *  
## racecother        -0.31687    0.26950  -1.176  0.24810    
## age                0.56909    0.04612  12.340 6.54e-14 ***
## sex:racecblack    -1.67622    0.32094  -5.223 9.56e-06 ***
## sex:racechispanic -0.26861    0.27224  -0.987  0.33097    
## sex:racecother    -0.22503    0.33284  -0.676  0.50369    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 24.03406)
## 
## Number of Fisher Scoring iterations: 2

regTermTest(m, 'sex:racec') # Chunk test

## Wald test for sex:racec
##  in svyglm(formula = bmi ~ sex * racec + age, design = .des)
## F =  9.105241  on  3  and  33  df: p= 0.00015543

# Logistic regression -----------------------------------------------------
# *Simple Logistic regression;
# proc rlogist data=one;
# NEST stratum psu / missunit;
# weight weight;
# class sex;
# model ql43 = sex ;
# run;

svytable(~sexc+ql43, Ntotal=100, design=.des)

##          ql43
## sexc             0        1
##   males   32.40422 17.80102
##   females 32.68711 17.10764

m <- svyglm(ql43~sexc, family=quasibinomial, design=.des); 
summary(m)

## 
## Call:
## svyglm(formula = ql43 ~ sexc, family = quasibinomial, design = .des)
## 
## Survey design:
## subset(.des, !is.na(sexc))
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.59903    0.06089  -9.838 3.09e-12 ***
## sexcfemales -0.04842    0.07084  -0.684    0.498    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for quasibinomial family taken to be 1.012319)
## 
## Number of Fisher Scoring iterations: 4

exp(coef(m))

## (Intercept) sexcfemales 
##   0.5493426   0.9527310

# *Multiple logistic regression;
# proc rlogist data=one;
# NEST stratum psu / missunit;
# weight weight;
# class sex grade race;
# model ql43 = sex grade race ;
# run;

m <- svyglm(ql43~sexc+factor(grade)+racec, family=quasibinomial, design=.des); 
exp(coef(m)) %>% round(2)

##    (Intercept)    sexcfemales factor(grade)2 factor(grade)3 factor(grade)4 
##           0.34           0.97           1.39           2.03           2.72 
##     racecblack  racechispanic     racecother 
##           0.75           1.10           0.77

# Relative Risk Regression - use Poisson regression -----------------------------------------

# *if possible, predicted marginals/Risk ratios from logistic regression;
# proc rlogist data=one;
# NEST stratum psu / missunit;
# weight weight;
# class sex grade race;
# model ql43 = sex grade race ;
# PREDMARG race (1) / ADJRR;
# run;


library(epitools)
(.table=svytable(~racec+ql43, .des) %>% round)

##           ql43
## racec         0    1
##   white    4421 2520
##   black    1166  489
##   hispanic 1569  941
##   other     740  319

or <- oddsratio(.table, method='wald')
or; names(or)

## $data
##           ql43
## racec         0    1 Total
##   white    4421 2520  6941
##   black    1166  489  1655
##   hispanic 1569  941  2510
##   other     740  319  1059
##   Total    7896 4269 12165
## 
## $measure
##           odds ratio with 95% C.I.
## racec       estimate     lower     upper
##   white    1.0000000        NA        NA
##   black    0.7357500 0.6549202 0.8265558
##   hispanic 1.0521718 0.9573269 1.1564132
##   other    0.7562736 0.6574101 0.8700045
## 
## $p.value
##           two-sided
## racec        midp.exact fisher.exact   chi.square
##   white              NA           NA           NA
##   black    1.694601e-07 1.763436e-07 2.215733e-07
##   hispanic 2.915234e-01 2.986148e-01 2.913325e-01
##   other    7.664817e-05 8.375555e-05 8.959863e-05
## 
## $correction
## [1] FALSE
## 
## attr(,"method")
## [1] "Unconditional MLE & normal approximation (Wald) CI"

## [1] "data"       "measure"    "p.value"    "correction"

oddsratio(.table, method='wald')$measure

##           odds ratio with 95% C.I.
## racec       estimate     lower     upper
##   white    1.0000000        NA        NA
##   black    0.7357500 0.6549202 0.8265558
##   hispanic 1.0521718 0.9573269 1.1564132
##   other    0.7562736 0.6574101 0.8700045

m <- svyglm(ql43~racec, family=quasibinomial, design=.des); exp(coef(m))

##   (Intercept)    racecblack racechispanic    racecother 
##     0.5701295     0.7352843     1.0521088     0.7547253

# m <- svyglm(I(ql43==1)~racec, family=quasibinomial, design=.des); exp(coef(m))

summary(m)

## 
## Call:
## svyglm(formula = ql43 ~ racec, family = quasibinomial, design = .des)
## 
## Survey design:
## subset(.des, !is.na(sexc))
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   -0.56189    0.07058  -7.961 1.28e-09 ***
## racecblack    -0.30750    0.11715  -2.625   0.0124 *  
## racechispanic  0.05080    0.11216   0.453   0.6532    
## racecother    -0.28140    0.11140  -2.526   0.0158 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for quasibinomial family taken to be 1.013566)
## 
## Number of Fisher Scoring iterations: 4

attributes(m)

## $names
##  [1] "coefficients"      "residuals"         "fitted.values"    
##  [4] "effects"           "R"                 "rank"             
##  [7] "qr"                "family"            "linear.predictors"
## [10] "deviance"          "aic"               "null.deviance"    
## [13] "iter"              "weights"           "prior.weights"    
## [16] "df.residual"       "df.null"           "y"                
## [19] "converged"         "boundary"          "model"            
## [22] "na.action"         "call"              "formula"          
## [25] "terms"             "data"              "offset"           
## [28] "control"           "method"            "contrasts"        
## [31] "xlevels"           "naive.cov"         "cov.unscaled"     
## [34] "survey.design"    
## 
## $class
## [1] "svyglm" "glm"    "lm"

# poisson regression 
# http://biostats.bepress.com/uwbiostat/paper293/
# PMID 12746247

.rr <- riskratio(.table, method='wald')$measure; .rr

##           risk ratio with 95% C.I.
## racec       estimate     lower     upper
##   white    1.0000000        NA        NA
##   black    0.8138275 0.7507640 0.8821883
##   hispanic 1.0326126 0.9731071 1.0957568
##   other    0.8296907 0.7530813 0.9140934

m <- svyglm(ql43~racec, family=quasipoisson, design=.des); m

## Stratified 1 - level Cluster Sampling design (with replacement)
## With (54) clusters.
## subset(.des, !is.na(sexc))
## 
## Call:  svyglm(formula = ql43 ~ racec, family = quasipoisson, design = .des)
## 
## Coefficients:
##   (Intercept)     racecblack  racechispanic     racecother  
##      -1.01305       -0.20644        0.03205       -0.18812  
## 
## Degrees of Freedom: 12003 Total (i.e. Null);  38 Residual
##   (1567 observations deleted due to missingness)
## Null Deviance:       8941 
## Residual Deviance: 8911  AIC: NA

exp(coef(m)); confint(m)

##   (Intercept)    racecblack racechispanic    racecother 
##     0.3631099     0.8134764     1.0325713     0.8285142

##                    2.5 %      97.5 %
## (Intercept)   -1.1011556 -0.92494408
## racecblack    -0.3613844 -0.05149239
## racechispanic -0.1063446  0.17044885
## racecother    -0.3364529 -0.03978978

x <- cbind(epitools=.rr[,1], quasipoisson=exp(coef(m))); x

##           epitools quasipoisson
## white    1.0000000    0.3631099
## black    0.8138275    0.8134764
## hispanic 1.0326126    1.0325713
## other    0.8296907    0.8285142

x[1,2] <- NA; x

##           epitools quasipoisson
## white    1.0000000           NA
## black    0.8138275    0.8134764
## hispanic 1.0326126    1.0325713
## other    0.8296907    0.8285142

# I think it estimates the incidence density ratio rather than the relative risk

mOR <- svyglm(ql43~racec + sexc + grade, family=quasibinomial, design=.des); 
mRR <- update(mOR, family =quasipoisson)
mOR

## Stratified 1 - level Cluster Sampling design (with replacement)
## With (54) clusters.
## subset(.des, !is.na(sexc))
## 
## Call:  svyglm(formula = ql43 ~ racec + sexc + grade, family = quasibinomial, 
##     design = .des)
## 
## Coefficients:
##   (Intercept)     racecblack  racechispanic     racecother    sexcfemales  
##      -1.40661       -0.28889        0.09085       -0.26130       -0.02899  
##         grade  
##       0.33820  
## 
## Degrees of Freedom: 11974 Total (i.e. Null);  36 Residual
##   (1596 observations deleted due to missingness)
## Null Deviance:       15730 
## Residual Deviance: 15300     AIC: NA

mRR

## Stratified 1 - level Cluster Sampling design (with replacement)
## With (54) clusters.
## subset(.des, !is.na(sexc))
## 
## Call:  svyglm(formula = ql43 ~ racec + sexc + grade, family = quasipoisson, 
##     design = .des)
## 
## Coefficients:
##   (Intercept)     racecblack  racechispanic     racecother    sexcfemales  
##      -1.56994       -0.18893        0.05626       -0.16975       -0.01819  
##         grade  
##       0.21619  
## 
## Degrees of Freedom: 11974 Total (i.e. Null);  36 Residual
##   (1596 observations deleted due to missingness)
## Null Deviance:       8923 
## Residual Deviance: 8646  AIC: NA

cbind(OR=coef(mOR), RR=coef(mRR)) %>% exp

##                      OR        RR
## (Intercept)   0.2449728 0.2080582
## racecblack    0.7490951 0.8278423
## racechispanic 1.0951047 1.0578743
## racecother    0.7700504 0.8438794
## sexcfemales   0.9714298 0.9819695
## grade         1.4024179 1.2413328

https://gist.github.com/tslumley/2e74cd0ac12a671d2724

Second, run these marginals through the svypredmeans function written by Dr. Thomas Lumley. “For any archaeology fans out there, this function emulates the PREDMARG statement in the ancient language of SUDAAN.”

HELP http://r4stats.com/books/r4sas-spss/R for SAS and SPSS Users 2011

Complex Surveys: A Guide to Analysis Using R (Lumley) http://goo.gl/x255s2

http://r.789695.n4.nabble.com/ http://stackoverflow.com/questions/tagged/r