setwd("C:/Users/maiam/Dropbox/PROFESSIONAL DEVELOPMENT/DATA SCIENCE/01_R/Analyzing Survey Data in R")

ce<-read.csv("ce.csv")
#head(ce)

Introduction to survey data

What are survey weights?

Visualizing the weights

# Load ggplot2
library(ggplot2)

## Warning: package 'ggplot2' was built under R version 3.4.4

library(tidyr)

## Warning: package 'tidyr' was built under R version 3.4.4

# Construct a histogram of the weights
ggplot(data = ce, mapping = aes(x = FINLWT21)) +
    geom_histogram()

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Specifying elements of the design in R

Designs in R

#install.packages("ggplot2")
library(ggplot2)
#install.packages("dplyr")
library(dplyr)

## Warning: package 'dplyr' was built under R version 3.4.4

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

#install.packages("survey")
library(survey)

## Warning: package 'survey' was built under R version 3.4.4

## Loading required package: grid

## Loading required package: Matrix

## 
## Attaching package: 'Matrix'

## The following object is masked from 'package:tidyr':
## 
##     expand

## Loading required package: survival

## 
## Attaching package: 'survey'

## The following object is masked from 'package:graphics':
## 
##     dotchart

Designs in R

data(api)

# Look at the apisrs dataset
glimpse(apisrs)

## Observations: 200
## Variables: 39
## $ cds      <chr> "15739081534155", "19642126066716", "30664493030640",...
## $ stype    <fct> H, E, H, E, E, E, M, E, E, E, E, H, M, E, E, E, M, M,...
## $ name     <chr> "McFarland High", "Stowers (Cecil ", "Brea-Olinda Hig...
## $ sname    <chr> "McFarland High", "Stowers (Cecil B.) Elementary", "B...
## $ snum     <dbl> 1039, 1124, 2868, 1273, 4926, 2463, 2031, 1736, 2142,...
## $ dname    <chr> "McFarland Unified", "ABC Unified", "Brea-Olinda Unif...
## $ dnum     <int> 432, 1, 79, 187, 640, 284, 401, 401, 470, 632, 401, 7...
## $ cname    <chr> "Kern", "Los Angeles", "Orange", "Los Angeles", "San ...
## $ cnum     <int> 14, 18, 29, 18, 39, 18, 18, 18, 18, 37, 18, 24, 14, 1...
## $ flag     <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ pcttest  <int> 98, 100, 98, 99, 99, 93, 98, 99, 100, 90, 95, 100, 97...
## $ api00    <int> 462, 878, 734, 772, 739, 835, 456, 506, 543, 649, 556...
## $ api99    <int> 448, 831, 742, 657, 719, 822, 472, 474, 458, 604, 575...
## $ target   <int> 18, NA, 3, 7, 4, NA, 16, 16, 17, 10, 11, 9, 14, 5, 15...
## $ growth   <int> 14, 47, -8, 115, 20, 13, -16, 32, 85, 45, -19, 51, 4,...
## $ sch.wide <fct> No, Yes, No, Yes, Yes, Yes, No, Yes, Yes, Yes, No, Ye...
## $ comp.imp <fct> Yes, Yes, No, Yes, Yes, Yes, No, Yes, Yes, No, No, Ye...
## $ both     <fct> No, Yes, No, Yes, Yes, Yes, No, Yes, Yes, No, No, Yes...
## $ awards   <fct> No, Yes, No, Yes, Yes, No, No, Yes, Yes, No, No, Yes,...
## $ meals    <int> 44, 8, 10, 70, 43, 16, 81, 98, 94, 85, 81, 67, 77, 20...
## $ ell      <int> 31, 25, 10, 25, 12, 19, 40, 65, 65, 57, 4, 25, 32, 16...
## $ yr.rnd   <fct> NA, NA, NA, NA, NA, NA, NA, No, NA, NA, NA, NA, NA, N...
## $ mobility <int> 6, 15, 7, 23, 12, 13, 22, 43, 15, 10, 20, 12, 4, 32, ...
## $ acs.k3   <int> NA, 19, NA, 23, 20, 19, NA, 18, 19, 16, 16, NA, NA, 1...
## $ acs.46   <int> NA, 30, NA, NA, 29, 29, 30, 29, 32, 25, 27, NA, NA, 2...
## $ acs.core <int> 24, NA, 28, NA, NA, NA, 27, NA, NA, 30, NA, 17, 27, N...
## $ pct.resp <int> 82, 97, 95, 100, 91, 71, 49, 75, 99, 49, 62, 96, 77, ...
## $ not.hsg  <int> 44, 4, 5, 37, 8, 1, 30, 49, 48, 23, 5, 44, 40, 4, 14,...
## $ hsg      <int> 34, 10, 9, 40, 21, 8, 27, 31, 34, 36, 38, 19, 34, 14,...
## $ some.col <int> 12, 23, 21, 14, 27, 20, 18, 15, 14, 14, 29, 17, 16, 2...
## $ col.grad <int> 7, 43, 41, 8, 34, 38, 22, 2, 4, 21, 24, 19, 8, 37, 10...
## $ grad.sch <int> 3, 21, 24, 1, 10, 34, 2, 3, 1, 6, 5, 2, 2, 19, 1, 3, ...
## $ avg.ed   <dbl> 1.91, 3.66, 3.71, 1.96, 3.17, 3.96, 2.39, 1.79, 1.77,...
## $ full     <int> 71, 90, 83, 85, 100, 75, 72, 69, 68, 81, 84, 100, 89,...
## $ emer     <int> 35, 10, 18, 18, 0, 20, 25, 22, 29, 7, 16, 0, 11, 5, 6...
## $ enroll   <int> 477, 478, 1410, 342, 217, 258, 1274, 566, 645, 311, 3...
## $ api.stu  <int> 429, 420, 1287, 291, 189, 211, 1090, 353, 563, 258, 2...
## $ pw       <dbl> 30.97, 30.97, 30.97, 30.97, 30.97, 30.97, 30.97, 30.9...
## $ fpc      <dbl> 6194, 6194, 6194, 6194, 6194, 6194, 6194, 6194, 6194,...

# Specify a simple random sampling for apisrs
apisrs_design <- svydesign(data = apisrs, weights = ~pw, fpc = ~fpc, id = ~1)

# Produce a summary of the design
summary(apisrs_design)

## Independent Sampling design
## svydesign(data = apisrs, weights = ~pw, fpc = ~fpc, id = ~1)
## Probabilities:
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.03229 0.03229 0.03229 0.03229 0.03229 0.03229 
## Population size (PSUs): 6194 
## Data variables:
##  [1] "cds"      "stype"    "name"     "sname"    "snum"     "dname"   
##  [7] "dnum"     "cname"    "cnum"     "flag"     "pcttest"  "api00"   
## [13] "api99"    "target"   "growth"   "sch.wide" "comp.imp" "both"    
## [19] "awards"   "meals"    "ell"      "yr.rnd"   "mobility" "acs.k3"  
## [25] "acs.46"   "acs.core" "pct.resp" "not.hsg"  "hsg"      "some.col"
## [31] "col.grad" "grad.sch" "avg.ed"   "full"     "emer"     "enroll"  
## [37] "api.stu"  "pw"       "fpc"

Stratified designs in R

# Glimpse the data
glimpse(apistrat)

## Observations: 200
## Variables: 39
## $ cds      <chr> "19647336097927", "19647336016018", "19648816021505",...
## $ stype    <fct> E, E, E, E, E, E, E, E, E, E, M, M, H, M, H, E, E, M,...
## $ name     <chr> "Open Magnet: Ce", "Belvedere Eleme", "Altadena Eleme...
## $ sname    <chr> "Open Magnet: Center for Individual (Char", "Belveder...
## $ snum     <dbl> 2077, 1622, 2236, 1921, 6140, 6077, 6071, 904, 4637, ...
## $ dname    <chr> "Los Angeles Unified", "Los Angeles Unified", "Pasade...
## $ dnum     <int> 401, 401, 541, 401, 460, 689, 689, 41, 702, 135, 590,...
## $ cname    <chr> "Los Angeles", "Los Angeles", "Los Angeles", "Los Ang...
## $ cnum     <int> 18, 18, 18, 18, 55, 55, 55, 14, 36, 36, 35, 32, 9, 1,...
## $ flag     <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ pcttest  <int> 99, 100, 99, 100, 100, 100, 99, 98, 100, 100, 99, 99,...
## $ api00    <int> 840, 516, 531, 501, 720, 805, 778, 731, 592, 669, 496...
## $ api99    <int> 816, 476, 544, 457, 659, 780, 787, 731, 508, 658, 479...
## $ target   <int> NA, 16, 13, 17, 7, 1, 1, 3, 15, 7, 16, 15, 17, 20, 13...
## $ growth   <int> 24, 40, -13, 44, 61, 25, -9, 0, 84, 11, 17, 6, 7, 3, ...
## $ sch.wide <fct> Yes, Yes, No, Yes, Yes, Yes, No, No, Yes, Yes, Yes, N...
## $ comp.imp <fct> No, Yes, No, Yes, Yes, Yes, No, No, Yes, No, No, No, ...
## $ both     <fct> No, Yes, No, Yes, Yes, Yes, No, No, Yes, No, No, No, ...
## $ awards   <fct> No, Yes, No, Yes, Yes, Yes, No, No, Yes, No, No, No, ...
## $ meals    <int> 33, 98, 64, 83, 26, 7, 9, 45, 75, 47, 69, 60, 66, 54,...
## $ ell      <int> 25, 77, 23, 63, 17, 0, 2, 2, 58, 23, 25, 10, 43, 26, ...
## $ yr.rnd   <fct> No, Yes, No, No, No, No, No, No, Yes, No, No, No, No,...
## $ mobility <int> 11, 26, 17, 13, 31, 12, 10, 15, 23, 19, 26, 22, 16, 4...
## $ acs.k3   <int> 20, 19, 20, 17, 20, 19, 19, 19, 20, 18, NA, NA, NA, N...
## $ acs.46   <int> 29, 28, 30, 30, 30, 29, 31, 31, 32, 29, 32, 32, NA, 3...
## $ acs.core <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 30, 32, 27, 2...
## $ pct.resp <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 87, 67, 50, 70, 71, ...
## $ not.hsg  <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 31, 49, 12, 20, 45, ...
## $ hsg      <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 34, 20, 33, 20, 36, ...
## $ some.col <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 22, 15, 23, 31, 11, ...
## $ col.grad <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10, 12, 29, 23, 8, 9...
## $ grad.sch <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 4, 3, 6, 0, 0, 11...
## $ avg.ed   <dbl> 3.32, 1.67, 2.34, 1.86, 3.17, 3.64, 3.55, 3.10, 2.17,...
## $ full     <int> 100, 57, 81, 64, 90, 95, 96, 93, 91, 96, 84, 65, 93, ...
## $ emer     <int> 0, 40, 26, 24, 7, 0, 0, 8, 14, 0, 18, 37, 17, 26, 19,...
## $ enroll   <int> 276, 841, 441, 298, 354, 330, 385, 583, 763, 381, 129...
## $ api.stu  <int> 241, 631, 415, 288, 319, 315, 363, 510, 652, 322, 103...
## $ pw       <dbl> 44.21, 44.21, 44.21, 44.21, 44.21, 44.21, 44.21, 44.2...
## $ fpc      <dbl> 4421, 4421, 4421, 4421, 4421, 4421, 4421, 4421, 4421,...

# Summarize strata sample sizes
apistrat %>%
  count(stype)

## # A tibble: 3 x 2
##   stype     n
##   <fct> <int>
## 1 E       100
## 2 H        50
## 3 M        50

# Specify the design
apistrat_design <- svydesign(data = apistrat, weights = ~pw, fpc = ~fpc, id = ~1, strata = ~stype)

# Look at the summary information stored in the design object
summary(apistrat_design)

## Stratified Independent Sampling design
## svydesign(data = apistrat, weights = ~pw, fpc = ~fpc, id = ~1, 
##     strata = ~stype)
## Probabilities:
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.02262 0.02262 0.03587 0.04014 0.05339 0.06623 
## Stratum Sizes: 
##              E  H  M
## obs        100 50 50
## design.PSU 100 50 50
## actual.PSU 100 50 50
## Population stratum sizes (PSUs): 
##    E    H    M 
## 4421  755 1018 
## Data variables:
##  [1] "cds"      "stype"    "name"     "sname"    "snum"     "dname"   
##  [7] "dnum"     "cname"    "cnum"     "flag"     "pcttest"  "api00"   
## [13] "api99"    "target"   "growth"   "sch.wide" "comp.imp" "both"    
## [19] "awards"   "meals"    "ell"      "yr.rnd"   "mobility" "acs.k3"  
## [25] "acs.46"   "acs.core" "pct.resp" "not.hsg"  "hsg"      "some.col"
## [31] "col.grad" "grad.sch" "avg.ed"   "full"     "emer"     "enroll"  
## [37] "api.stu"  "pw"       "fpc"

Cluster designs in R

# Glimpse the data
glimpse(apiclus2)

## Observations: 126
## Variables: 40
## $ cds      <chr> "31667796031017", "55751846054837", "41688746043517",...
## $ stype    <fct> E, E, E, M, E, E, E, E, M, H, E, M, E, E, E, E, H, E,...
## $ name     <chr> "Alta-Dutch Flat", "Tenaya Elementa", "Panorama Eleme...
## $ sname    <chr> "Alta-Dutch Flat Elementary", "Tenaya Elementary", "P...
## $ snum     <dbl> 3269, 5979, 4958, 4957, 4956, 4915, 2548, 2550, 2549,...
## $ dname    <chr> "Alta-Dutch Flat Elem", "Big Oak Flat-Grvlnd Unif", "...
## $ dnum     <int> 15, 63, 83, 83, 83, 117, 132, 132, 132, 152, 152, 152...
## $ cname    <chr> "Placer", "Tuolumne", "San Mateo", "San Mateo", "San ...
## $ cnum     <int> 30, 54, 40, 40, 40, 39, 19, 19, 19, 5, 5, 5, 36, 36, ...
## $ flag     <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ pcttest  <int> 100, 100, 98, 100, 98, 100, 100, 100, 100, 96, 98, 10...
## $ api00    <int> 821, 773, 600, 740, 716, 811, 472, 520, 568, 591, 544...
## $ api99    <int> 785, 718, 632, 740, 711, 779, 432, 494, 589, 585, 554...
## $ target   <int> 1, 4, 8, 3, 4, 1, 18, 15, 11, 11, 12, 11, NA, NA, NA,...
## $ growth   <int> 36, 55, -32, 0, 5, 32, 40, 26, -21, 6, -10, 29, 14, 2...
## $ sch.wide <fct> Yes, Yes, No, No, Yes, Yes, Yes, Yes, No, No, No, Yes...
## $ comp.imp <fct> Yes, Yes, No, No, Yes, Yes, Yes, Yes, No, No, No, Yes...
## $ both     <fct> Yes, Yes, No, No, Yes, Yes, Yes, Yes, No, No, No, Yes...
## $ awards   <fct> Yes, Yes, No, No, Yes, Yes, Yes, Yes, No, No, No, Yes...
## $ meals    <int> 27, 43, 33, 11, 5, 25, 78, 76, 68, 42, 63, 54, 0, 4, ...
## $ ell      <int> 0, 0, 5, 4, 2, 5, 38, 34, 34, 23, 42, 24, 3, 6, 2, 1,...
## $ yr.rnd   <fct> No, No, No, No, No, No, No, No, No, No, No, No, No, N...
## $ mobility <int> 14, 12, 9, 8, 6, 19, 13, 13, 15, 4, 15, 15, 24, 19, 1...
## $ acs.k3   <int> 17, 18, 19, NA, 18, 20, 19, 25, NA, NA, 20, NA, 19, 1...
## $ acs.46   <int> 20, 34, 29, 30, 28, 22, NA, 23, 24, NA, NA, 27, 27, 2...
## $ acs.core <int> NA, NA, NA, 24, NA, 31, NA, NA, 25, 21, NA, 18, NA, N...
## $ pct.resp <int> 89, 98, 79, 96, 98, 93, 100, 46, 91, 94, 93, 88, 90, ...
## $ not.hsg  <int> 4, 8, 8, 5, 3, 5, 48, 30, 63, 20, 29, 27, 0, 1, 0, 1,...
## $ hsg      <int> 16, 33, 28, 27, 14, 9, 32, 27, 16, 18, 32, 25, 0, 7, ...
## $ some.col <int> 53, 37, 30, 35, 22, 30, 15, 21, 13, 27, 26, 24, 4, 8,...
## $ col.grad <int> 21, 15, 32, 27, 58, 37, 4, 13, 6, 28, 7, 18, 51, 42, ...
## $ grad.sch <int> 6, 7, 1, 6, 3, 19, 1, 9, 2, 7, 6, 7, 44, 41, 100, 45,...
## $ avg.ed   <dbl> 3.07, 2.79, 2.90, 3.03, 3.44, 3.56, 1.77, 2.42, 1.68,...
## $ full     <int> 100, 100, 100, 82, 100, 94, 96, 86, 75, 100, 100, 97,...
## $ emer     <int> 0, 0, 0, 18, 8, 6, 8, 24, 21, 4, 4, 3, 0, 4, 0, 4, 28...
## $ enroll   <int> 152, 312, 173, 201, 147, 234, 184, 512, 543, 332, 217...
## $ api.stu  <int> 120, 270, 151, 179, 136, 189, 158, 419, 423, 303, 182...
## $ pw       <dbl> 18.925, 18.925, 18.925, 18.925, 18.925, 18.925, 18.92...
## $ fpc1     <dbl> 757, 757, 757, 757, 757, 757, 757, 757, 757, 757, 757...
## $ fpc2     <int> <array[25]>

# Specify the design
apiclus_design <- svydesign(id = ~dnum + snum, data = apiclus2, weights = ~pw, fpc = ~fpc1 + fpc2)

#Look at the summary information stored in the design object
summary(apiclus_design)

## 2 - level Cluster Sampling design
## With (40, 126) clusters.
## svydesign(id = ~dnum + snum, data = apiclus2, weights = ~pw, 
##     fpc = ~fpc1 + fpc2)
## Probabilities:
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## 0.003669 0.037743 0.052840 0.042390 0.052840 0.052840 
## Population size (PSUs): 757 
## Data variables:
##  [1] "cds"      "stype"    "name"     "sname"    "snum"     "dname"   
##  [7] "dnum"     "cname"    "cnum"     "flag"     "pcttest"  "api00"   
## [13] "api99"    "target"   "growth"   "sch.wide" "comp.imp" "both"    
## [19] "awards"   "meals"    "ell"      "yr.rnd"   "mobility" "acs.k3"  
## [25] "acs.46"   "acs.core" "pct.resp" "not.hsg"  "hsg"      "some.col"
## [31] "col.grad" "grad.sch" "avg.ed"   "full"     "emer"     "enroll"  
## [37] "api.stu"  "pw"       "fpc1"     "fpc2"

Comparing survey weights of different designs

data(api)
# Construct histogram of pw, for the simple random sample, apisrs
ggplot(data = apisrs,
       mapping = aes(x = pw)) + 
  geom_histogram()

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Construct histogram of pw, for the stratified sample, apistrat.
ggplot(data = apistrat,
       mapping = aes(x = pw)) + 
  geom_histogram()

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Construct histogram of pw, for the cluster sample
ggplot(data = apiclus2,
       mapping = aes(x = pw)) + 
  geom_histogram()

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Visualizing the impact of survey weights

NHANES weights

library(NHANES)

## Warning: package 'NHANES' was built under R version 3.4.4

data(NHANES)
#Create table of average survey weights by race
tab_weights <- NHANESraw %>%
  group_by(Race1) %>%
  summarize(avg_wt = mean(WTMEC2YR))

#Print the table
print(tab_weights)

## # A tibble: 5 x 2
##   Race1    avg_wt
##   <fct>     <dbl>
## 1 Black    16052.
## 2 Hispanic 17158.
## 3 Mexican  16432.
## 4 White    52473.
## 5 Other    20233.

Exploring categorical data

Visualizing a categorical variable

Summarizing a categorical variable

# Specify the NHANES design
NHANES_design <- svydesign(data = NHANESraw, strata = ~SDMVSTRA, id = ~SDMVPSU, nest = TRUE, weights = ~WTMEC2YR)

# Print summary of design
print(summary(NHANES_design))

## Stratified 1 - level Cluster Sampling design (with replacement)
## With (62) clusters.
## svydesign(data = NHANESraw, strata = ~SDMVSTRA, id = ~SDMVPSU, 
##     nest = TRUE, weights = ~WTMEC2YR)
## Probabilities:
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## 4.493e-06 2.832e-05 5.271e-05       Inf 8.605e-05       Inf 
## Stratum Sizes: 
##             75  76  77  78  79  80  81  82  83  84  85  86  87  88  89  90
## obs        803 785 823 829 696 751 696 724 713 683 592 946 598 647 251 862
## design.PSU   2   2   2   2   2   2   2   2   2   2   2   3   2   2   2   3
## actual.PSU   2   2   2   2   2   2   2   2   2   2   2   3   2   2   2   3
##             91  92  93  94  95  96  97  98  99 100 101 102 103
## obs        998 875 602 688 722 676 608 708 682 700 715 624 296
## design.PSU   3   3   2   2   2   2   2   2   2   2   2   2   2
## actual.PSU   3   3   2   2   2   2   2   2   2   2   2   2   2
## Data variables:
##  [1] "ID"               "SurveyYr"         "Gender"          
##  [4] "Age"              "AgeMonths"        "Race1"           
##  [7] "Race3"            "Education"        "MaritalStatus"   
## [10] "HHIncome"         "HHIncomeMid"      "Poverty"         
## [13] "HomeRooms"        "HomeOwn"          "Work"            
## [16] "Weight"           "Length"           "HeadCirc"        
## [19] "Height"           "BMI"              "BMICatUnder20yrs"
## [22] "BMI_WHO"          "Pulse"            "BPSysAve"        
## [25] "BPDiaAve"         "BPSys1"           "BPDia1"          
## [28] "BPSys2"           "BPDia2"           "BPSys3"          
## [31] "BPDia3"           "Testosterone"     "DirectChol"      
## [34] "TotChol"          "UrineVol1"        "UrineFlow1"      
## [37] "UrineVol2"        "UrineFlow2"       "Diabetes"        
## [40] "DiabetesAge"      "HealthGen"        "DaysPhysHlthBad" 
## [43] "DaysMentHlthBad"  "LittleInterest"   "Depressed"       
## [46] "nPregnancies"     "nBabies"          "Age1stBaby"      
## [49] "SleepHrsNight"    "SleepTrouble"     "PhysActive"      
## [52] "PhysActiveDays"   "TVHrsDay"         "CompHrsDay"      
## [55] "TVHrsDayChild"    "CompHrsDayChild"  "Alcohol12PlusYr" 
## [58] "AlcoholDay"       "AlcoholYear"      "SmokeNow"        
## [61] "Smoke100"         "SmokeAge"         "Marijuana"       
## [64] "AgeFirstMarij"    "RegularMarij"     "AgeRegMarij"     
## [67] "HardDrugs"        "SexEver"          "SexAge"          
## [70] "SexNumPartnLife"  "SexNumPartYear"   "SameSex"         
## [73] "SexOrientation"   "WTINT2YR"         "WTMEC2YR"        
## [76] "SDMVPSU"          "SDMVSTRA"         "PregnantNow"

Inference for quantitative data

Tying it all together!

# Specify the NHANES design
NHANES_design <- svydesign(data = NHANESraw, strata = ~SDMVSTRA, id = ~SDMVPSU, nest = TRUE, weights = ~WTMEC2YR)

# Print summary of design
summary(NHANES_design)

## Stratified 1 - level Cluster Sampling design (with replacement)
## With (62) clusters.
## svydesign(data = NHANESraw, strata = ~SDMVSTRA, id = ~SDMVPSU, 
##     nest = TRUE, weights = ~WTMEC2YR)
## Probabilities:
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## 4.493e-06 2.832e-05 5.271e-05       Inf 8.605e-05       Inf 
## Stratum Sizes: 
##             75  76  77  78  79  80  81  82  83  84  85  86  87  88  89  90
## obs        803 785 823 829 696 751 696 724 713 683 592 946 598 647 251 862
## design.PSU   2   2   2   2   2   2   2   2   2   2   2   3   2   2   2   3
## actual.PSU   2   2   2   2   2   2   2   2   2   2   2   3   2   2   2   3
##             91  92  93  94  95  96  97  98  99 100 101 102 103
## obs        998 875 602 688 722 676 608 708 682 700 715 624 296
## design.PSU   3   3   2   2   2   2   2   2   2   2   2   2   2
## actual.PSU   3   3   2   2   2   2   2   2   2   2   2   2   2
## Data variables:
##  [1] "ID"               "SurveyYr"         "Gender"          
##  [4] "Age"              "AgeMonths"        "Race1"           
##  [7] "Race3"            "Education"        "MaritalStatus"   
## [10] "HHIncome"         "HHIncomeMid"      "Poverty"         
## [13] "HomeRooms"        "HomeOwn"          "Work"            
## [16] "Weight"           "Length"           "HeadCirc"        
## [19] "Height"           "BMI"              "BMICatUnder20yrs"
## [22] "BMI_WHO"          "Pulse"            "BPSysAve"        
## [25] "BPDiaAve"         "BPSys1"           "BPDia1"          
## [28] "BPSys2"           "BPDia2"           "BPSys3"          
## [31] "BPDia3"           "Testosterone"     "DirectChol"      
## [34] "TotChol"          "UrineVol1"        "UrineFlow1"      
## [37] "UrineVol2"        "UrineFlow2"       "Diabetes"        
## [40] "DiabetesAge"      "HealthGen"        "DaysPhysHlthBad" 
## [43] "DaysMentHlthBad"  "LittleInterest"   "Depressed"       
## [46] "nPregnancies"     "nBabies"          "Age1stBaby"      
## [49] "SleepHrsNight"    "SleepTrouble"     "PhysActive"      
## [52] "PhysActiveDays"   "TVHrsDay"         "CompHrsDay"      
## [55] "TVHrsDayChild"    "CompHrsDayChild"  "Alcohol12PlusYr" 
## [58] "AlcoholDay"       "AlcoholYear"      "SmokeNow"        
## [61] "Smoke100"         "SmokeAge"         "Marijuana"       
## [64] "AgeFirstMarij"    "RegularMarij"     "AgeRegMarij"     
## [67] "HardDrugs"        "SexEver"          "SexAge"          
## [70] "SexNumPartnLife"  "SexNumPartYear"   "SameSex"         
## [73] "SexOrientation"   "WTINT2YR"         "WTMEC2YR"        
## [76] "SDMVPSU"          "SDMVSTRA"         "PregnantNow"

# Number of clusters
NHANESraw %>%
  summarize(n_clusters = n_distinct(SDMVSTRA, SDMVPSU))

## # A tibble: 1 x 1
##   n_clusters
##        <int>
## 1         62

# Sample sizes in clusters
NHANESraw %>%
  count(SDMVSTRA, SDMVPSU)

## # A tibble: 62 x 3
##    SDMVSTRA SDMVPSU     n
##       <int>   <int> <int>
##  1       75       1   379
##  2       75       2   424
##  3       76       1   419
##  4       76       2   366
##  5       77       1   441
##  6       77       2   382
##  7       78       1   378
##  8       78       2   451
##  9       79       1   349
## 10       79       2   347
## # ... with 52 more rows

Summarizing a categorical variable

# Specify the survey design
NHANESraw <- mutate(NHANESraw, WTMEC4YR = .5 * WTMEC2YR)
NHANES_design <- svydesign(data = NHANESraw, strata = ~SDMVSTRA, id = ~SDMVPSU, nest = TRUE, weights = ~WTMEC2YR)

# Determine the levels of Depressed
levels(NHANESraw$Depressed)

## [1] "None"    "Several" "Most"

# Construct a frequency table of Depressed
tab_w <- svytable(~Depressed, design = NHANES_design)

# Determine class of tab_w
class(tab_w)

## [1] "svytable" "xtabs"    "table"

# Display tab_w
tab_w

## Depressed
##      None   Several      Most 
## 317517218  65465017  25408883

Interpreting frequency tables

Graphing a categorical variable

# Add proportions to table
tab_w <- tab_w %>%
  as.data.frame() %>%
  mutate(Prop = Freq/sum(Freq))

# Create a barplot
ggplot(data = tab_w,
       mapping = aes(x = Depressed, y = Prop)) + 
  geom_col()

##Exploring two categorical variables

Creating contingency tables

# Construct and display a frequency table
tab_D <- svytable(~Depressed,
           design = NHANES_design)
tab_D

## Depressed
##      None   Several      Most 
## 317517218  65465017  25408883

# Construct and display a frequency table
tab_H <- svytable(~HealthGen,
           design = NHANES_design)
tab_H

## HealthGen
## Excellent     Vgood      Good      Fair      Poor 
##  55319909 154964339 174995171  63088061  11336968

# Construct and display a frequency table
tab_DH <- svytable(~Depressed+HealthGen,
           design = NHANES_design)
tab_DH

##          HealthGen
## Depressed Excellent     Vgood      Good      Fair      Poor
##   None     42654363 114974637 119840064  35381566   4649890
##   Several   3741242  16604989  27900937  14710210   2507639
##   Most      1127227   3711730   9397896   7871011   3301019

Building segments bar graphs

# Add conditional proportions to tab_DH
tab_DH_cond <- tab_DH %>%
    as.data.frame() %>%
    group_by(HealthGen) %>%
    mutate(n_HealthGen = sum(Freq), Prop_Depressed = Freq/n_HealthGen) %>%
    ungroup()
# Print tab_DH_cond
tab_DH_cond

## # A tibble: 15 x 5
##    Depressed HealthGen       Freq n_HealthGen Prop_Depressed
##    <fct>     <fct>          <dbl>       <dbl>          <dbl>
##  1 None      Excellent  42654363.   47522832.         0.898 
##  2 Several   Excellent   3741242.   47522832.         0.0787
##  3 Most      Excellent   1127227.   47522832.         0.0237
##  4 None      Vgood     114974637.  135291355.         0.850 
##  5 Several   Vgood      16604989.  135291355.         0.123 
##  6 Most      Vgood       3711730.  135291355.         0.0274
##  7 None      Good      119840064.  157138897.         0.763 
##  8 Several   Good       27900937.  157138897.         0.178 
##  9 Most      Good        9397896.  157138897.         0.0598
## 10 None      Fair       35381566.   57962787.         0.610 
## 11 Several   Fair       14710210.   57962787.         0.254 
## 12 Most      Fair        7871011.   57962787.         0.136 
## 13 None      Poor        4649890.   10458548.         0.445 
## 14 Several   Poor        2507639.   10458548.         0.240 
## 15 Most      Poor        3301019.   10458548.         0.316

# Create a segmented bar graph of the conditional proportions in tab_DH_cond
ggplot(data = tab_DH_cond,
       mapping = aes(x = HealthGen, y = Prop_Depressed, fill = Depressed)) + 
  geom_col() + 
  coord_flip()

###Summarizing with svytotal()

# Estimate the totals for combos of Depressed and HealthGen
tab_totals <- svytotal(x = ~interaction(Depressed, HealthGen),
                     design = NHANES_design,
                     na.rm = TRUE)

# Print table of totals
print(tab_totals)

##                                                        total      SE
## interaction(Depressed, HealthGen)None.Excellent     42654363 3112536
## interaction(Depressed, HealthGen)Several.Excellent   3741242  554395
## interaction(Depressed, HealthGen)Most.Excellent      1127227  279379
## interaction(Depressed, HealthGen)None.Vgood        114974637 5951612
## interaction(Depressed, HealthGen)Several.Vgood      16604989 1374040
## interaction(Depressed, HealthGen)Most.Vgood          3711730  539941
## interaction(Depressed, HealthGen)None.Good         119840064 6750136
## interaction(Depressed, HealthGen)Several.Good       27900937 1862155
## interaction(Depressed, HealthGen)Most.Good           9397896 1002210
## interaction(Depressed, HealthGen)None.Fair          35381566 2412614
## interaction(Depressed, HealthGen)Several.Fair       14710210  910727
## interaction(Depressed, HealthGen)Most.Fair           7871011  740513
## interaction(Depressed, HealthGen)None.Poor           4649890  503867
## interaction(Depressed, HealthGen)Several.Poor        2507639  336880
## interaction(Depressed, HealthGen)Most.Poor           3301019  390272

# Estimate the means for combos of Depressed and HealthGen
tab_means <- svymean(x = ~interaction(Depressed, HealthGen),
              design = NHANES_design,
              na.rm = TRUE)

# Print table of means
tab_means

##                                                         mean     SE
## interaction(Depressed, HealthGen)None.Excellent    0.1044492 0.0053
## interaction(Depressed, HealthGen)Several.Excellent 0.0091613 0.0014
## interaction(Depressed, HealthGen)Most.Excellent    0.0027603 0.0007
## interaction(Depressed, HealthGen)None.Vgood        0.2815422 0.0078
## interaction(Depressed, HealthGen)Several.Vgood     0.0406612 0.0028
## interaction(Depressed, HealthGen)Most.Vgood        0.0090890 0.0013
## interaction(Depressed, HealthGen)None.Good         0.2934563 0.0092
## interaction(Depressed, HealthGen)Several.Good      0.0683220 0.0033
## interaction(Depressed, HealthGen)Most.Good         0.0230129 0.0023
## interaction(Depressed, HealthGen)None.Fair         0.0866400 0.0047
## interaction(Depressed, HealthGen)Several.Fair      0.0360214 0.0026
## interaction(Depressed, HealthGen)Most.Fair         0.0192740 0.0019
## interaction(Depressed, HealthGen)None.Poor         0.0113863 0.0013
## interaction(Depressed, HealthGen)Several.Poor      0.0061405 0.0009
## interaction(Depressed, HealthGen)Most.Poor         0.0080833 0.0010

Inference for categorical variables

Running a chi squared test

# Run a chi square test between Depressed and HealthGen
svychisq(~Depressed+HealthGen, 
    design = NHANES_design, 
    statistic = "Chisq")

## 
##  Pearson's X^2: Rao & Scott adjustment
## 
## data:  svychisq(~Depressed + HealthGen, design = NHANES_design, statistic = "Chisq")
## X-squared = 1592.7, df = 8, p-value < 2.2e-16

Tying it all together!

# Construct a contingency table
tab <-svytable(~HomeOwn+Education, design = NHANES_design) 

# Add conditional proportion of levels of HomeOwn for each educational level
tab_df <- as.data.frame(tab) %>%
  group_by(Education) %>%
  mutate(n_Education = sum(Freq), Prop_HomeOwn = Freq/n_Education) %>%
  ungroup()

# Create a segmented bar graph
ggplot(data = tab_df,
       mapping = aes(x = Education, y = Prop_HomeOwn, fill = HomeOwn)) + 
  geom_col() + 
  coord_flip()

# Run a chi square test
svychisq(~HomeOwn+Education, 
    design = NHANES_design, 
    statistic = "Chisq")

## 
##  Pearson's X^2: Rao & Scott adjustment
## 
## data:  svychisq(~HomeOwn + Education, design = NHANES_design, statistic = "Chisq")
## X-squared = 531.78, df = 8, p-value = 2.669e-16

Exploring quantitative data

Summarizing quantitative data

Survey statistics

# Compute the survey-weighted mean
svymean(x = ~SleepHrsNight, 
        design = NHANES_design,
        na.rm = TRUE)

##                 mean     SE
## SleepHrsNight 6.9292 0.0166

# Compute the survey-weighted mean by Gender
svyby(formula = ~SleepHrsNight, 
      by = ~Gender, 
      design = NHANES_design, 
      FUN = svymean, 
      na.rm = TRUE, 
      keep.names = FALSE)

##   Gender SleepHrsNight         se
## 1 female      6.976103 0.02374684
## 2   male      6.879050 0.01953263

Estimating quantiles

# Compute the survey-weighted quantiles
svyquantile(x = ~SleepHrsNight, 
            design = NHANES_design, 
            na.rm = TRUE, 
            quantiles = c(0.01, 0.25, 0.5, 0.75,0.99))

##               0.01 0.25 0.5 0.75 0.99
## SleepHrsNight    4    6   7    8   10

# Compute the survey-weighted quantiles by Gender
svyby(formula = ~SleepHrsNight, 
      by = ~Gender, 
      design = NHANES_design, 
      FUN = svyquantile, 
      na.rm = TRUE, 
      quantiles = 0.5, 
      keep.rows = FALSE, 
      keep.var = FALSE)

##        Gender statistic
## female female         7
## male     male         7

Visualizing quantitative data

Bar plots of survey-weighted means

# Compute the survey-weighted mean by Gender
out <- svyby(formula = ~SleepHrsNight, 
             by = ~Gender, 
             design = NHANES_design, 
             FUN = svymean, 
             na.rm = TRUE, 
             keep.names = FALSE)
             
# Construct a bar plot of average sleep by gender
ggplot(data =out , mapping = aes(x=Gender, y=SleepHrsNight)) +
  geom_col() + 
  labs(y="Average Nightly Sleep")

out

##   Gender SleepHrsNight         se
## 1 female      6.976103 0.02374684
## 2   male      6.879050 0.01953263

select(NHANESraw, SleepHrsNight, Gender)

## # A tibble: 20,293 x 2
##    SleepHrsNight Gender
##            <int> <fct> 
##  1             4 male  
##  2            NA male  
##  3             8 male  
##  4            NA male  
##  5             4 female
##  6             4 male  
##  7             8 female
##  8            NA female
##  9            NA male  
## 10             6 male  
## # ... with 20,283 more rows

The most common response among surveyed males was 6.879 hours of sleep. ###Bar plots with error

# Add lower and upper columns to out
out_col <- mutate(out, 
                  lower = SleepHrsNight - 2*se, 
                  upper = SleepHrsNight + 2*se)

# Construct a bar plot of average sleep by gender with error bars
ggplot(data = out_col, 
       mapping = aes(x = Gender, y = SleepHrsNight, 
                     ymin = lower, ymax = upper)) +
  geom_col(fill = "gold") +
  labs(y = "Average Nightly Sleep") +
  geom_errorbar(width = 0.7)

Survey-weighted histograms

# Create a histogram with a set binwidth
ggplot(data = NHANESraw,
       mapping = aes(x=SleepHrsNight,weight=WTMEC4YR)) + 
  geom_histogram(binwidth = 1,
                 color = "white") +
  labs(x = "Hours of Sleep")

## Warning: Removed 7261 rows containing non-finite values (stat_bin).

# Create a histogram with a set binwidth
ggplot(data = NHANESraw,
       mapping = aes(x=SleepHrsNight,weight=WTMEC4YR)) + 
  geom_histogram(binwidth = 0.5,
                 color = "white") +
  labs(x = "Hours of Sleep")

## Warning: Removed 7261 rows containing non-finite values (stat_bin).

# Create a histogram with a set binwidth
ggplot(data = NHANESraw,
       mapping = aes(x=SleepHrsNight,weight=WTMEC4YR)) + 
  geom_histogram(binwidth = 2,
                 color = "white") +
  labs(x = "Hours of Sleep")

## Warning: Removed 7261 rows containing non-finite values (stat_bin).

Survey-weighted density plots

# Density plot of sleep faceted by gender
NHANESraw %>%
    filter(!is.na(SleepHrsNight), !is.na(Gender)) %>%
    group_by(Gender) %>%
    mutate(WTMEC4YR_std = WTMEC4YR/sum(WTMEC4YR)) %>%
    ggplot(mapping = aes(x = SleepHrsNight, weight = WTMEC4YR_std)) + 
        geom_density(bw = 0.6,  fill = "gold") +
        labs(x = "Hours of Sleep") + 
        facet_wrap(~Gender, labeller = "label_both")

##Inference for quantitative data ###Survey-weighted t-test

# Run a survey-weighted t-test
svyttest(formula = SleepHrsNight~Gender,
       design = NHANES_design)

## Warning in summary.glm(g): observations with zero weight not used for
## calculating dispersion

## Warning in summary.glm(glm.object): observations with zero weight not used
## for calculating dispersion

## 
##  Design-based t-test
## 
## data:  SleepHrsNight ~ Gender
## t = -3.4077, df = 32, p-value = 0.001785
## alternative hypothesis: true difference in mean is not equal to 0
## 95 percent confidence interval:
##  -0.15287218 -0.04123256
## sample estimates:
## difference in mean 
##        -0.09705237

Tying it all together!

# Find means of total cholesterol by whether or not active 
out <- svyby(formula =~TotChol,
           by = ~PhysActive, 
           design = NHANES_design,
           FUN = svymean, 
           na.rm = TRUE, 
           keep.names = FALSE)

# Construct a bar plot of means of total cholesterol by whether or not active 
ggplot(data = out,
       mapping = aes(x=PhysActive, y=TotChol)) +
  geom_col()

Modeling quantitative data

Visualization with scatter plots

Bubble plots

# Create dataset with only 20 year olds
NHANES20 <- filter(NHANESraw,
                Age == 20)

# Construct scatter plot
ggplot(data = NHANES20, 
       mapping = aes(x=Height, y=Weight)) + 
    geom_point(alpha = 0.3) + 
    guides(size = FALSE)

## Warning: Removed 4 rows containing missing values (geom_point).

# Construct bubble plot
ggplot(data = NHANES20, 
       mapping = aes(x=Height, y=Weight, size=WTMEC4YR)) + 
    geom_point(alpha = 0.3) + 
    guides(size = FALSE)

## Warning: Removed 4 rows containing missing values (geom_point).

Survey-weighted scatter plots

# Using the dataset, NHANES20, create a scatter plot where Height is mapped to #x, Weight is mapped to y, and WTMEC4YR to color. Don't include a legend.
# Construct a scatter plot
ggplot(data = NHANES20,
       mapping = aes(x= Height, y=Weight, color=WTMEC4YR)) + 
    geom_point() + 
    guides(color = FALSE)

## Warning: Removed 4 rows containing missing values (geom_point).

# Using the dataset, NHANES20, create a scatter plot where Height is mapped to #x, Weight is mapped to y, and WTMEC4YR to alpha. Don't include a legend.
# Construct a scatter plot
ggplot(data = NHANES20,
       mapping = aes(x= Height, y=Weight, alpha=WTMEC4YR)) + 
    geom_point() + 
    guides(alpha = FALSE)

## Warning: Removed 4 rows containing missing values (geom_point).

# Create a bubble plot of NHANES20. Map Height to x, Weight to y, WTMEC4YR to size, and Gender to color. Set alpha to 0.3 and include a legend for color but not size.

# Add gender to plot
ggplot(data = NHANES20,
       mapping = aes(x=Height, y=Weight, size=WTMEC4YR, color=Gender)) + 
    geom_point(alpha=0.3) + 
    guides(size = FALSE)

## Warning: Removed 4 rows containing missing values (geom_point).

# Create a scatter plot of NHANES20. Map Height to x, Weight to y, WTMEC4YR to alpha, and Gender to color. Include a legend for color but not for alpha.

# Add gender to plot
ggplot(data = NHANES20,
       mapping = aes(x=Height, y=Weight,alpha=WTMEC4YR, color=Gender)) + 
    geom_point() + 
    guides(alpha = FALSE)

## Warning: Removed 4 rows containing missing values (geom_point).

##Visualizing trends

Line of best fit

What’s the relationship between height and weight across all ages?

# Bubble plot with linear of best fit
ggplot(data = NHANESraw, mapping = aes(x = Height, y = Weight, size = WTMEC4YR)) + 
  geom_point(alpha = 0.1) + 
  guides(size = FALSE) + 
  geom_smooth(method = "lm", se = FALSE, mapping = aes(weight=WTMEC4YR))

## Warning: Removed 2279 rows containing non-finite values (stat_smooth).

## Warning: Removed 2279 rows containing missing values (geom_point).

#Create a bubble plot of Height (as x) and Weight (as y) in NHANESraw. Add a line of best fit using geom_smooth() with method = "lm" and mapping WTMEC4YR to weight
# Add quadratic curve and cubic curve
ggplot(data = NHANESraw, mapping = aes(x = Height, y = Weight, size = WTMEC4YR)) + 
  geom_point(alpha = 0.1) + 
  guides(size = FALSE) + 
  geom_smooth(method = "lm", se = FALSE, mapping = aes(weight = WTMEC4YR)) +
  geom_smooth(method = "lm", se = FALSE, mapping = aes(weight = WTMEC4YR), formula = y ~ poly(x, 2), color = "orange") +
  geom_smooth(method = "lm", se = FALSE, mapping = aes(weight = WTMEC4YR), formula = y ~ poly(x, 3), color = "red")

## Warning: Removed 2279 rows containing non-finite values (stat_smooth).

## Warning: Removed 2279 rows containing non-finite values (stat_smooth).

## Warning: Removed 2279 rows containing non-finite values (stat_smooth).

## Warning: Removed 2279 rows containing missing values (geom_point).

Nice trend curves! Notice that even with the non-linear curves, we are missing the trend for certain heights. This signifies that it is difficult to specify a global trend for these variables.

Trend lines

#To the given bubble plot, add trend lines with linetype = 2 so that the lines are dashed.
# Add non-survey-weighted trend lines to bubble plot
ggplot(data = NHANES20, mapping = aes(x = Height, y = Weight, size = WTMEC4YR, color = Gender)) + 
  geom_point(alpha = 0.1) + 
  guides(size = FALSE) + 
  geom_smooth(method = "lm", se = FALSE, linetype = 2)

## Warning: Removed 4 rows containing non-finite values (stat_smooth).

## Warning: Removed 4 rows containing missing values (geom_point).

#Now add the survey-weighted trend lines by mapping WTMEC4YR to weight.
# Add non-survey-weighted trend lines to bubble plot
ggplot(data = NHANES20, mapping = aes(x = Height, y = Weight, size = WTMEC4YR, color = Gender)) + 
  geom_point(alpha = 0.1) + 
  guides(size = FALSE) + 
  geom_smooth(method = "lm", se = FALSE, linetype = 2)

## Warning: Removed 4 rows containing non-finite values (stat_smooth).

## Warning: Removed 4 rows containing missing values (geom_point).

# Add survey-weighted trend lines
ggplot(data = NHANES20, mapping = aes(x = Height, y = Weight, size = WTMEC4YR, color = Gender)) + 
  geom_point(alpha = 0.1) + 
  guides(size = FALSE) + 
  geom_smooth(method = "lm", se = FALSE, linetype = 2) + 
  geom_smooth(method = "lm", se = FALSE, mapping = aes(weight=WTMEC4YR))

## Warning: Removed 4 rows containing non-finite values (stat_smooth).

## Warning: Removed 4 rows containing non-finite values (stat_smooth).

## Warning: Removed 4 rows containing missing values (geom_point).

##Modeling survey data

Regression model

# Subset survey design object to only include 20 year olds
NHANES20_design <- subset(NHANES_design, Age == 20)

# Build a linear regression model
mod <- svyglm(Weight ~ Height, design = NHANES20_design)

# Print summary of the model
summary(mod)

## 
## Call:
## svyglm(formula = Weight ~ Height, design = NHANES20_design)
## 
## Survey design:
## subset(NHANES_design, Age == 20)
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -67.2571    22.9836  -2.926  0.00674 ** 
## Height        0.8305     0.1368   6.072 1.51e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 326.6108)
## 
## Number of Fisher Scoring iterations: 2

More complex modeling

Multiple linear regression

# Build a linear regression model same slope
mod1 <- svyglm(Weight ~ Height+Gender, design = NHANES20_design)

# Print summary of the same slope model
summary(mod1)

## 
## Call:
## svyglm(formula = Weight ~ Height + Gender, design = NHANES20_design)
## 
## Survey design:
## subset(NHANES_design, Age == 20)
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -53.8665    22.7622  -2.366   0.0254 *  
## Height        0.7434     0.1391   5.346  1.2e-05 ***
## Gendermale    2.7207     3.2471   0.838   0.4095    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 325.3881)
## 
## Number of Fisher Scoring iterations: 2

Multiple linear regression

# Build a linear regression model different slopes
mod2 <- svyglm(Weight ~ Height*Gender, design = NHANES20_design)

# Print summary of the different slopes model
summary(mod2)

## 
## Call:
## svyglm(formula = Weight ~ Height * Gender, design = NHANES20_design)
## 
## Survey design:
## subset(NHANES_design, Age == 20)
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)   
## (Intercept)          9.5061    21.5357   0.441  0.66257   
## Height               0.3565     0.1269   2.809  0.00932 **
## Gendermale        -131.0884    41.9989  -3.121  0.00438 **
## Height:Gendermale    0.7897     0.2385   3.311  0.00273 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 316.5007)
## 
## Number of Fisher Scoring iterations: 2

# Plot BPDiaAve and BPSysAve by Diabetes and include trend lines
drop_na(NHANESraw, Diabetes) %>%
ggplot(mapping = aes(x=BPDiaAve, y=BPSysAve, size=WTMEC4YR, color=Diabetes)) + 
    geom_point(alpha = 0.2) +  
    guides(size = FALSE) + 
    geom_smooth(method="lm", se = FALSE, mapping = aes(weight=WTMEC4YR))

## Warning: Removed 4600 rows containing non-finite values (stat_smooth).

## Warning: Removed 4600 rows containing missing values (geom_point).

# Build simple linear regression model
mod1 <- svyglm(BPSysAve ~ BPDiaAve, design = NHANES_design)

Tying it all together

# Plot BPDiaAve and BPSysAve by Diabetes and include trend lines
drop_na(NHANESraw, Diabetes) %>%
ggplot(mapping = aes(x = BPDiaAve, y = BPSysAve, size = WTMEC4YR, color = Diabetes)) + 
    geom_point(alpha = 0.2) + 
    guides(size = FALSE) + 
    geom_smooth(method = "lm", se = FALSE, mapping = aes(weight = WTMEC4YR))

## Warning: Removed 4600 rows containing non-finite values (stat_smooth).

## Warning: Removed 4600 rows containing missing values (geom_point).

# Build simple linear regression model
mod1 <- svyglm(BPSysAve ~ BPDiaAve, design = NHANES_design)

# Build model with different slopes
mod2 <- svyglm(BPSysAve ~ BPDiaAve * Diabetes, design = NHANES_design)

# Summarize models
summary(mod1)

## 
## Call:
## svyglm(formula = BPSysAve ~ BPDiaAve, design = NHANES_design)
## 
## Survey design:
## svydesign(data = NHANESraw, strata = ~SDMVSTRA, id = ~SDMVPSU, 
##     nest = TRUE, weights = ~WTMEC2YR)
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 85.74311    1.86920   45.87   <2e-16 ***
## BPDiaAve     0.48150    0.02354   20.45   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 290.3472)
## 
## Number of Fisher Scoring iterations: 2

summary(mod2)

## 
## Call:
## svyglm(formula = BPSysAve ~ BPDiaAve * Diabetes, design = NHANES_design)
## 
## Survey design:
## svydesign(data = NHANESraw, strata = ~SDMVSTRA, id = ~SDMVPSU, 
##     nest = TRUE, weights = ~WTMEC2YR)
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          83.58652    2.05537  40.667  < 2e-16 ***
## BPDiaAve              0.49964    0.02623  19.047  < 2e-16 ***
## DiabetesYes          25.36616    3.56587   7.114 6.53e-08 ***
## BPDiaAve:DiabetesYes -0.22132    0.05120  -4.323 0.000156 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 279.1637)
## 
## Number of Fisher Scoring iterations: 2

Analyzing Survey Data in R

Manuela da Cruz Chadreque

1 de Abril de 2019

Introduction to survey data

What are survey weights?

Visualizing the weights

Specifying elements of the design in R

Designs in R

Designs in R

Stratified designs in R

Cluster designs in R

Comparing survey weights of different designs

Visualizing the impact of survey weights

NHANES weights

Exploring categorical data

Visualizing a categorical variable

Summarizing a categorical variable

Inference for quantitative data

Tying it all together!

Summarizing a categorical variable

Interpreting frequency tables

Graphing a categorical variable

Creating contingency tables

Building segments bar graphs

Inference for categorical variables

Running a chi squared test

Tying it all together!

Exploring quantitative data

Summarizing quantitative data

Survey statistics

Estimating quantiles

Visualizing quantitative data

Bar plots of survey-weighted means

Survey-weighted histograms

Survey-weighted density plots

Tying it all together!

Modeling quantitative data

Visualization with scatter plots

Bubble plots

Survey-weighted scatter plots

Line of best fit

Trend lines

Regression model

More complex modeling

Multiple linear regression

Multiple linear regression

Tying it all together