setwd("C:/Users/maiam/Dropbox/PROFESSIONAL DEVELOPMENT/DATA SCIENCE/01_R/Analyzing Survey Data in R")
ce<-read.csv("ce.csv")
#head(ce)
# Load ggplot2
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.4.4
library(tidyr)
## Warning: package 'tidyr' was built under R version 3.4.4
# Construct a histogram of the weights
ggplot(data = ce, mapping = aes(x = FINLWT21)) +
geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
#install.packages("ggplot2")
library(ggplot2)
#install.packages("dplyr")
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.4.4
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
#install.packages("survey")
library(survey)
## Warning: package 'survey' was built under R version 3.4.4
## Loading required package: grid
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following object is masked from 'package:tidyr':
##
## expand
## Loading required package: survival
##
## Attaching package: 'survey'
## The following object is masked from 'package:graphics':
##
## dotchart
data(api)
# Look at the apisrs dataset
glimpse(apisrs)
## Observations: 200
## Variables: 39
## $ cds <chr> "15739081534155", "19642126066716", "30664493030640",...
## $ stype <fct> H, E, H, E, E, E, M, E, E, E, E, H, M, E, E, E, M, M,...
## $ name <chr> "McFarland High", "Stowers (Cecil ", "Brea-Olinda Hig...
## $ sname <chr> "McFarland High", "Stowers (Cecil B.) Elementary", "B...
## $ snum <dbl> 1039, 1124, 2868, 1273, 4926, 2463, 2031, 1736, 2142,...
## $ dname <chr> "McFarland Unified", "ABC Unified", "Brea-Olinda Unif...
## $ dnum <int> 432, 1, 79, 187, 640, 284, 401, 401, 470, 632, 401, 7...
## $ cname <chr> "Kern", "Los Angeles", "Orange", "Los Angeles", "San ...
## $ cnum <int> 14, 18, 29, 18, 39, 18, 18, 18, 18, 37, 18, 24, 14, 1...
## $ flag <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ pcttest <int> 98, 100, 98, 99, 99, 93, 98, 99, 100, 90, 95, 100, 97...
## $ api00 <int> 462, 878, 734, 772, 739, 835, 456, 506, 543, 649, 556...
## $ api99 <int> 448, 831, 742, 657, 719, 822, 472, 474, 458, 604, 575...
## $ target <int> 18, NA, 3, 7, 4, NA, 16, 16, 17, 10, 11, 9, 14, 5, 15...
## $ growth <int> 14, 47, -8, 115, 20, 13, -16, 32, 85, 45, -19, 51, 4,...
## $ sch.wide <fct> No, Yes, No, Yes, Yes, Yes, No, Yes, Yes, Yes, No, Ye...
## $ comp.imp <fct> Yes, Yes, No, Yes, Yes, Yes, No, Yes, Yes, No, No, Ye...
## $ both <fct> No, Yes, No, Yes, Yes, Yes, No, Yes, Yes, No, No, Yes...
## $ awards <fct> No, Yes, No, Yes, Yes, No, No, Yes, Yes, No, No, Yes,...
## $ meals <int> 44, 8, 10, 70, 43, 16, 81, 98, 94, 85, 81, 67, 77, 20...
## $ ell <int> 31, 25, 10, 25, 12, 19, 40, 65, 65, 57, 4, 25, 32, 16...
## $ yr.rnd <fct> NA, NA, NA, NA, NA, NA, NA, No, NA, NA, NA, NA, NA, N...
## $ mobility <int> 6, 15, 7, 23, 12, 13, 22, 43, 15, 10, 20, 12, 4, 32, ...
## $ acs.k3 <int> NA, 19, NA, 23, 20, 19, NA, 18, 19, 16, 16, NA, NA, 1...
## $ acs.46 <int> NA, 30, NA, NA, 29, 29, 30, 29, 32, 25, 27, NA, NA, 2...
## $ acs.core <int> 24, NA, 28, NA, NA, NA, 27, NA, NA, 30, NA, 17, 27, N...
## $ pct.resp <int> 82, 97, 95, 100, 91, 71, 49, 75, 99, 49, 62, 96, 77, ...
## $ not.hsg <int> 44, 4, 5, 37, 8, 1, 30, 49, 48, 23, 5, 44, 40, 4, 14,...
## $ hsg <int> 34, 10, 9, 40, 21, 8, 27, 31, 34, 36, 38, 19, 34, 14,...
## $ some.col <int> 12, 23, 21, 14, 27, 20, 18, 15, 14, 14, 29, 17, 16, 2...
## $ col.grad <int> 7, 43, 41, 8, 34, 38, 22, 2, 4, 21, 24, 19, 8, 37, 10...
## $ grad.sch <int> 3, 21, 24, 1, 10, 34, 2, 3, 1, 6, 5, 2, 2, 19, 1, 3, ...
## $ avg.ed <dbl> 1.91, 3.66, 3.71, 1.96, 3.17, 3.96, 2.39, 1.79, 1.77,...
## $ full <int> 71, 90, 83, 85, 100, 75, 72, 69, 68, 81, 84, 100, 89,...
## $ emer <int> 35, 10, 18, 18, 0, 20, 25, 22, 29, 7, 16, 0, 11, 5, 6...
## $ enroll <int> 477, 478, 1410, 342, 217, 258, 1274, 566, 645, 311, 3...
## $ api.stu <int> 429, 420, 1287, 291, 189, 211, 1090, 353, 563, 258, 2...
## $ pw <dbl> 30.97, 30.97, 30.97, 30.97, 30.97, 30.97, 30.97, 30.9...
## $ fpc <dbl> 6194, 6194, 6194, 6194, 6194, 6194, 6194, 6194, 6194,...
# Specify a simple random sampling for apisrs
apisrs_design <- svydesign(data = apisrs, weights = ~pw, fpc = ~fpc, id = ~1)
# Produce a summary of the design
summary(apisrs_design)
## Independent Sampling design
## svydesign(data = apisrs, weights = ~pw, fpc = ~fpc, id = ~1)
## Probabilities:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.03229 0.03229 0.03229 0.03229 0.03229 0.03229
## Population size (PSUs): 6194
## Data variables:
## [1] "cds" "stype" "name" "sname" "snum" "dname"
## [7] "dnum" "cname" "cnum" "flag" "pcttest" "api00"
## [13] "api99" "target" "growth" "sch.wide" "comp.imp" "both"
## [19] "awards" "meals" "ell" "yr.rnd" "mobility" "acs.k3"
## [25] "acs.46" "acs.core" "pct.resp" "not.hsg" "hsg" "some.col"
## [31] "col.grad" "grad.sch" "avg.ed" "full" "emer" "enroll"
## [37] "api.stu" "pw" "fpc"
# Glimpse the data
glimpse(apistrat)
## Observations: 200
## Variables: 39
## $ cds <chr> "19647336097927", "19647336016018", "19648816021505",...
## $ stype <fct> E, E, E, E, E, E, E, E, E, E, M, M, H, M, H, E, E, M,...
## $ name <chr> "Open Magnet: Ce", "Belvedere Eleme", "Altadena Eleme...
## $ sname <chr> "Open Magnet: Center for Individual (Char", "Belveder...
## $ snum <dbl> 2077, 1622, 2236, 1921, 6140, 6077, 6071, 904, 4637, ...
## $ dname <chr> "Los Angeles Unified", "Los Angeles Unified", "Pasade...
## $ dnum <int> 401, 401, 541, 401, 460, 689, 689, 41, 702, 135, 590,...
## $ cname <chr> "Los Angeles", "Los Angeles", "Los Angeles", "Los Ang...
## $ cnum <int> 18, 18, 18, 18, 55, 55, 55, 14, 36, 36, 35, 32, 9, 1,...
## $ flag <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ pcttest <int> 99, 100, 99, 100, 100, 100, 99, 98, 100, 100, 99, 99,...
## $ api00 <int> 840, 516, 531, 501, 720, 805, 778, 731, 592, 669, 496...
## $ api99 <int> 816, 476, 544, 457, 659, 780, 787, 731, 508, 658, 479...
## $ target <int> NA, 16, 13, 17, 7, 1, 1, 3, 15, 7, 16, 15, 17, 20, 13...
## $ growth <int> 24, 40, -13, 44, 61, 25, -9, 0, 84, 11, 17, 6, 7, 3, ...
## $ sch.wide <fct> Yes, Yes, No, Yes, Yes, Yes, No, No, Yes, Yes, Yes, N...
## $ comp.imp <fct> No, Yes, No, Yes, Yes, Yes, No, No, Yes, No, No, No, ...
## $ both <fct> No, Yes, No, Yes, Yes, Yes, No, No, Yes, No, No, No, ...
## $ awards <fct> No, Yes, No, Yes, Yes, Yes, No, No, Yes, No, No, No, ...
## $ meals <int> 33, 98, 64, 83, 26, 7, 9, 45, 75, 47, 69, 60, 66, 54,...
## $ ell <int> 25, 77, 23, 63, 17, 0, 2, 2, 58, 23, 25, 10, 43, 26, ...
## $ yr.rnd <fct> No, Yes, No, No, No, No, No, No, Yes, No, No, No, No,...
## $ mobility <int> 11, 26, 17, 13, 31, 12, 10, 15, 23, 19, 26, 22, 16, 4...
## $ acs.k3 <int> 20, 19, 20, 17, 20, 19, 19, 19, 20, 18, NA, NA, NA, N...
## $ acs.46 <int> 29, 28, 30, 30, 30, 29, 31, 31, 32, 29, 32, 32, NA, 3...
## $ acs.core <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 30, 32, 27, 2...
## $ pct.resp <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 87, 67, 50, 70, 71, ...
## $ not.hsg <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 31, 49, 12, 20, 45, ...
## $ hsg <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 34, 20, 33, 20, 36, ...
## $ some.col <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 22, 15, 23, 31, 11, ...
## $ col.grad <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10, 12, 29, 23, 8, 9...
## $ grad.sch <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 4, 3, 6, 0, 0, 11...
## $ avg.ed <dbl> 3.32, 1.67, 2.34, 1.86, 3.17, 3.64, 3.55, 3.10, 2.17,...
## $ full <int> 100, 57, 81, 64, 90, 95, 96, 93, 91, 96, 84, 65, 93, ...
## $ emer <int> 0, 40, 26, 24, 7, 0, 0, 8, 14, 0, 18, 37, 17, 26, 19,...
## $ enroll <int> 276, 841, 441, 298, 354, 330, 385, 583, 763, 381, 129...
## $ api.stu <int> 241, 631, 415, 288, 319, 315, 363, 510, 652, 322, 103...
## $ pw <dbl> 44.21, 44.21, 44.21, 44.21, 44.21, 44.21, 44.21, 44.2...
## $ fpc <dbl> 4421, 4421, 4421, 4421, 4421, 4421, 4421, 4421, 4421,...
# Summarize strata sample sizes
apistrat %>%
count(stype)
## # A tibble: 3 x 2
## stype n
## <fct> <int>
## 1 E 100
## 2 H 50
## 3 M 50
# Specify the design
apistrat_design <- svydesign(data = apistrat, weights = ~pw, fpc = ~fpc, id = ~1, strata = ~stype)
# Look at the summary information stored in the design object
summary(apistrat_design)
## Stratified Independent Sampling design
## svydesign(data = apistrat, weights = ~pw, fpc = ~fpc, id = ~1,
## strata = ~stype)
## Probabilities:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.02262 0.02262 0.03587 0.04014 0.05339 0.06623
## Stratum Sizes:
## E H M
## obs 100 50 50
## design.PSU 100 50 50
## actual.PSU 100 50 50
## Population stratum sizes (PSUs):
## E H M
## 4421 755 1018
## Data variables:
## [1] "cds" "stype" "name" "sname" "snum" "dname"
## [7] "dnum" "cname" "cnum" "flag" "pcttest" "api00"
## [13] "api99" "target" "growth" "sch.wide" "comp.imp" "both"
## [19] "awards" "meals" "ell" "yr.rnd" "mobility" "acs.k3"
## [25] "acs.46" "acs.core" "pct.resp" "not.hsg" "hsg" "some.col"
## [31] "col.grad" "grad.sch" "avg.ed" "full" "emer" "enroll"
## [37] "api.stu" "pw" "fpc"
# Glimpse the data
glimpse(apiclus2)
## Observations: 126
## Variables: 40
## $ cds <chr> "31667796031017", "55751846054837", "41688746043517",...
## $ stype <fct> E, E, E, M, E, E, E, E, M, H, E, M, E, E, E, E, H, E,...
## $ name <chr> "Alta-Dutch Flat", "Tenaya Elementa", "Panorama Eleme...
## $ sname <chr> "Alta-Dutch Flat Elementary", "Tenaya Elementary", "P...
## $ snum <dbl> 3269, 5979, 4958, 4957, 4956, 4915, 2548, 2550, 2549,...
## $ dname <chr> "Alta-Dutch Flat Elem", "Big Oak Flat-Grvlnd Unif", "...
## $ dnum <int> 15, 63, 83, 83, 83, 117, 132, 132, 132, 152, 152, 152...
## $ cname <chr> "Placer", "Tuolumne", "San Mateo", "San Mateo", "San ...
## $ cnum <int> 30, 54, 40, 40, 40, 39, 19, 19, 19, 5, 5, 5, 36, 36, ...
## $ flag <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ pcttest <int> 100, 100, 98, 100, 98, 100, 100, 100, 100, 96, 98, 10...
## $ api00 <int> 821, 773, 600, 740, 716, 811, 472, 520, 568, 591, 544...
## $ api99 <int> 785, 718, 632, 740, 711, 779, 432, 494, 589, 585, 554...
## $ target <int> 1, 4, 8, 3, 4, 1, 18, 15, 11, 11, 12, 11, NA, NA, NA,...
## $ growth <int> 36, 55, -32, 0, 5, 32, 40, 26, -21, 6, -10, 29, 14, 2...
## $ sch.wide <fct> Yes, Yes, No, No, Yes, Yes, Yes, Yes, No, No, No, Yes...
## $ comp.imp <fct> Yes, Yes, No, No, Yes, Yes, Yes, Yes, No, No, No, Yes...
## $ both <fct> Yes, Yes, No, No, Yes, Yes, Yes, Yes, No, No, No, Yes...
## $ awards <fct> Yes, Yes, No, No, Yes, Yes, Yes, Yes, No, No, No, Yes...
## $ meals <int> 27, 43, 33, 11, 5, 25, 78, 76, 68, 42, 63, 54, 0, 4, ...
## $ ell <int> 0, 0, 5, 4, 2, 5, 38, 34, 34, 23, 42, 24, 3, 6, 2, 1,...
## $ yr.rnd <fct> No, No, No, No, No, No, No, No, No, No, No, No, No, N...
## $ mobility <int> 14, 12, 9, 8, 6, 19, 13, 13, 15, 4, 15, 15, 24, 19, 1...
## $ acs.k3 <int> 17, 18, 19, NA, 18, 20, 19, 25, NA, NA, 20, NA, 19, 1...
## $ acs.46 <int> 20, 34, 29, 30, 28, 22, NA, 23, 24, NA, NA, 27, 27, 2...
## $ acs.core <int> NA, NA, NA, 24, NA, 31, NA, NA, 25, 21, NA, 18, NA, N...
## $ pct.resp <int> 89, 98, 79, 96, 98, 93, 100, 46, 91, 94, 93, 88, 90, ...
## $ not.hsg <int> 4, 8, 8, 5, 3, 5, 48, 30, 63, 20, 29, 27, 0, 1, 0, 1,...
## $ hsg <int> 16, 33, 28, 27, 14, 9, 32, 27, 16, 18, 32, 25, 0, 7, ...
## $ some.col <int> 53, 37, 30, 35, 22, 30, 15, 21, 13, 27, 26, 24, 4, 8,...
## $ col.grad <int> 21, 15, 32, 27, 58, 37, 4, 13, 6, 28, 7, 18, 51, 42, ...
## $ grad.sch <int> 6, 7, 1, 6, 3, 19, 1, 9, 2, 7, 6, 7, 44, 41, 100, 45,...
## $ avg.ed <dbl> 3.07, 2.79, 2.90, 3.03, 3.44, 3.56, 1.77, 2.42, 1.68,...
## $ full <int> 100, 100, 100, 82, 100, 94, 96, 86, 75, 100, 100, 97,...
## $ emer <int> 0, 0, 0, 18, 8, 6, 8, 24, 21, 4, 4, 3, 0, 4, 0, 4, 28...
## $ enroll <int> 152, 312, 173, 201, 147, 234, 184, 512, 543, 332, 217...
## $ api.stu <int> 120, 270, 151, 179, 136, 189, 158, 419, 423, 303, 182...
## $ pw <dbl> 18.925, 18.925, 18.925, 18.925, 18.925, 18.925, 18.92...
## $ fpc1 <dbl> 757, 757, 757, 757, 757, 757, 757, 757, 757, 757, 757...
## $ fpc2 <int> <array[25]>
# Specify the design
apiclus_design <- svydesign(id = ~dnum + snum, data = apiclus2, weights = ~pw, fpc = ~fpc1 + fpc2)
#Look at the summary information stored in the design object
summary(apiclus_design)
## 2 - level Cluster Sampling design
## With (40, 126) clusters.
## svydesign(id = ~dnum + snum, data = apiclus2, weights = ~pw,
## fpc = ~fpc1 + fpc2)
## Probabilities:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.003669 0.037743 0.052840 0.042390 0.052840 0.052840
## Population size (PSUs): 757
## Data variables:
## [1] "cds" "stype" "name" "sname" "snum" "dname"
## [7] "dnum" "cname" "cnum" "flag" "pcttest" "api00"
## [13] "api99" "target" "growth" "sch.wide" "comp.imp" "both"
## [19] "awards" "meals" "ell" "yr.rnd" "mobility" "acs.k3"
## [25] "acs.46" "acs.core" "pct.resp" "not.hsg" "hsg" "some.col"
## [31] "col.grad" "grad.sch" "avg.ed" "full" "emer" "enroll"
## [37] "api.stu" "pw" "fpc1" "fpc2"
data(api)
# Construct histogram of pw, for the simple random sample, apisrs
ggplot(data = apisrs,
mapping = aes(x = pw)) +
geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Construct histogram of pw, for the stratified sample, apistrat.
ggplot(data = apistrat,
mapping = aes(x = pw)) +
geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Construct histogram of pw, for the cluster sample
ggplot(data = apiclus2,
mapping = aes(x = pw)) +
geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
library(NHANES)
## Warning: package 'NHANES' was built under R version 3.4.4
data(NHANES)
#Create table of average survey weights by race
tab_weights <- NHANESraw %>%
group_by(Race1) %>%
summarize(avg_wt = mean(WTMEC2YR))
#Print the table
print(tab_weights)
## # A tibble: 5 x 2
## Race1 avg_wt
## <fct> <dbl>
## 1 Black 16052.
## 2 Hispanic 17158.
## 3 Mexican 16432.
## 4 White 52473.
## 5 Other 20233.
# Specify the NHANES design
NHANES_design <- svydesign(data = NHANESraw, strata = ~SDMVSTRA, id = ~SDMVPSU, nest = TRUE, weights = ~WTMEC2YR)
# Print summary of design
print(summary(NHANES_design))
## Stratified 1 - level Cluster Sampling design (with replacement)
## With (62) clusters.
## svydesign(data = NHANESraw, strata = ~SDMVSTRA, id = ~SDMVPSU,
## nest = TRUE, weights = ~WTMEC2YR)
## Probabilities:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 4.493e-06 2.832e-05 5.271e-05 Inf 8.605e-05 Inf
## Stratum Sizes:
## 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90
## obs 803 785 823 829 696 751 696 724 713 683 592 946 598 647 251 862
## design.PSU 2 2 2 2 2 2 2 2 2 2 2 3 2 2 2 3
## actual.PSU 2 2 2 2 2 2 2 2 2 2 2 3 2 2 2 3
## 91 92 93 94 95 96 97 98 99 100 101 102 103
## obs 998 875 602 688 722 676 608 708 682 700 715 624 296
## design.PSU 3 3 2 2 2 2 2 2 2 2 2 2 2
## actual.PSU 3 3 2 2 2 2 2 2 2 2 2 2 2
## Data variables:
## [1] "ID" "SurveyYr" "Gender"
## [4] "Age" "AgeMonths" "Race1"
## [7] "Race3" "Education" "MaritalStatus"
## [10] "HHIncome" "HHIncomeMid" "Poverty"
## [13] "HomeRooms" "HomeOwn" "Work"
## [16] "Weight" "Length" "HeadCirc"
## [19] "Height" "BMI" "BMICatUnder20yrs"
## [22] "BMI_WHO" "Pulse" "BPSysAve"
## [25] "BPDiaAve" "BPSys1" "BPDia1"
## [28] "BPSys2" "BPDia2" "BPSys3"
## [31] "BPDia3" "Testosterone" "DirectChol"
## [34] "TotChol" "UrineVol1" "UrineFlow1"
## [37] "UrineVol2" "UrineFlow2" "Diabetes"
## [40] "DiabetesAge" "HealthGen" "DaysPhysHlthBad"
## [43] "DaysMentHlthBad" "LittleInterest" "Depressed"
## [46] "nPregnancies" "nBabies" "Age1stBaby"
## [49] "SleepHrsNight" "SleepTrouble" "PhysActive"
## [52] "PhysActiveDays" "TVHrsDay" "CompHrsDay"
## [55] "TVHrsDayChild" "CompHrsDayChild" "Alcohol12PlusYr"
## [58] "AlcoholDay" "AlcoholYear" "SmokeNow"
## [61] "Smoke100" "SmokeAge" "Marijuana"
## [64] "AgeFirstMarij" "RegularMarij" "AgeRegMarij"
## [67] "HardDrugs" "SexEver" "SexAge"
## [70] "SexNumPartnLife" "SexNumPartYear" "SameSex"
## [73] "SexOrientation" "WTINT2YR" "WTMEC2YR"
## [76] "SDMVPSU" "SDMVSTRA" "PregnantNow"
# Specify the NHANES design
NHANES_design <- svydesign(data = NHANESraw, strata = ~SDMVSTRA, id = ~SDMVPSU, nest = TRUE, weights = ~WTMEC2YR)
# Print summary of design
summary(NHANES_design)
## Stratified 1 - level Cluster Sampling design (with replacement)
## With (62) clusters.
## svydesign(data = NHANESraw, strata = ~SDMVSTRA, id = ~SDMVPSU,
## nest = TRUE, weights = ~WTMEC2YR)
## Probabilities:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 4.493e-06 2.832e-05 5.271e-05 Inf 8.605e-05 Inf
## Stratum Sizes:
## 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90
## obs 803 785 823 829 696 751 696 724 713 683 592 946 598 647 251 862
## design.PSU 2 2 2 2 2 2 2 2 2 2 2 3 2 2 2 3
## actual.PSU 2 2 2 2 2 2 2 2 2 2 2 3 2 2 2 3
## 91 92 93 94 95 96 97 98 99 100 101 102 103
## obs 998 875 602 688 722 676 608 708 682 700 715 624 296
## design.PSU 3 3 2 2 2 2 2 2 2 2 2 2 2
## actual.PSU 3 3 2 2 2 2 2 2 2 2 2 2 2
## Data variables:
## [1] "ID" "SurveyYr" "Gender"
## [4] "Age" "AgeMonths" "Race1"
## [7] "Race3" "Education" "MaritalStatus"
## [10] "HHIncome" "HHIncomeMid" "Poverty"
## [13] "HomeRooms" "HomeOwn" "Work"
## [16] "Weight" "Length" "HeadCirc"
## [19] "Height" "BMI" "BMICatUnder20yrs"
## [22] "BMI_WHO" "Pulse" "BPSysAve"
## [25] "BPDiaAve" "BPSys1" "BPDia1"
## [28] "BPSys2" "BPDia2" "BPSys3"
## [31] "BPDia3" "Testosterone" "DirectChol"
## [34] "TotChol" "UrineVol1" "UrineFlow1"
## [37] "UrineVol2" "UrineFlow2" "Diabetes"
## [40] "DiabetesAge" "HealthGen" "DaysPhysHlthBad"
## [43] "DaysMentHlthBad" "LittleInterest" "Depressed"
## [46] "nPregnancies" "nBabies" "Age1stBaby"
## [49] "SleepHrsNight" "SleepTrouble" "PhysActive"
## [52] "PhysActiveDays" "TVHrsDay" "CompHrsDay"
## [55] "TVHrsDayChild" "CompHrsDayChild" "Alcohol12PlusYr"
## [58] "AlcoholDay" "AlcoholYear" "SmokeNow"
## [61] "Smoke100" "SmokeAge" "Marijuana"
## [64] "AgeFirstMarij" "RegularMarij" "AgeRegMarij"
## [67] "HardDrugs" "SexEver" "SexAge"
## [70] "SexNumPartnLife" "SexNumPartYear" "SameSex"
## [73] "SexOrientation" "WTINT2YR" "WTMEC2YR"
## [76] "SDMVPSU" "SDMVSTRA" "PregnantNow"
# Number of clusters
NHANESraw %>%
summarize(n_clusters = n_distinct(SDMVSTRA, SDMVPSU))
## # A tibble: 1 x 1
## n_clusters
## <int>
## 1 62
# Sample sizes in clusters
NHANESraw %>%
count(SDMVSTRA, SDMVPSU)
## # A tibble: 62 x 3
## SDMVSTRA SDMVPSU n
## <int> <int> <int>
## 1 75 1 379
## 2 75 2 424
## 3 76 1 419
## 4 76 2 366
## 5 77 1 441
## 6 77 2 382
## 7 78 1 378
## 8 78 2 451
## 9 79 1 349
## 10 79 2 347
## # ... with 52 more rows
# Specify the survey design
NHANESraw <- mutate(NHANESraw, WTMEC4YR = .5 * WTMEC2YR)
NHANES_design <- svydesign(data = NHANESraw, strata = ~SDMVSTRA, id = ~SDMVPSU, nest = TRUE, weights = ~WTMEC2YR)
# Determine the levels of Depressed
levels(NHANESraw$Depressed)
## [1] "None" "Several" "Most"
# Construct a frequency table of Depressed
tab_w <- svytable(~Depressed, design = NHANES_design)
# Determine class of tab_w
class(tab_w)
## [1] "svytable" "xtabs" "table"
# Display tab_w
tab_w
## Depressed
## None Several Most
## 317517218 65465017 25408883
# Add proportions to table
tab_w <- tab_w %>%
as.data.frame() %>%
mutate(Prop = Freq/sum(Freq))
# Create a barplot
ggplot(data = tab_w,
mapping = aes(x = Depressed, y = Prop)) +
geom_col()
##Exploring two categorical variables
# Construct and display a frequency table
tab_D <- svytable(~Depressed,
design = NHANES_design)
tab_D
## Depressed
## None Several Most
## 317517218 65465017 25408883
# Construct and display a frequency table
tab_H <- svytable(~HealthGen,
design = NHANES_design)
tab_H
## HealthGen
## Excellent Vgood Good Fair Poor
## 55319909 154964339 174995171 63088061 11336968
# Construct and display a frequency table
tab_DH <- svytable(~Depressed+HealthGen,
design = NHANES_design)
tab_DH
## HealthGen
## Depressed Excellent Vgood Good Fair Poor
## None 42654363 114974637 119840064 35381566 4649890
## Several 3741242 16604989 27900937 14710210 2507639
## Most 1127227 3711730 9397896 7871011 3301019
# Add conditional proportions to tab_DH
tab_DH_cond <- tab_DH %>%
as.data.frame() %>%
group_by(HealthGen) %>%
mutate(n_HealthGen = sum(Freq), Prop_Depressed = Freq/n_HealthGen) %>%
ungroup()
# Print tab_DH_cond
tab_DH_cond
## # A tibble: 15 x 5
## Depressed HealthGen Freq n_HealthGen Prop_Depressed
## <fct> <fct> <dbl> <dbl> <dbl>
## 1 None Excellent 42654363. 47522832. 0.898
## 2 Several Excellent 3741242. 47522832. 0.0787
## 3 Most Excellent 1127227. 47522832. 0.0237
## 4 None Vgood 114974637. 135291355. 0.850
## 5 Several Vgood 16604989. 135291355. 0.123
## 6 Most Vgood 3711730. 135291355. 0.0274
## 7 None Good 119840064. 157138897. 0.763
## 8 Several Good 27900937. 157138897. 0.178
## 9 Most Good 9397896. 157138897. 0.0598
## 10 None Fair 35381566. 57962787. 0.610
## 11 Several Fair 14710210. 57962787. 0.254
## 12 Most Fair 7871011. 57962787. 0.136
## 13 None Poor 4649890. 10458548. 0.445
## 14 Several Poor 2507639. 10458548. 0.240
## 15 Most Poor 3301019. 10458548. 0.316
# Create a segmented bar graph of the conditional proportions in tab_DH_cond
ggplot(data = tab_DH_cond,
mapping = aes(x = HealthGen, y = Prop_Depressed, fill = Depressed)) +
geom_col() +
coord_flip()
###Summarizing with svytotal()
# Estimate the totals for combos of Depressed and HealthGen
tab_totals <- svytotal(x = ~interaction(Depressed, HealthGen),
design = NHANES_design,
na.rm = TRUE)
# Print table of totals
print(tab_totals)
## total SE
## interaction(Depressed, HealthGen)None.Excellent 42654363 3112536
## interaction(Depressed, HealthGen)Several.Excellent 3741242 554395
## interaction(Depressed, HealthGen)Most.Excellent 1127227 279379
## interaction(Depressed, HealthGen)None.Vgood 114974637 5951612
## interaction(Depressed, HealthGen)Several.Vgood 16604989 1374040
## interaction(Depressed, HealthGen)Most.Vgood 3711730 539941
## interaction(Depressed, HealthGen)None.Good 119840064 6750136
## interaction(Depressed, HealthGen)Several.Good 27900937 1862155
## interaction(Depressed, HealthGen)Most.Good 9397896 1002210
## interaction(Depressed, HealthGen)None.Fair 35381566 2412614
## interaction(Depressed, HealthGen)Several.Fair 14710210 910727
## interaction(Depressed, HealthGen)Most.Fair 7871011 740513
## interaction(Depressed, HealthGen)None.Poor 4649890 503867
## interaction(Depressed, HealthGen)Several.Poor 2507639 336880
## interaction(Depressed, HealthGen)Most.Poor 3301019 390272
# Estimate the means for combos of Depressed and HealthGen
tab_means <- svymean(x = ~interaction(Depressed, HealthGen),
design = NHANES_design,
na.rm = TRUE)
# Print table of means
tab_means
## mean SE
## interaction(Depressed, HealthGen)None.Excellent 0.1044492 0.0053
## interaction(Depressed, HealthGen)Several.Excellent 0.0091613 0.0014
## interaction(Depressed, HealthGen)Most.Excellent 0.0027603 0.0007
## interaction(Depressed, HealthGen)None.Vgood 0.2815422 0.0078
## interaction(Depressed, HealthGen)Several.Vgood 0.0406612 0.0028
## interaction(Depressed, HealthGen)Most.Vgood 0.0090890 0.0013
## interaction(Depressed, HealthGen)None.Good 0.2934563 0.0092
## interaction(Depressed, HealthGen)Several.Good 0.0683220 0.0033
## interaction(Depressed, HealthGen)Most.Good 0.0230129 0.0023
## interaction(Depressed, HealthGen)None.Fair 0.0866400 0.0047
## interaction(Depressed, HealthGen)Several.Fair 0.0360214 0.0026
## interaction(Depressed, HealthGen)Most.Fair 0.0192740 0.0019
## interaction(Depressed, HealthGen)None.Poor 0.0113863 0.0013
## interaction(Depressed, HealthGen)Several.Poor 0.0061405 0.0009
## interaction(Depressed, HealthGen)Most.Poor 0.0080833 0.0010
# Run a chi square test between Depressed and HealthGen
svychisq(~Depressed+HealthGen,
design = NHANES_design,
statistic = "Chisq")
##
## Pearson's X^2: Rao & Scott adjustment
##
## data: svychisq(~Depressed + HealthGen, design = NHANES_design, statistic = "Chisq")
## X-squared = 1592.7, df = 8, p-value < 2.2e-16
# Construct a contingency table
tab <-svytable(~HomeOwn+Education, design = NHANES_design)
# Add conditional proportion of levels of HomeOwn for each educational level
tab_df <- as.data.frame(tab) %>%
group_by(Education) %>%
mutate(n_Education = sum(Freq), Prop_HomeOwn = Freq/n_Education) %>%
ungroup()
# Create a segmented bar graph
ggplot(data = tab_df,
mapping = aes(x = Education, y = Prop_HomeOwn, fill = HomeOwn)) +
geom_col() +
coord_flip()
# Run a chi square test
svychisq(~HomeOwn+Education,
design = NHANES_design,
statistic = "Chisq")
##
## Pearson's X^2: Rao & Scott adjustment
##
## data: svychisq(~HomeOwn + Education, design = NHANES_design, statistic = "Chisq")
## X-squared = 531.78, df = 8, p-value = 2.669e-16
# Compute the survey-weighted mean
svymean(x = ~SleepHrsNight,
design = NHANES_design,
na.rm = TRUE)
## mean SE
## SleepHrsNight 6.9292 0.0166
# Compute the survey-weighted mean by Gender
svyby(formula = ~SleepHrsNight,
by = ~Gender,
design = NHANES_design,
FUN = svymean,
na.rm = TRUE,
keep.names = FALSE)
## Gender SleepHrsNight se
## 1 female 6.976103 0.02374684
## 2 male 6.879050 0.01953263
# Compute the survey-weighted quantiles
svyquantile(x = ~SleepHrsNight,
design = NHANES_design,
na.rm = TRUE,
quantiles = c(0.01, 0.25, 0.5, 0.75,0.99))
## 0.01 0.25 0.5 0.75 0.99
## SleepHrsNight 4 6 7 8 10
# Compute the survey-weighted quantiles by Gender
svyby(formula = ~SleepHrsNight,
by = ~Gender,
design = NHANES_design,
FUN = svyquantile,
na.rm = TRUE,
quantiles = 0.5,
keep.rows = FALSE,
keep.var = FALSE)
## Gender statistic
## female female 7
## male male 7
# Compute the survey-weighted mean by Gender
out <- svyby(formula = ~SleepHrsNight,
by = ~Gender,
design = NHANES_design,
FUN = svymean,
na.rm = TRUE,
keep.names = FALSE)
# Construct a bar plot of average sleep by gender
ggplot(data =out , mapping = aes(x=Gender, y=SleepHrsNight)) +
geom_col() +
labs(y="Average Nightly Sleep")
out
## Gender SleepHrsNight se
## 1 female 6.976103 0.02374684
## 2 male 6.879050 0.01953263
select(NHANESraw, SleepHrsNight, Gender)
## # A tibble: 20,293 x 2
## SleepHrsNight Gender
## <int> <fct>
## 1 4 male
## 2 NA male
## 3 8 male
## 4 NA male
## 5 4 female
## 6 4 male
## 7 8 female
## 8 NA female
## 9 NA male
## 10 6 male
## # ... with 20,283 more rows
The most common response among surveyed males was 6.879 hours of sleep. ###Bar plots with error
# Add lower and upper columns to out
out_col <- mutate(out,
lower = SleepHrsNight - 2*se,
upper = SleepHrsNight + 2*se)
# Construct a bar plot of average sleep by gender with error bars
ggplot(data = out_col,
mapping = aes(x = Gender, y = SleepHrsNight,
ymin = lower, ymax = upper)) +
geom_col(fill = "gold") +
labs(y = "Average Nightly Sleep") +
geom_errorbar(width = 0.7)
# Create a histogram with a set binwidth
ggplot(data = NHANESraw,
mapping = aes(x=SleepHrsNight,weight=WTMEC4YR)) +
geom_histogram(binwidth = 1,
color = "white") +
labs(x = "Hours of Sleep")
## Warning: Removed 7261 rows containing non-finite values (stat_bin).
# Create a histogram with a set binwidth
ggplot(data = NHANESraw,
mapping = aes(x=SleepHrsNight,weight=WTMEC4YR)) +
geom_histogram(binwidth = 0.5,
color = "white") +
labs(x = "Hours of Sleep")
## Warning: Removed 7261 rows containing non-finite values (stat_bin).
# Create a histogram with a set binwidth
ggplot(data = NHANESraw,
mapping = aes(x=SleepHrsNight,weight=WTMEC4YR)) +
geom_histogram(binwidth = 2,
color = "white") +
labs(x = "Hours of Sleep")
## Warning: Removed 7261 rows containing non-finite values (stat_bin).
# Density plot of sleep faceted by gender
NHANESraw %>%
filter(!is.na(SleepHrsNight), !is.na(Gender)) %>%
group_by(Gender) %>%
mutate(WTMEC4YR_std = WTMEC4YR/sum(WTMEC4YR)) %>%
ggplot(mapping = aes(x = SleepHrsNight, weight = WTMEC4YR_std)) +
geom_density(bw = 0.6, fill = "gold") +
labs(x = "Hours of Sleep") +
facet_wrap(~Gender, labeller = "label_both")
##Inference for quantitative data ###Survey-weighted t-test
# Run a survey-weighted t-test
svyttest(formula = SleepHrsNight~Gender,
design = NHANES_design)
## Warning in summary.glm(g): observations with zero weight not used for
## calculating dispersion
## Warning in summary.glm(glm.object): observations with zero weight not used
## for calculating dispersion
##
## Design-based t-test
##
## data: SleepHrsNight ~ Gender
## t = -3.4077, df = 32, p-value = 0.001785
## alternative hypothesis: true difference in mean is not equal to 0
## 95 percent confidence interval:
## -0.15287218 -0.04123256
## sample estimates:
## difference in mean
## -0.09705237
# Find means of total cholesterol by whether or not active
out <- svyby(formula =~TotChol,
by = ~PhysActive,
design = NHANES_design,
FUN = svymean,
na.rm = TRUE,
keep.names = FALSE)
# Construct a bar plot of means of total cholesterol by whether or not active
ggplot(data = out,
mapping = aes(x=PhysActive, y=TotChol)) +
geom_col()
# Create dataset with only 20 year olds
NHANES20 <- filter(NHANESraw,
Age == 20)
# Construct scatter plot
ggplot(data = NHANES20,
mapping = aes(x=Height, y=Weight)) +
geom_point(alpha = 0.3) +
guides(size = FALSE)
## Warning: Removed 4 rows containing missing values (geom_point).
# Construct bubble plot
ggplot(data = NHANES20,
mapping = aes(x=Height, y=Weight, size=WTMEC4YR)) +
geom_point(alpha = 0.3) +
guides(size = FALSE)
## Warning: Removed 4 rows containing missing values (geom_point).
# Using the dataset, NHANES20, create a scatter plot where Height is mapped to #x, Weight is mapped to y, and WTMEC4YR to color. Don't include a legend.
# Construct a scatter plot
ggplot(data = NHANES20,
mapping = aes(x= Height, y=Weight, color=WTMEC4YR)) +
geom_point() +
guides(color = FALSE)
## Warning: Removed 4 rows containing missing values (geom_point).
# Using the dataset, NHANES20, create a scatter plot where Height is mapped to #x, Weight is mapped to y, and WTMEC4YR to alpha. Don't include a legend.
# Construct a scatter plot
ggplot(data = NHANES20,
mapping = aes(x= Height, y=Weight, alpha=WTMEC4YR)) +
geom_point() +
guides(alpha = FALSE)
## Warning: Removed 4 rows containing missing values (geom_point).
# Create a bubble plot of NHANES20. Map Height to x, Weight to y, WTMEC4YR to size, and Gender to color. Set alpha to 0.3 and include a legend for color but not size.
# Add gender to plot
ggplot(data = NHANES20,
mapping = aes(x=Height, y=Weight, size=WTMEC4YR, color=Gender)) +
geom_point(alpha=0.3) +
guides(size = FALSE)
## Warning: Removed 4 rows containing missing values (geom_point).
# Create a scatter plot of NHANES20. Map Height to x, Weight to y, WTMEC4YR to alpha, and Gender to color. Include a legend for color but not for alpha.
# Add gender to plot
ggplot(data = NHANES20,
mapping = aes(x=Height, y=Weight,alpha=WTMEC4YR, color=Gender)) +
geom_point() +
guides(alpha = FALSE)
## Warning: Removed 4 rows containing missing values (geom_point).
##Visualizing trends
What’s the relationship between height and weight across all ages?
# Bubble plot with linear of best fit
ggplot(data = NHANESraw, mapping = aes(x = Height, y = Weight, size = WTMEC4YR)) +
geom_point(alpha = 0.1) +
guides(size = FALSE) +
geom_smooth(method = "lm", se = FALSE, mapping = aes(weight=WTMEC4YR))
## Warning: Removed 2279 rows containing non-finite values (stat_smooth).
## Warning: Removed 2279 rows containing missing values (geom_point).
#Create a bubble plot of Height (as x) and Weight (as y) in NHANESraw. Add a line of best fit using geom_smooth() with method = "lm" and mapping WTMEC4YR to weight
# Add quadratic curve and cubic curve
ggplot(data = NHANESraw, mapping = aes(x = Height, y = Weight, size = WTMEC4YR)) +
geom_point(alpha = 0.1) +
guides(size = FALSE) +
geom_smooth(method = "lm", se = FALSE, mapping = aes(weight = WTMEC4YR)) +
geom_smooth(method = "lm", se = FALSE, mapping = aes(weight = WTMEC4YR), formula = y ~ poly(x, 2), color = "orange") +
geom_smooth(method = "lm", se = FALSE, mapping = aes(weight = WTMEC4YR), formula = y ~ poly(x, 3), color = "red")
## Warning: Removed 2279 rows containing non-finite values (stat_smooth).
## Warning: Removed 2279 rows containing non-finite values (stat_smooth).
## Warning: Removed 2279 rows containing non-finite values (stat_smooth).
## Warning: Removed 2279 rows containing missing values (geom_point).
Nice trend curves! Notice that even with the non-linear curves, we are missing the trend for certain heights. This signifies that it is difficult to specify a global trend for these variables.
#To the given bubble plot, add trend lines with linetype = 2 so that the lines are dashed.
# Add non-survey-weighted trend lines to bubble plot
ggplot(data = NHANES20, mapping = aes(x = Height, y = Weight, size = WTMEC4YR, color = Gender)) +
geom_point(alpha = 0.1) +
guides(size = FALSE) +
geom_smooth(method = "lm", se = FALSE, linetype = 2)
## Warning: Removed 4 rows containing non-finite values (stat_smooth).
## Warning: Removed 4 rows containing missing values (geom_point).
#Now add the survey-weighted trend lines by mapping WTMEC4YR to weight.
# Add non-survey-weighted trend lines to bubble plot
ggplot(data = NHANES20, mapping = aes(x = Height, y = Weight, size = WTMEC4YR, color = Gender)) +
geom_point(alpha = 0.1) +
guides(size = FALSE) +
geom_smooth(method = "lm", se = FALSE, linetype = 2)
## Warning: Removed 4 rows containing non-finite values (stat_smooth).
## Warning: Removed 4 rows containing missing values (geom_point).
# Add survey-weighted trend lines
ggplot(data = NHANES20, mapping = aes(x = Height, y = Weight, size = WTMEC4YR, color = Gender)) +
geom_point(alpha = 0.1) +
guides(size = FALSE) +
geom_smooth(method = "lm", se = FALSE, linetype = 2) +
geom_smooth(method = "lm", se = FALSE, mapping = aes(weight=WTMEC4YR))
## Warning: Removed 4 rows containing non-finite values (stat_smooth).
## Warning: Removed 4 rows containing non-finite values (stat_smooth).
## Warning: Removed 4 rows containing missing values (geom_point).
##Modeling survey data
# Subset survey design object to only include 20 year olds
NHANES20_design <- subset(NHANES_design, Age == 20)
# Build a linear regression model
mod <- svyglm(Weight ~ Height, design = NHANES20_design)
# Print summary of the model
summary(mod)
##
## Call:
## svyglm(formula = Weight ~ Height, design = NHANES20_design)
##
## Survey design:
## subset(NHANES_design, Age == 20)
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -67.2571 22.9836 -2.926 0.00674 **
## Height 0.8305 0.1368 6.072 1.51e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 326.6108)
##
## Number of Fisher Scoring iterations: 2
# Build a linear regression model same slope
mod1 <- svyglm(Weight ~ Height+Gender, design = NHANES20_design)
# Print summary of the same slope model
summary(mod1)
##
## Call:
## svyglm(formula = Weight ~ Height + Gender, design = NHANES20_design)
##
## Survey design:
## subset(NHANES_design, Age == 20)
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -53.8665 22.7622 -2.366 0.0254 *
## Height 0.7434 0.1391 5.346 1.2e-05 ***
## Gendermale 2.7207 3.2471 0.838 0.4095
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 325.3881)
##
## Number of Fisher Scoring iterations: 2
# Build a linear regression model different slopes
mod2 <- svyglm(Weight ~ Height*Gender, design = NHANES20_design)
# Print summary of the different slopes model
summary(mod2)
##
## Call:
## svyglm(formula = Weight ~ Height * Gender, design = NHANES20_design)
##
## Survey design:
## subset(NHANES_design, Age == 20)
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.5061 21.5357 0.441 0.66257
## Height 0.3565 0.1269 2.809 0.00932 **
## Gendermale -131.0884 41.9989 -3.121 0.00438 **
## Height:Gendermale 0.7897 0.2385 3.311 0.00273 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 316.5007)
##
## Number of Fisher Scoring iterations: 2
# Plot BPDiaAve and BPSysAve by Diabetes and include trend lines
drop_na(NHANESraw, Diabetes) %>%
ggplot(mapping = aes(x=BPDiaAve, y=BPSysAve, size=WTMEC4YR, color=Diabetes)) +
geom_point(alpha = 0.2) +
guides(size = FALSE) +
geom_smooth(method="lm", se = FALSE, mapping = aes(weight=WTMEC4YR))
## Warning: Removed 4600 rows containing non-finite values (stat_smooth).
## Warning: Removed 4600 rows containing missing values (geom_point).
# Build simple linear regression model
mod1 <- svyglm(BPSysAve ~ BPDiaAve, design = NHANES_design)
# Plot BPDiaAve and BPSysAve by Diabetes and include trend lines
drop_na(NHANESraw, Diabetes) %>%
ggplot(mapping = aes(x = BPDiaAve, y = BPSysAve, size = WTMEC4YR, color = Diabetes)) +
geom_point(alpha = 0.2) +
guides(size = FALSE) +
geom_smooth(method = "lm", se = FALSE, mapping = aes(weight = WTMEC4YR))
## Warning: Removed 4600 rows containing non-finite values (stat_smooth).
## Warning: Removed 4600 rows containing missing values (geom_point).
# Build simple linear regression model
mod1 <- svyglm(BPSysAve ~ BPDiaAve, design = NHANES_design)
# Build model with different slopes
mod2 <- svyglm(BPSysAve ~ BPDiaAve * Diabetes, design = NHANES_design)
# Summarize models
summary(mod1)
##
## Call:
## svyglm(formula = BPSysAve ~ BPDiaAve, design = NHANES_design)
##
## Survey design:
## svydesign(data = NHANESraw, strata = ~SDMVSTRA, id = ~SDMVPSU,
## nest = TRUE, weights = ~WTMEC2YR)
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 85.74311 1.86920 45.87 <2e-16 ***
## BPDiaAve 0.48150 0.02354 20.45 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 290.3472)
##
## Number of Fisher Scoring iterations: 2
summary(mod2)
##
## Call:
## svyglm(formula = BPSysAve ~ BPDiaAve * Diabetes, design = NHANES_design)
##
## Survey design:
## svydesign(data = NHANESraw, strata = ~SDMVSTRA, id = ~SDMVPSU,
## nest = TRUE, weights = ~WTMEC2YR)
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 83.58652 2.05537 40.667 < 2e-16 ***
## BPDiaAve 0.49964 0.02623 19.047 < 2e-16 ***
## DiabetesYes 25.36616 3.56587 7.114 6.53e-08 ***
## BPDiaAve:DiabetesYes -0.22132 0.05120 -4.323 0.000156 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 279.1637)
##
## Number of Fisher Scoring iterations: 2