library(tibble)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.3 ✓ dplyr 1.0.7
## ✓ tidyr 1.1.4 ✓ stringr 1.4.0
## ✓ readr 2.0.2 ✓ forcats 0.5.1
## ✓ purrr 0.3.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(haven)
library(ggplot2)
# loading data #
demo_99.20 <- read_xpt("/Volumes/Y.NI/Class/Fall' 21/DATA 333/Final/NHANES Data/NHANES-1999-2000-Demo.XPT")
demo_01.02 <- read_xpt("/Volumes/Y.NI/Class/Fall' 21/DATA 333/Final/NHANES Data/NHANES-2001-2002-Demo.XPT")
demo_03.04 <- read_xpt("/Volumes/Y.NI/Class/Fall' 21/DATA 333/Final/NHANES Data/NHANES-2003-2004-Demo.XPT")
demo_05.06 <- read_xpt("/Volumes/Y.NI/Class/Fall' 21/DATA 333/Final/NHANES Data/NHANES-2005-2006-Demo.XPT")
demo_07.08 <- read_xpt("/Volumes/Y.NI/Class/Fall' 21/DATA 333/Final/NHANES Data/NHANES-2007-2008-Demo.XPT")
demo_09.10 <- read_xpt("/Volumes/Y.NI/Class/Fall' 21/DATA 333/Final/NHANES Data/NHANES-2009-2010-Demo.XPT")
demo_11.12 <- read_xpt("/Volumes/Y.NI/Class/Fall' 21/DATA 333/Final/NHANES Data/NHANES-2011-2012-Demo.XPT")
demo_13.14 <- read_xpt("/Volumes/Y.NI/Class/Fall' 21/DATA 333/Final/NHANES Data/NHANES-2013-2014-Demo.XPT")
demo_15.16 <- read_xpt("/Volumes/Y.NI/Class/Fall' 21/DATA 333/Final/NHANES Data/NHANES-2015-2016-Demo.XPT")
demo_17.18 <- read_xpt("/Volumes/Y.NI/Class/Fall' 21/DATA 333/Final/NHANES Data/NHANES-2017-2018-Demo.XPT")
bmi.01 <- read_xpt("/Volumes/Y.NI/Class/Fall' 21/DATA 333/Final/NHANES Data/NHANES-1999-2000-BMX.XPT")
bmi.02 <- read_xpt("/Volumes/Y.NI/Class/Fall' 21/DATA 333/Final/NHANES Data/NHANES-2001-2002-BMX.XPT")
bmi.03 <- read_xpt("/Volumes/Y.NI/Class/Fall' 21/DATA 333/Final/NHANES Data/NHANES-2003-2004-BMX.XPT")
bmi.04 <- read_xpt("/Volumes/Y.NI/Class/Fall' 21/DATA 333/Final/NHANES Data/NHANES-2005-2006-BMX.XPT")
bmi.05 <- read_xpt("/Volumes/Y.NI/Class/Fall' 21/DATA 333/Final/NHANES Data/NHANES-2007-2008-BMX.XPT")
bmi.06 <- read_xpt("/Volumes/Y.NI/Class/Fall' 21/DATA 333/Final/NHANES Data/NHANES-2009-2010-BMX.XPT")
bmi.07 <- read_xpt("/Volumes/Y.NI/Class/Fall' 21/DATA 333/Final/NHANES Data/NHANES-2011-2012-BMX.XPT")
bmi.08 <- read_xpt("/Volumes/Y.NI/Class/Fall' 21/DATA 333/Final/NHANES Data/NHANES-2013-2014-BMX.XPT")
bmi.09 <- read_xpt("/Volumes/Y.NI/Class/Fall' 21/DATA 333/Final/NHANES Data/NHANES-2015-2016-BMX.XPT")
bmi.10 <- read_xpt("/Volumes/Y.NI/Class/Fall' 21/DATA 333/Final/NHANES Data/NHANES-2017-2018-BMX.XPT")
Examine the trend in family income-to-poverty ratio
- visualized in boxplot form
- describe finding
- calculate the average family income-to-poverty ratio
- one-way ANOVA test
# boxplot#
ggplot (df, aes(x=factor(SDDSRVYR), y=INDFMPIR)) +
geom_boxplot()+
labs(y="family income-to-poverty ratio",
x = "survey wave")
## Warning: Removed 9196 rows containing non-finite values (stat_boxplot).

# describe finding #
# The lowest family income-to-poverty ratio occurred during 2003-2004, and the highest family income-to-poverty ratio peaked shortly after 2005-2006. There is a slow decline from 2010 to 2010.
# calculate the average family income-to-poverty ratio #
# 2005 to 2006 share a similar family income-to-poverty ratio to 2017 to 2018. Otherwise, the average family income-to-poverty ratio do not have a drastic change.
df %>%
group_by (SDDSRVYR) %>%
summarize (IPR_avg = mean (INDFMPIR, na.rm=TRUE))
## # A tibble: 10 × 2
## SDDSRVYR IPR_avg
## <dbl> <dbl>
## 1 1 2.18
## 2 2 2.34
## 3 3 2.21
## 4 4 2.37
## 5 5 2.28
## 6 6 2.23
## 7 7 2.21
## 8 8 2.25
## 9 9 2.27
## 10 10 2.38
# one way Anova
# there is a significant difference between family income-to-poverty ratio and release year
anova <- aov(INDFMPIR ~ factor(SDDSRVYR), data = df)
summary(anova)
## Df Sum Sq Mean Sq F value Pr(>F)
## factor(SDDSRVYR) 9 404 44.90 17.54 <2e-16 ***
## Residuals 92110 235821 2.56
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 9196 observations deleted due to missingness
Examine the trend in family income-to-poverty ratio
- change variable name
- recode RIDRETH1 to new group
- Tabulate the new frequency distribution
- Calculate the average family income-to-poverty ratio (wide-form)
- examine the trend in graph.
df.new <- df %>%
mutate(race = case_when(
RIDRETH1 == 1 ~ "Hispanic",
RIDRETH1 == 2 ~ "Hispanic",
RIDRETH1 == 3 ~ "White",
RIDRETH1 == 4 ~ "Black",
RIDRETH1 == 5 ~ "Other",
TRUE ~ NA_character_))
demo.race <- table(df.new$race)
demo.race
##
## Black Hispanic Other White
## 23644 30743 9497 37432
df %>%
group_by(df.new$race, SDDSRVYR) %>%
summarize(average = mean(INDFMPIR, na.rm=TRUE)) %>%
pivot_wider(names_from = SDDSRVYR, values_from = average)
## `summarise()` has grouped output by 'df.new$race'. You can override using the `.groups` argument.
## # A tibble: 4 × 11
## # Groups: df.new$race [4]
## `df.new$race` `1` `2` `3` `4` `5` `6` `7` `8` `9` `10`
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Black 1.93 1.84 1.87 2.13 2.19 1.98 1.92 1.86 1.91 2.05
## 2 Hispanic 1.61 1.82 1.68 1.76 1.83 1.68 1.70 1.76 1.78 1.94
## 3 Other 2.33 2.50 2.31 2.55 2.49 2.13 2.78 2.82 2.67 2.93
## 4 White 2.95 2.98 2.79 2.99 2.66 2.75 2.54 2.59 2.79 2.55
# The highest family income-to-poverty can be found in the White group. The The median family income-to-poverty is lower for Hispanic followed by Black compare to the white.
ggplot(df, aes(x=factor(SDDSRVYR), y=INDFMPIR)) +
geom_boxplot()+
facet_wrap (~as.factor(df.new$race))+
labs (y="family income-to-poverty ratio", x = "survey wave")
## Warning: Removed 9196 rows containing non-finite values (stat_boxplot).

body measure data
- Append the 10 files of the body measure data together into a long-form
- recode RIDRETH1 to new group
- Recode body mass index ( BMXBMI ) into a new factor
- normal weight - body mass index < 25;
- overweight - body mass index >= 25, but < 30;
- obesity - body mass index >= 30.
- plotting
- cross tab
- ChiSQ test
df.bmi <- bind_rows(bmi.01, bmi.02, bmi.03, bmi.04, bmi.05,bmi.06,bmi.07,bmi.08,bmi.09,bmi.10)
bmi <- as_tibble(df.bmi[c("SEQN", "BMXBMI")])
head(bmi)
## # A tibble: 6 × 2
## SEQN BMXBMI
## <dbl> <dbl>
## 1 1 14.9
## 2 2 24.9
## 3 3 17.6
## 4 4 NA
## 5 5 29.1
## 6 6 22.6
bmi.demo <- df %>%
full_join(df.bmi, by = "SEQN") %>%
summarize(SEQN, SDDSRVYR, df.new$race, BMXBMI)
head(bmi.demo)
## # A tibble: 6 × 4
## SEQN SDDSRVYR `df.new$race` BMXBMI
## <dbl> <dbl> <chr> <dbl>
## 1 1 1 Black 14.9
## 2 2 1 White 24.9
## 3 3 1 White 17.6
## 4 4 1 Black NA
## 5 5 1 White 29.1
## 6 6 1 Other 22.6
bmi.demo <- bmi.demo %>%
mutate("weight" = case_when(
BMXBMI < 25 ~ "normal",
BMXBMI >= 25 & BMXBMI < 30 ~ "overweight",
BMXBMI >= 30 ~ "obesity"
))
table(bmi.demo$weight)
##
## normal obesity overweight
## 45002 21500 21297
#cross tab
cross.tab <- table(bmi.demo$weight, df.new$race)
proportions (as.matrix(cross.tab), 2)
##
## Black Hispanic Other White
## normal 0.5147572 0.5028935 0.6455142 0.4853108
## obesity 0.2812724 0.2397578 0.1391928 0.2523101
## overweight 0.2039703 0.2573487 0.2152930 0.2623791
#chisq test
chisq.test (bmi.demo$weight, df.new$race, correct=FALSE)
##
## Pearson's Chi-squared test
##
## data: bmi.demo$weight and df.new$race
## X-squared = 1064.1, df = 6, p-value < 2.2e-16
#plotting
ggplot(bmi.demo, aes(x=factor(SDDSRVYR), fill = weight)) +
geom_bar(position = "fill") +
facet_wrap (~as.factor(df.new$race))+
labs(y="proportion", x = "survey wave")
