Final Exam Data333

library(tibble)
library(tidyverse)

## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──

## ✓ ggplot2 3.3.3     ✓ dplyr   1.0.7
## ✓ tidyr   1.1.4     ✓ stringr 1.4.0
## ✓ readr   2.0.2     ✓ forcats 0.5.1
## ✓ purrr   0.3.4

## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

library(haven)
library(ggplot2)

# loading data # 
demo_99.20 <- read_xpt("/Volumes/Y.NI/Class/Fall' 21/DATA 333/Final/NHANES Data/NHANES-1999-2000-Demo.XPT")
demo_01.02 <- read_xpt("/Volumes/Y.NI/Class/Fall' 21/DATA 333/Final/NHANES Data/NHANES-2001-2002-Demo.XPT")
demo_03.04 <- read_xpt("/Volumes/Y.NI/Class/Fall' 21/DATA 333/Final/NHANES Data/NHANES-2003-2004-Demo.XPT")
demo_05.06 <- read_xpt("/Volumes/Y.NI/Class/Fall' 21/DATA 333/Final/NHANES Data/NHANES-2005-2006-Demo.XPT")
demo_07.08 <- read_xpt("/Volumes/Y.NI/Class/Fall' 21/DATA 333/Final/NHANES Data/NHANES-2007-2008-Demo.XPT")
demo_09.10 <- read_xpt("/Volumes/Y.NI/Class/Fall' 21/DATA 333/Final/NHANES Data/NHANES-2009-2010-Demo.XPT")
demo_11.12 <- read_xpt("/Volumes/Y.NI/Class/Fall' 21/DATA 333/Final/NHANES Data/NHANES-2011-2012-Demo.XPT")
demo_13.14 <- read_xpt("/Volumes/Y.NI/Class/Fall' 21/DATA 333/Final/NHANES Data/NHANES-2013-2014-Demo.XPT")
demo_15.16 <- read_xpt("/Volumes/Y.NI/Class/Fall' 21/DATA 333/Final/NHANES Data/NHANES-2015-2016-Demo.XPT")
demo_17.18 <- read_xpt("/Volumes/Y.NI/Class/Fall' 21/DATA 333/Final/NHANES Data/NHANES-2017-2018-Demo.XPT")

bmi.01 <- read_xpt("/Volumes/Y.NI/Class/Fall' 21/DATA 333/Final/NHANES Data/NHANES-1999-2000-BMX.XPT")
bmi.02 <- read_xpt("/Volumes/Y.NI/Class/Fall' 21/DATA 333/Final/NHANES Data/NHANES-2001-2002-BMX.XPT")
bmi.03 <- read_xpt("/Volumes/Y.NI/Class/Fall' 21/DATA 333/Final/NHANES Data/NHANES-2003-2004-BMX.XPT")
bmi.04 <- read_xpt("/Volumes/Y.NI/Class/Fall' 21/DATA 333/Final/NHANES Data/NHANES-2005-2006-BMX.XPT")
bmi.05 <- read_xpt("/Volumes/Y.NI/Class/Fall' 21/DATA 333/Final/NHANES Data/NHANES-2007-2008-BMX.XPT")
bmi.06 <- read_xpt("/Volumes/Y.NI/Class/Fall' 21/DATA 333/Final/NHANES Data/NHANES-2009-2010-BMX.XPT")
bmi.07 <- read_xpt("/Volumes/Y.NI/Class/Fall' 21/DATA 333/Final/NHANES Data/NHANES-2011-2012-BMX.XPT")
bmi.08 <- read_xpt("/Volumes/Y.NI/Class/Fall' 21/DATA 333/Final/NHANES Data/NHANES-2013-2014-BMX.XPT")
bmi.09 <- read_xpt("/Volumes/Y.NI/Class/Fall' 21/DATA 333/Final/NHANES Data/NHANES-2015-2016-BMX.XPT")
bmi.10 <- read_xpt("/Volumes/Y.NI/Class/Fall' 21/DATA 333/Final/NHANES Data/NHANES-2017-2018-BMX.XPT")

Append the 10 files of the demographic data together into a long form

Append the 10 files of the demographic data together into a long form
tabulate the sample size in each wave

#merge for sample size using SDDSRVYR# 
df <- bind_rows (demo_99.20, demo_01.02, demo_03.04, demo_05.06, demo_07.08, demo_09.10, demo_11.12, demo_13.14, demo_15.16, demo_17.18)
table(df$SDDSRVYR)

## 
##     1     2     3     4     5     6     7     8     9    10 
##  9965 11039 10122 10348 10149 10537  9756 10175  9971  9254

Examine the trend in family income-to-poverty ratio

visualized in boxplot form
describe finding
calculate the average family income-to-poverty ratio
one-way ANOVA test

# boxplot# 
ggplot (df, aes(x=factor(SDDSRVYR), y=INDFMPIR)) + 
  geom_boxplot()+
  labs(y="family income-to-poverty ratio", 
       x = "survey wave")

## Warning: Removed 9196 rows containing non-finite values (stat_boxplot).

# describe finding # 
# The lowest family income-to-poverty ratio occurred during 2003-2004, and the highest family income-to-poverty ratio peaked shortly after 2005-2006. There is a slow decline from 2010 to 2010.  

# calculate the average family income-to-poverty ratio # 
# 2005 to 2006 share a similar family income-to-poverty ratio to 2017 to 2018. Otherwise, the average family income-to-poverty ratio do not have a drastic change.
df %>%
  group_by (SDDSRVYR) %>%
  summarize (IPR_avg = mean (INDFMPIR, na.rm=TRUE))

## # A tibble: 10 × 2
##    SDDSRVYR IPR_avg
##       <dbl>   <dbl>
##  1        1    2.18
##  2        2    2.34
##  3        3    2.21
##  4        4    2.37
##  5        5    2.28
##  6        6    2.23
##  7        7    2.21
##  8        8    2.25
##  9        9    2.27
## 10       10    2.38

# one way Anova 
# there is a significant difference between family income-to-poverty ratio and release year
anova <- aov(INDFMPIR ~ factor(SDDSRVYR), data = df)
summary(anova)

##                     Df Sum Sq Mean Sq F value Pr(>F)    
## factor(SDDSRVYR)     9    404   44.90   17.54 <2e-16 ***
## Residuals        92110 235821    2.56                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 9196 observations deleted due to missingness

Examine the trend in family income-to-poverty ratio

change variable name
recode RIDRETH1 to new group
Tabulate the new frequency distribution
Calculate the average family income-to-poverty ratio (wide-form)
examine the trend in graph.

df.new <- df %>%
  mutate(race = case_when(
    RIDRETH1 == 1 ~ "Hispanic",
    RIDRETH1 == 2 ~ "Hispanic",
    RIDRETH1 == 3 ~ "White",
    RIDRETH1 == 4 ~ "Black",
    RIDRETH1 == 5 ~ "Other",
    TRUE ~ NA_character_))
demo.race <- table(df.new$race)
demo.race

## 
##    Black Hispanic    Other    White 
##    23644    30743     9497    37432

df %>%
  group_by(df.new$race, SDDSRVYR) %>%
  summarize(average = mean(INDFMPIR, na.rm=TRUE)) %>%
  pivot_wider(names_from = SDDSRVYR, values_from = average)

## `summarise()` has grouped output by 'df.new$race'. You can override using the `.groups` argument.

## # A tibble: 4 × 11
## # Groups:   df.new$race [4]
##   `df.new$race`   `1`   `2`   `3`   `4`   `5`   `6`   `7`   `8`   `9`  `10`
##   <chr>         <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Black          1.93  1.84  1.87  2.13  2.19  1.98  1.92  1.86  1.91  2.05
## 2 Hispanic       1.61  1.82  1.68  1.76  1.83  1.68  1.70  1.76  1.78  1.94
## 3 Other          2.33  2.50  2.31  2.55  2.49  2.13  2.78  2.82  2.67  2.93
## 4 White          2.95  2.98  2.79  2.99  2.66  2.75  2.54  2.59  2.79  2.55

# The highest family income-to-poverty can be found in the White group. The The median family income-to-poverty is lower for Hispanic followed by Black compare to the white. 
ggplot(df, aes(x=factor(SDDSRVYR), y=INDFMPIR)) + 
  geom_boxplot()+
  facet_wrap (~as.factor(df.new$race))+
  labs (y="family income-to-poverty ratio", x = "survey wave")

## Warning: Removed 9196 rows containing non-finite values (stat_boxplot).

body measure data

Append the 10 files of the body measure data together into a long-form
recode RIDRETH1 to new group
Recode body mass index ( BMXBMI ) into a new factor
1. normal weight - body mass index < 25;
2. overweight - body mass index >= 25, but < 30;
3. obesity - body mass index >= 30.
plotting
cross tab
ChiSQ test

df.bmi <- bind_rows(bmi.01, bmi.02, bmi.03, bmi.04, bmi.05,bmi.06,bmi.07,bmi.08,bmi.09,bmi.10)
bmi <- as_tibble(df.bmi[c("SEQN", "BMXBMI")])
head(bmi)

## # A tibble: 6 × 2
##    SEQN BMXBMI
##   <dbl>  <dbl>
## 1     1   14.9
## 2     2   24.9
## 3     3   17.6
## 4     4   NA  
## 5     5   29.1
## 6     6   22.6

bmi.demo <- df %>%
  full_join(df.bmi, by = "SEQN") %>%
  summarize(SEQN, SDDSRVYR, df.new$race, BMXBMI)
head(bmi.demo)

## # A tibble: 6 × 4
##    SEQN SDDSRVYR `df.new$race` BMXBMI
##   <dbl>    <dbl> <chr>          <dbl>
## 1     1        1 Black           14.9
## 2     2        1 White           24.9
## 3     3        1 White           17.6
## 4     4        1 Black           NA  
## 5     5        1 White           29.1
## 6     6        1 Other           22.6

bmi.demo <- bmi.demo %>% 
            mutate("weight" = case_when(
            BMXBMI < 25 ~ "normal",
            BMXBMI >= 25 & BMXBMI < 30 ~ "overweight",
            BMXBMI >= 30 ~ "obesity"
            ))
table(bmi.demo$weight)

## 
##     normal    obesity overweight 
##      45002      21500      21297

#cross tab
cross.tab <- table(bmi.demo$weight, df.new$race)
proportions (as.matrix(cross.tab), 2)

##             
##                  Black  Hispanic     Other     White
##   normal     0.5147572 0.5028935 0.6455142 0.4853108
##   obesity    0.2812724 0.2397578 0.1391928 0.2523101
##   overweight 0.2039703 0.2573487 0.2152930 0.2623791

#chisq test 
chisq.test (bmi.demo$weight, df.new$race, correct=FALSE)

## 
##  Pearson's Chi-squared test
## 
## data:  bmi.demo$weight and df.new$race
## X-squared = 1064.1, df = 6, p-value < 2.2e-16

#plotting 
ggplot(bmi.demo, aes(x=factor(SDDSRVYR), fill = weight)) + 
  geom_bar(position = "fill") +
  facet_wrap (~as.factor(df.new$race))+
  labs(y="proportion", x = "survey wave")