# HW-05

library(tibble)
library(tidyverse)

## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──

## ✓ ggplot2 3.3.3     ✓ dplyr   1.0.7
## ✓ tidyr   1.1.4     ✓ stringr 1.4.0
## ✓ readr   2.0.2     ✓ forcats 0.5.1
## ✓ purrr   0.3.4

## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

library(haven)
library(ggplot2)
# Read the NHANES 2011-2012 demographics .xpt file into `health`.
# The path is absolute and specific to the author's machine.
health <- haven::read_xpt(
  file = "/Volumes/Y.NI/Class/Fall' 21/DATA 333/Week-07/NHANES-2011-2012-Demo.xpt"
)

# Pull the three analysis variables out of `health` as standalone vectors.
# They stay row-aligned with `health`, which the plots further down rely on
# when they mix these vectors with `health` as the plot data.
race <- health[["RIDRETH3"]]   # race/ethnicity
income <- health[["INDFMPIR"]] # ratio of family income to poverty
# NOTE(review): names(health) also contains RIDAGEYR -- confirm that DMDHRAGE
# is the age variable this analysis is meant to use.
age <- health[["DMDHRAGE"]]    # age

# Recode DMDCITZN codes 7 and 9 to NA in `health`, then summarise the
# non-missing citizenship values.
# NOTE(review): 7 and 9 are presumably the "refused" / "don't know" codes --
# confirm against the NHANES codebook.
health$DMDCITZN <- replace(health$DMDCITZN, health$DMDCITZN %in% c(7, 9), NA)
cit.sta <- na.omit(health$DMDCITZN)
summary(cit.sta)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   1.000   1.000   1.107   1.000   2.000

# Recode DMDYRSUS codes 77 and 99 to NA in `health`, then summarise the
# non-missing length-of-time-in-US values.
# NOTE(review): 77 and 99 are presumably the "refused" / "don't know" codes --
# confirm against the NHANES codebook.
health$DMDYRSUS <- replace(health$DMDYRSUS, health$DMDYRSUS %in% c(77, 99), NA)
time <- na.omit(health$DMDYRSUS)
summary(time)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   3.000   5.000   4.716   6.000   9.000

# Bivariate relationship between income and race: use a boxplot and describe the finding. #
# Mexican Americans have the lowest median ratio of family income to poverty, whereas Non-Hispanic Asians have the highest median ratio of family income to poverty. #

# Boxplot of the income-to-poverty ratio for each race/ethnicity code.
# `race` and `income` are the standalone vectors extracted from `health`.
ggplot(data = health, mapping = aes(x = factor(race), y = income)) +
  geom_boxplot()

## Warning: Removed 840 rows containing non-finite values (stat_boxplot).

# Bivariate relationship between income and age: use a boxplot and describe the finding. #
# x-axis = age, binned into groups. #
# Ages 43-48 have the highest ratio of family income to poverty, whereas ages 18-26 have the lowest. We also observed that the largest increase in the ratio occurs between ages 26 and 28. #

# Boxplot of the income-to-poverty ratio by age, with age split into 10 bins
# of roughly equal size via cut_number().
# Age is kept continuous on the x-axis so each bin's box is placed on a real
# age scale. Wrapping it in factor() would create a discrete axis with one
# level per distinct age even though the boxes are drawn per bin.
health %>%
  ggplot(aes(x = age, y = income)) +
  geom_boxplot(mapping = aes(group = cut_number(age, 10)))

## Warning: Removed 840 rows containing non-finite values (stat_boxplot).

# Create a new categorical variable for immigration status using citizenship status (DMDCITZN) and length of time in the US (DMDYRSUS). #
# The new categorical variable should have 3 categories. #
# Remove missing data for DMDCITZN and DMDYRSUS. #

# List every column available in `health`.
colnames(health)

##  [1] "SEQN"     "SDDSRVYR" "RIDSTATR" "RIAGENDR" "RIDAGEYR" "RIDAGEMN"
##  [7] "RIDRETH1" "RIDRETH3" "RIDEXMON" "RIDEXAGY" "RIDEXAGM" "DMQMILIZ"
## [13] "DMQADFC"  "DMDBORN4" "DMDCITZN" "DMDYRSUS" "DMDEDUC3" "DMDEDUC2"
## [19] "DMDMARTL" "RIDEXPRG" "SIALANG"  "SIAPROXY" "SIAINTRP" "FIALANG" 
## [25] "FIAPROXY" "FIAINTRP" "MIALANG"  "MIAPROXY" "MIAINTRP" "AIALANGA"
## [31] "WTINT2YR" "WTMEC2YR" "SDMVPSU"  "SDMVSTRA" "INDHHIN2" "INDFMIN2"
## [37] "INDFMPIR" "DMDHHSIZ" "DMDFMSIZ" "DMDHHSZA" "DMDHHSZB" "DMDHHSZE"
## [43] "DMDHRGND" "DMDHRAGE" "DMDHRBR4" "DMDHREDU" "DMDHRMAR" "DMDHSEDU"

# Preview the two columns used to build immigration status.
health %>%
  select(DMDCITZN, DMDYRSUS)

## # A tibble: 9,756 × 2
##    DMDCITZN DMDYRSUS
##       <dbl>    <dbl>
##  1        1       NA
##  2        1       NA
##  3        1       NA
##  4        1       NA
##  5        1       NA
##  6        1       NA
##  7        1       NA
##  8        1       NA
##  9        1       NA
## 10        1       NA
## # … with 9,746 more rows

# Same DMDCITZN recode as earlier in the file (codes 7 and 9 -> NA). On data
# that has already been recoded this changes nothing; the non-missing values
# are then re-extracted and summarised.
is.na(health$DMDCITZN) <- health$DMDCITZN %in% c(7, 9)
cit.sta <- na.omit(health$DMDCITZN)
summary(cit.sta)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   1.000   1.000   1.107   1.000   2.000

# Same DMDYRSUS recode as earlier in the file (codes 77 and 99 -> NA). On data
# that has already been recoded this changes nothing; the non-missing values
# are then re-extracted and summarised.
is.na(health$DMDYRSUS) <- health$DMDYRSUS %in% c(77, 99)
time <- na.omit(health$DMDYRSUS)
summary(time)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   3.000   5.000   4.716   6.000   9.000

# Show the two recoded columns again after the missing-value cleanup.
health[, c("DMDCITZN", "DMDYRSUS")]

## # A tibble: 9,756 × 2
##    DMDCITZN DMDYRSUS
##       <dbl>    <dbl>
##  1        1       NA
##  2        1       NA
##  3        1       NA
##  4        1       NA
##  5        1       NA
##  6        1       NA
##  7        1       NA
##  8        1       NA
##  9        1       NA
## 10        1       NA
## # … with 9,746 more rows

# Working subset: citizenship status and length of time in the US only.
health.02 <- select(health, DMDCITZN, DMDYRSUS)

# New categorical immigration-status variable (3 levels), one value per row
# of `health`. case_when() treats an NA condition as "no match", so rows with
# missing citizenship, or non-citizens with no DMDYRSUS value, come out NA.
# NOTE(review): the 3/4 cut-point on DMDYRSUS is taken from the subset filters
# below ("less than 10 years" vs "10 years or longer") -- confirm against the
# NHANES codebook.
imm.levels <- c(
  "US citizen",
  "Not a citizen, < 10 years in US",
  "Not a citizen, >= 10 years in US"
)
imm.status <- with(
  health.02,
  case_when(
    DMDCITZN == 1 ~ imm.levels[1],
    DMDCITZN == 2 & DMDYRSUS <= 3 ~ imm.levels[2],
    DMDCITZN == 2 & DMDYRSUS >= 4 ~ imm.levels[3]
  )
)
imm.status <- factor(imm.status, levels = imm.levels)

# US citizen #
# Filter on citizenship alone. The preview of these two columns printed above
# shows DMDCITZN == 1 rows with DMDYRSUS = NA, so additionally requiring
# DMDYRSUS >= 1 silently dropped those citizens from the subset.
# table(us.citizen) is unaffected: table() excludes NA rows by default.
us.citizen <- filter(health.02, DMDCITZN == 1)

# not a US citizen and having been in the US for less than 10 years #
not.citizen.01 <- filter(health.02, DMDCITZN == 2 & DMDYRSUS <= 3)

# not a US citizen and having been in the US for 10 years or longer #
not.citizen.02 <- filter(health.02, DMDCITZN == 2 & DMDYRSUS >= 4)

# Cross-tabulate citizenship code by time-in-US code for the citizen subset.
us.citizen %>%
  table()

##         DMDYRSUS
## DMDCITZN   1   2   3   4   5   6   7   8   9
##        1  10  30  84 129 115 250 198 110  59

# Same cross-tabulation expressed as proportions, rounded to 2 decimals.
round(proportions(table(us.citizen)), digits = 2)

##         DMDYRSUS
## DMDCITZN    1    2    3    4    5    6    7    8    9
##        1 0.01 0.03 0.09 0.13 0.12 0.25 0.20 0.11 0.06

# Cross-tabulate the non-citizen subset with DMDYRSUS codes 1-3.
not.citizen.01 %>%
  table()

##         DMDYRSUS
## DMDCITZN   1   2   3
##        2  70 227 218

# Same cross-tabulation expressed as proportions, rounded to 2 decimals.
round(proportions(table(not.citizen.01)), digits = 2)

##         DMDYRSUS
## DMDCITZN    1    2    3
##        2 0.14 0.44 0.42

# Cross-tabulate the non-citizen subset with DMDYRSUS codes 4 and above.
not.citizen.02 %>%
  table()

##         DMDYRSUS
## DMDCITZN   4   5   6   7   8   9
##        2 215 104 110  51  15   2

# Same cross-tabulation expressed as proportions, rounded to 2 decimals.
round(proportions(table(not.citizen.02)), digits = 2)

##         DMDYRSUS
## DMDCITZN    4    5    6    7    8    9
##        2 0.43 0.21 0.22 0.10 0.03 0.00

# One-way ANOVA: does the family income-to-poverty ratio differ by citizenship
# status? The continuous measure (INDFMPIR) is the response and citizenship is
# the grouping factor; the reverse specification treats a category code as a
# continuous outcome. Both variables are taken from `health` itself rather
# than relying on a separate global vector staying row-aligned with it.
test <- aov(INDFMPIR ~ factor(DMDCITZN), data = health)
summary(test)

##               Df Sum Sq Mean Sq F value Pr(>F)    
## income         1    6.5   6.461   71.24 <2e-16 ***
## Residuals   8902  807.4   0.091                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 852 observations deleted due to missingness