library(tibble)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.3 ✓ dplyr 1.0.7
## ✓ tidyr 1.1.4 ✓ stringr 1.4.0
## ✓ readr 2.0.2 ✓ forcats 0.5.1
## ✓ purrr 0.3.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(haven)
library(ggplot2)
health <- read_xpt("/Volumes/Y.NI/Class/Fall' 21/DATA 333/Week-07/NHANES-2011-2012-Demo.xpt")
race <- health$RIDRETH3 #race/ethnicity
income <- health$INDFMPIR #ratio of family income to poverty
age <- health$DMDHRAGE #age
health$DMDCITZN [health$DMDCITZN==7] <- NA
health$DMDCITZN [health$DMDCITZN==9] <- NA
cit.sta <- health$DMDCITZN
cit.sta <- na.omit(cit.sta)
summary(cit.sta)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 1.000 1.000 1.107 1.000 2.000
health$DMDYRSUS [health$DMDYRSUS==77] <- NA
health$DMDYRSUS [health$DMDYRSUS==99] <- NA
time <- health$DMDYRSUS
time <- na.omit(time)
summary (time)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 3.000 5.000 4.716 6.000 9.000
# bivariate relationship between the income and race, use boxplot, and describe the finding#
# Mexican American have the lowest medium ratio of family income to poverty, whereas, Non-Hispanic Asian have the highest ratio of family income to poverty #
health %>%
ggplot(aes(x = factor(race), y = income)) +
geom_boxplot()
## Warning: Removed 840 rows containing non-finite values (stat_boxplot).

# bivariate relationship between the income and age, use boxplot, and describe the finding#
# x-axis = age, into groups #
# Age between 43-48 have the highest ratio of family income, whereas, age between 18-26 have the lowest ratio of family income. We also observed that there is highest increase of family income from age 26 to age 28 #
health %>%
ggplot(aes(x = factor(age), y = income)) +
geom_boxplot(mapping = aes(group = cut_number (age, 10)))
## Warning: Removed 840 rows containing non-finite values (stat_boxplot).

# Create a new categorical variable of immigration status by using citizenship status (DMDCITZN) and length of time in US DMDYRSUS #
# new categorical variable should have 3 categories #
# remove missing data for DMDCITZN & DMDYRSUS #
names(health)
## [1] "SEQN" "SDDSRVYR" "RIDSTATR" "RIAGENDR" "RIDAGEYR" "RIDAGEMN"
## [7] "RIDRETH1" "RIDRETH3" "RIDEXMON" "RIDEXAGY" "RIDEXAGM" "DMQMILIZ"
## [13] "DMQADFC" "DMDBORN4" "DMDCITZN" "DMDYRSUS" "DMDEDUC3" "DMDEDUC2"
## [19] "DMDMARTL" "RIDEXPRG" "SIALANG" "SIAPROXY" "SIAINTRP" "FIALANG"
## [25] "FIAPROXY" "FIAINTRP" "MIALANG" "MIAPROXY" "MIAINTRP" "AIALANGA"
## [31] "WTINT2YR" "WTMEC2YR" "SDMVPSU" "SDMVSTRA" "INDHHIN2" "INDFMIN2"
## [37] "INDFMPIR" "DMDHHSIZ" "DMDFMSIZ" "DMDHHSZA" "DMDHHSZB" "DMDHHSZE"
## [43] "DMDHRGND" "DMDHRAGE" "DMDHRBR4" "DMDHREDU" "DMDHRMAR" "DMDHSEDU"
select(health, DMDCITZN, DMDYRSUS)
## # A tibble: 9,756 × 2
## DMDCITZN DMDYRSUS
## <dbl> <dbl>
## 1 1 NA
## 2 1 NA
## 3 1 NA
## 4 1 NA
## 5 1 NA
## 6 1 NA
## 7 1 NA
## 8 1 NA
## 9 1 NA
## 10 1 NA
## # … with 9,746 more rows
health$DMDCITZN [health$DMDCITZN==7] <- NA
health$DMDCITZN [health$DMDCITZN==9] <- NA
cit.sta <- health$DMDCITZN
cit.sta <- na.omit(cit.sta)
summary(cit.sta)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 1.000 1.000 1.107 1.000 2.000
health$DMDYRSUS [health$DMDYRSUS==77] <- NA
health$DMDYRSUS [health$DMDYRSUS==99] <- NA
time <- health$DMDYRSUS
time <- na.omit(time)
summary (time)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 3.000 5.000 4.716 6.000 9.000
select(health, DMDCITZN, DMDYRSUS)
## # A tibble: 9,756 × 2
## DMDCITZN DMDYRSUS
## <dbl> <dbl>
## 1 1 NA
## 2 1 NA
## 3 1 NA
## 4 1 NA
## 5 1 NA
## 6 1 NA
## 7 1 NA
## 8 1 NA
## 9 1 NA
## 10 1 NA
## # … with 9,746 more rows
# the new subset #
health.02 <- select(health, DMDCITZN, DMDYRSUS)
# US citizen #
us.citizen <- filter (health.02, DMDCITZN == 1 & DMDYRSUS >= 1)
# not a US citizen and having been in the US for less than 10 years #
not.citizen.01 <- filter (health.02, DMDCITZN == 2 & DMDYRSUS <= 3)
# not a US citizen and having been in the US for 10 years or longer #
not.citizen.02 <- filter (health.02, DMDCITZN == 2 & DMDYRSUS >= 4)
table (us.citizen)
## DMDYRSUS
## DMDCITZN 1 2 3 4 5 6 7 8 9
## 1 10 30 84 129 115 250 198 110 59
table (us.citizen) %>%
proportions() %>%
round(2)
## DMDYRSUS
## DMDCITZN 1 2 3 4 5 6 7 8 9
## 1 0.01 0.03 0.09 0.13 0.12 0.25 0.20 0.11 0.06
table (not.citizen.01)
## DMDYRSUS
## DMDCITZN 1 2 3
## 2 70 227 218
table (not.citizen.01) %>%
proportions() %>%
round(2)
## DMDYRSUS
## DMDCITZN 1 2 3
## 2 0.14 0.44 0.42
table (not.citizen.02)
## DMDYRSUS
## DMDCITZN 4 5 6 7 8 9
## 2 215 104 110 51 15 2
table (not.citizen.02) %>%
proportions() %>%
round(2)
## DMDYRSUS
## DMDCITZN 4 5 6 7 8 9
## 2 0.43 0.21 0.22 0.10 0.03 0.00
test <- aov( DMDCITZN ~ income, data = health)
summary(test)
## Df Sum Sq Mean Sq F value Pr(>F)
## income 1 6.5 6.461 71.24 <2e-16 ***
## Residuals 8902 807.4 0.091
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 852 observations deleted due to missingness