library(tibble)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.3 ✓ dplyr 1.0.7
## ✓ tidyr 1.1.4 ✓ stringr 1.4.0
## ✓ readr 2.0.2 ✓ forcats 0.5.1
## ✓ purrr 0.3.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(haven)
library(ggplot2)
# Read the NHANES 2011-2012 demographics file (SAS transport format).
health <- read_xpt(
  file = "/Volumes/NetStorage/Yunis File/Class/Fall' 21/DATA 333/Week-07/NHANES-2011-2012-Demo.xpt"
)

# Pull the age column and the family-income-to-poverty ratio column out of
# `health` as plain vectors.
age <- health[["DMDHRAGE"]]
income <- health[["INDFMPIR"]]
## Create a LOESS-smoothed curve to depict the relationship between age
## (DMDHRAGE) and the ratio of family income to poverty (INDFMPIR) ##
# The columns are mapped by name from `health` rather than through the global
# `age` / `income` vectors, so the plot always uses the rows of the data frame
# it is given. `method = "loess"` is set explicitly because geom_smooth()
# otherwise picks a GAM smoother for a data set of this size; supplying the
# formula as well avoids the "using method ... and formula ..." message.
ggplot(data = health, mapping = aes(x = DMDHRAGE, y = INDFMPIR)) +
  geom_smooth(method = "loess", formula = y ~ x) +
  labs(x = "Age", y = "Ratio of family income to poverty")
## Warning: Removed 840 rows containing non-finite values (stat_smooth).

# Based on the smoothed curve, there is a positive association between age and
# the ratio of family income to poverty up to about age 47-48, followed by a
# slow decline starting at about age 60.
# NOTE(review): these ages were read off the default GAM fit; re-check them
# against the LOESS curve.
## Create LOESS-smoothed curves to depict the relationship between age and the
## ratio of family income to poverty, one curve per RIDRETH3 group ##
## Group 6 (Non-Hispanic Asian) peaks around age 37 or 38, and group 7 (Other
## Race - Including Multi-Racial) peaks around age 45. Group 1 (Mexican
## American) and group 2 (Other Hispanic) have very similar curves; both peak
## around age 50. Group 3 (Non-Hispanic White) has the highest ratio of family
## income to poverty at age 80.
## NOTE(review): these observations were written against the default GAM fit;
## re-check them against the LOESS curves.
# Columns are mapped by name from `health` (not via the global `age` /
# `income` vectors). The smoother is pinned to LOESS with an explicit formula
# so the plot matches its description and no method message is printed.
ggplot(health, aes(x = DMDHRAGE, y = INDFMPIR)) +
  geom_smooth(
    aes(linetype = factor(RIDRETH3)),
    method = "loess",
    formula = y ~ x,
    size = 1
  ) +
  labs(
    x = "Age",
    y = "Ratio of family income to poverty",
    linetype = "Race/ethnicity (RIDRETH3)"
  )
## Warning: Removed 840 rows containing non-finite values (stat_smooth).

# Visualize the distribution of the race/ethnicity variable named RIDRETH3.
# What type of graph is appropriate? #
# RIDRETH3 is used as a grouping code, so a bar chart of the number of rows in
# each group shows its distribution directly.
ggplot(health, aes(x = factor(RIDRETH3))) +
  geom_bar() +
  labs(x = "Race/ethnicity (RIDRETH3)", y = "Count")

# A scatter plot of the income ratio against age, coloured by RIDRETH3,
# displays too much information to tell the groups apart. #
# Columns are mapped by name from `health` instead of through the global
# `age` / `income` vectors.
ggplot(health, aes(x = DMDHRAGE, y = INDFMPIR)) +
  geom_point(aes(color = factor(RIDRETH3)), size = 1) +
  labs(
    x = "Age",
    y = "Ratio of family income to poverty",
    color = "Race/ethnicity (RIDRETH3)"
  )
## Warning: Removed 840 rows containing missing values (geom_point).

# Very similar to the geom_point version: the grouping aesthetic is set inside
# geom_smooth, giving each RIDRETH3 group its own colour. #
# Columns are mapped by name from `health` instead of through the global
# `age` / `income` vectors. The smoother is left at geom_smooth()'s default,
# which is the method reported in the recorded output below.
ggplot(health, aes(x = DMDHRAGE, y = INDFMPIR)) +
  geom_smooth(aes(color = factor(RIDRETH3)), size = 1) +
  labs(
    x = "Age",
    y = "Ratio of family income to poverty",
    color = "Race/ethnicity (RIDRETH3)"
  )
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 840 rows containing non-finite values (stat_smooth).

# Visualize racial/ethnic (variable named RIDRETH3) differences in the
# distribution of the ratio of family income to poverty (variable named
# INDFMPIR). What do you find from your graph? #
# One box per group shows the median, spread and outliers of the ratio, which
# stacked points on a numeric axis cannot. RIDRETH3 is wrapped in factor() so
# each code gets its own box instead of a position on a continuous scale.
# na.rm = TRUE drops rows with a missing INDFMPIR without a warning (the
# original geom_point run reported 840 removed rows).
ggplot(data = health) +
  geom_boxplot(
    mapping = aes(
      x = factor(RIDRETH3),
      y = INDFMPIR,
      fill = factor(RIDRETH3)
    ),
    na.rm = TRUE,
    show.legend = FALSE # the x axis already identifies each group
  ) +
  labs(
    x = "Race/ethnicity (RIDRETH3)",
    y = "Ratio of family income to poverty"
  )

# In this graph, we can see that ratios of family income to poverty between 0
# and 2.5 are clustered in every race group. #
# `size` is no longer mapped to factor(RIDRETH3): sizing points by a discrete
# variable triggered a ggplot2 warning, and RIDRETH3 is already shown on the
# x axis. Horizontal jitter plus transparency spreads out the overlapping
# points so the dense region is visible; height = 0 leaves the ratio values
# unchanged and the fixed seed makes the jitter reproducible.
ggplot(data = health) +
  geom_point(
    aes(x = factor(RIDRETH3), y = INDFMPIR),
    position = position_jitter(width = 0.25, height = 0, seed = 1),
    color = "pink",
    shape = 3,
    alpha = 0.4
  ) +
  labs(
    x = "Race/ethnicity (RIDRETH3)",
    y = "Ratio of family income to poverty"
  )
## Warning: Removed 840 rows containing missing values (geom_point).
