library(readr)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.3     ✓ dplyr   1.0.7
## ✓ tibble  3.1.5     ✓ stringr 1.4.0
## ✓ tidyr   1.1.4     ✓ forcats 0.5.1
## ✓ purrr   0.3.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(haven)
library(ggplot2)
# Load the NHANES 2011-2012 demographics file (.xpt) into `health`.
# NOTE(review): this is an absolute, machine-specific path; point it at your
# own copy of the file before running the script elsewhere.
health <- haven::read_xpt(
  file = "/Users/yunis/Desktop/NHANES-2011-2012-Demo.xpt"
)

# Sanity-check the loaded data: row count, column count, then five-number
# summaries (plus mean) of the two variables used in the plots below.
dim(health)[1L]
## [1] 9756
dim(health)[2L]
## [1] 48
summary(health[["DMDHRAGE"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   18.00   33.00   43.00   45.39   56.00   80.00
summary(health[["INDFMPIR"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   0.000   0.860   1.630   2.205   3.580   5.000     840
# Note: INDFMPIR has 840 missing values; DMDHRAGE has none.

# DMDHRAGE holds a person's age in years. Confirm it is numeric, then keep a
# standalone copy in `age` (a numeric vector, not a data frame).
class(health[["DMDHRAGE"]])
## [1] "numeric"
age <- health[["DMDHRAGE"]]

# INDFMPIR holds the ratio of family income to poverty. Confirm it is numeric,
# then keep a standalone copy in `income` (also a numeric vector).
class(health[["INDFMPIR"]])
## [1] "numeric"
income <- health[["INDFMPIR"]]
# Base R scatter plot of age (x) against the family income-to-poverty ratio (y).
plot(
  x = age,
  y = income,
  pch = 8,
  col = "pink",
  xlab = "age",
  ylab = "family income ",
  main = "2011-2012 National Health and Nutrition Examination Survey"
)

# Observation: for ages of roughly 20-40 (the original note reads "20-4's";
# TODO confirm the intended range), many family income ratios fall between
# 0 and 1.
# ggplot2 version of the age vs. family income-to-poverty ratio scatter plot.
# The aesthetics name the columns of `health` directly (DMDHRAGE, INDFMPIR)
# instead of the global `age`/`income` vectors, so the plot always reflects
# the data frame passed to ggplot() and keeps working if `health` is later
# filtered or reordered.
ggplot(health, aes(x = DMDHRAGE, y = INDFMPIR)) +
  geom_point(size = 1.5, color = "#b47dc9") +
  labs(
    title = "2011-2012 National Health and Nutrition Examination Survey",
    x = "Age",
    y = "Family Income"
  )
## Warning: Removed 840 rows containing missing values (geom_point).

# The warning above comes from the 840 missing INDFMPIR values, which
# geom_point() drops before drawing.

# Same scatter plot, with points colored by the RIDRETH3 category.
# All three aesthetics (x, y, color) are mapped to columns of `health`, so
# they are guaranteed to come from the same rows; previously x and y were
# taken from the global `age`/`income` vectors while color came from `health`.
# factor() makes RIDRETH3 a discrete color scale, and the legend gets an
# explicit title instead of the raw expression "factor(RIDRETH3)".
ggplot(health, aes(x = DMDHRAGE, y = INDFMPIR)) +
  geom_point(aes(color = factor(RIDRETH3)), size = 1.5) +
  labs(
    title = "2011-2012 National Health and Nutrition Examination Survey",
    x = "Age",
    y = "Family Income",
    color = "RIDRETH3"
  )
## Warning: Removed 840 rows containing missing values (geom_point).

# Observation: points for the Non-Hispanic White category of RIDRETH3 take up
# much of the plot and gather around ages of roughly 20-40 with income ratios
# of 0-1. NOTE(review): the original note named "Non-Hispanic White" twice and
# wrote the age range as "20-4's"; confirm which second category and which age
# range were intended.