Step 1

library(readr)
## Warning: package 'readr' was built under R version 3.6.3
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.6.3
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(knitr)
## Warning: package 'knitr' was built under R version 3.6.3
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.6.3
# Load the NHIS practice data set from disk, then preview its first rows to
# confirm the expected columns (health, sex, age, pooryn) were read in.
data <- read.csv(file = "F:/Skills Drill 2 NHIS Practice Data.csv")
head(x = data)
##   health sex age pooryn
## 1      1   2  33      2
## 2      2   2  52      1
## 3      1   1  41      1
## 4      2   1  67      1
## 5      3   1  25      2
## 6      5   2  61      1

Step 2

# Replace the numeric survey codes with readable labels. A code that matches
# none of the conditions below (including a missing value) becomes NA,
# because case_when() returns NA when no condition is TRUE.
data <- data %>%
  mutate(
    health = case_when(
      health == 1 ~ "Excellent",
      health == 2 ~ "Very Good",
      health == 3 ~ "Good",
      health == 4 ~ "Fair",
      health == 5 ~ "Poor"
    ),
    pooryn = case_when(
      pooryn == 1 ~ "above poverty",
      pooryn == 2 ~ "below poverty"
    )
  )

Step 3

# Store health as a factor whose levels run from best to worst, so tables and
# plots list the categories in that order instead of alphabetically.
data$health <- factor(
  data$health,
  levels = c("Excellent", "Very Good", "Good", "Fair", "Poor")
)

# Cross-tabulate health by poverty status. margin = 2 rescales each column
# (poverty group) to sum to 1; table() leaves out rows where either value
# is NA.
prop.table(table(data$health, data$pooryn), margin = 2)
##            
##             above poverty below poverty
##   Excellent    0.29431439    0.19476377
##   Very Good    0.33473778    0.23178653
##   Good         0.25821636    0.29396233
##   Fair         0.08790407    0.19573345
##   Poor         0.02482740    0.08375391
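People above the poverty line are more likely to report excellent or very good health (about 63% vs. 43%), while people below it are more than twice as likely to report fair or poor health (about 28% vs. 11%).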

Step 3 (Visualization)

# Stacked bar chart of the health distribution within each poverty group.
# Respondents with a missing health rating or poverty status are dropped
# first so the bar segments show the same column proportions as the
# prop.table() output in Step 3 (table() also excludes NA), and so that
# group_by() no longer warns about an implicit NA level in `health`.
data %>%
  filter(!is.na(health), !is.na(pooryn)) %>%
  group_by(pooryn, health) %>%
  summarize(n = n()) %>%
  # summarize() peels off the `health` grouping, so sum(n) below is the
  # total for each poverty group and every bar stacks to 1.
  mutate(percent = n / sum(n)) %>%
  ungroup() %>%
  ggplot() +
  geom_col(aes(x = pooryn, y = percent, fill = health)) +
  labs(
    x = "Poverty status",
    y = "Proportion of respondents",
    fill = "Health"
  )

Step 4

# Average respondent age within each self-rated health category. Rows with a
# missing health value form their own <NA> group; na.rm = TRUE skips missing
# ages when averaging.
data %>%
  group_by(health) %>%
  summarise(avg_age = mean(age, na.rm = TRUE))
## Warning: Factor `health` contains implicit NA, consider using
## `forcats::fct_explicit_na`
## # A tibble: 6 x 2
##   health    avg_age
##   <fct>       <dbl>
## 1 Excellent    41.3
## 2 Very Good    46.0
## 3 Good         50.9
## 4 Fair         56.7
## 5 Poor         59.9
## 6 <NA>         54.7
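Based on the table above, average age rises steadily as self-rated health worsens (from 41.3 for Excellent to 59.9 for Poor), so older people are more likely to report poorer health.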

Step 5

# Age distribution within each health category, one panel per category.
# An explicit 5-unit bin width replaces the default of 30 bins, which ggplot2
# itself flags as a value to override; boundary = 0 puts the bin edges on
# multiples of 5 so the panels are easy to read and compare.
data %>%
  ggplot() +
  geom_histogram(aes(x = age), binwidth = 5, boundary = 0) +
  facet_wrap(~health) +
  labs(x = "Age", y = "Number of respondents")

Younger people are more likely than older people to report good health.