Step 1
library(readr)
## Warning: package 'readr' was built under R version 3.6.3
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.6.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(knitr)
## Warning: package 'knitr' was built under R version 3.6.3
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.6.3
# Read the NHIS practice extract into a data frame; the preview below shows
# the columns health, sex, age and pooryn.
data <- read.csv(file = "F:/Skills Drill 2 NHIS Practice Data.csv")

# Show the first six rows as a quick sanity check of the import.
head(data, n = 6L)
## health sex age pooryn
## 1 1 2 33 2
## 2 2 2 52 1
## 3 1 1 41 1
## 4 2 1 67 1
## 5 3 1 25 2
## 6 5 2 61 1
Step 2
# Replace the numeric codes with descriptive labels. Any value that is not
# one of the listed codes (including NA) becomes NA.
data <- data %>%
  mutate(
    health = case_when(
      health == 1 ~ "Excellent",
      health == 2 ~ "Very Good",
      health == 3 ~ "Good",
      health == 4 ~ "Fair",
      health == 5 ~ "Poor",
      TRUE ~ NA_character_
    ),
    pooryn = case_when(
      pooryn == 1 ~ "above poverty",
      pooryn == 2 ~ "below poverty",
      TRUE ~ NA_character_
    )
  )
Step 3
# Turn health into a factor ordered from "Excellent" to "Poor" so that
# tables and plots list the levels in that order instead of alphabetically.
data <- mutate(
  data,
  health = factor(
    health,
    levels = c("Excellent", "Very Good", "Good", "Fair", "Poor")
  )
)

# Column proportions (margin = 2): the distribution of health within each
# poverty group, so every column sums to 1.
prop.table(table(data$health, data$pooryn), margin = 2)
##
## above poverty below poverty
## Excellent 0.29431439 0.19476377
## Very Good 0.33473778 0.23178653
## Good 0.25821636 0.29396233
## Fair 0.08790407 0.19573345
## Poor 0.02482740 0.08375391
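People above the poverty line are more likely to report good health: about 63% of them rate their health as "Excellent" or "Very Good", compared with about 43% of people below the poverty line, while "Fair" or "Poor" health is reported by about 11% versus 28%.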
Step 3 (Visualization)
# Stacked bar chart of the health distribution within each poverty group.
# Rows with a missing poverty status or health rating are dropped first so
# that the plotted shares use the same denominator as the
# prop.table(table(...), 2) summary (table() ignores NA by default) and no
# NA bar or NA segment is drawn.
data %>%
  filter(!is.na(pooryn), !is.na(health)) %>%
  group_by(pooryn, health) %>%
  summarize(n = n()) %>%
  # summarize() drops the last grouping level (health), so sum(n) here is
  # the total per poverty group and percent sums to 1 within each bar.
  mutate(percent = n / sum(n)) %>%
  ggplot() +
  geom_col(aes(x = pooryn, y = percent, fill = health))

Step 4
# Mean age for each self-rated health level. Rows whose health is missing
# form their own NA group, and na.rm = TRUE ignores missing ages.
data %>%
  group_by(health) %>%
  summarise(
    avg_age = mean(age, na.rm = TRUE)
  )
## Warning: Factor `health` contains implicit NA, consider using
## `forcats::fct_explicit_na`
## # A tibble: 6 x 2
## health avg_age
## <fct> <dbl>
## 1 Excellent 41.3
## 2 Very Good 46.0
## 3 Good 50.9
## 4 Fair 56.7
## 5 Poor 59.9
## 6 <NA> 54.7
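Based on the table above, average age rises steadily as self-rated health worsens, from 41.3 years for "Excellent" to 59.9 years for "Poor", so older people are more likely to report poorer health.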
Step 5
# Age distribution drawn as one histogram panel per health level, using
# ggplot2's default binning.
ggplot(data = data) +
  geom_histogram(mapping = aes(x = age)) +
  facet_wrap(~health)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Younger people are more likely than older people to report being in good health.