library(tidyverse)
## Loading tidyverse: ggplot2
## Loading tidyverse: tibble
## Loading tidyverse: tidyr
## Loading tidyverse: readr
## Loading tidyverse: purrr
## Loading tidyverse: dplyr
## Conflicts with tidy packages ----------------------------------------------
## filter(): dplyr, stats
## lag(): dplyr, stats
library(haven)
library(dplyr)
library(data.table)
## -------------------------------------------------------------------------
## data.table + dplyr code now lives in dtplyr.
## Please library(dtplyr)!
## -------------------------------------------------------------------------
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
##
## between, first, last
## The following object is masked from 'package:purrr':
##
## transpose
# Import the NHIS extract from its Stata (.dta) file.
NHIS <- haven::read_dta("NHIS_v2.dta")
Does occupation industry type have an impact on alcohol consumption?
5 Variables
Insights: 1. Health Care and Social Assistance Industries (16), Manufacturing Industries (5), and Retail Trade Industries (7) are the top 3 industries by count (n). 2. There are slightly more females than males in the NHIS dataset. 3. Married is the most common marstat in the NHIS dataset.
# Keep only the five variables used in this analysis, then preview the
# first rows.
NHISData <- NHIS %>%
  select(indstrn204, educ, sex, sexorien, marstat)
head(NHISData)
## # A tibble: 6 × 5
## indstrn204 educ sex sexorien marstat
## <dbl+lbl> <dbl+lbl> <dbl+lbl> <dbl+lbl> <dbl+lbl>
## 1 NaN 18 1 NaN 50
## 2 NaN 9 2 NaN 11
## 3 NaN 16 2 NaN 50
## 4 NaN 16 2 NaN 50
## 5 NaN 10 1 NaN 11
## 6 NaN 20 2 NaN 11
# Count respondents per industry code, most frequent first.
# NHISData already holds only the five analysis columns, so no further
# select() is needed; count(sort = TRUE) orders the result by descending n.
topindustry <- NHISData %>%
  count(indstrn204, sort = TRUE)
head(topindustry)
## # A tibble: 6 × 2
## indstrn204 n
## <dbl+lbl> <int>
## 1 NaN 226953
## 2 16 47672
## 3 5 44080
## 4 7 37855
## 5 15 34969
## 6 18 24151
There is a problem with the data: NaN means the industry code is missing, so the largest "category" in this table is actually missing data.
# Replace the labelled industry column with plain integer codes; NaN
# (missing) codes become NA in the conversion.
topindustry[["indstrn204"]] <- as.integer(topindustry[["indstrn204"]])
# Switch to a data.table, then drop every row that contains an NA.
topindustry <- as.data.table(topindustry)
topindustry1 <- topindustry %>% na.omit()
head(topindustry1)
## indstrn204 n
## 1: 16 47672
## 2: 5 44080
## 3: 7 37855
## 4: 15 34969
## 5: 18 24151
## 6: 0 23220
Now there are no more missing values. I used R’s `na.omit()` function to remove the rows with missing values.
# Count respondents per marital-status code, most frequent first.
# NHISData already holds only the five analysis columns, so no further
# select() is needed; count(sort = TRUE) orders the result by descending n.
topmarstat <- NHISData %>%
  count(marstat, sort = TRUE)
# Replace the labelled column with plain integer codes.
topmarstat[["marstat"]] <- as.integer(topmarstat[["marstat"]])
# Display the table as a data.table. The converted copy is only printed,
# not assigned, so topmarstat itself remains a tibble.
as.data.table(topmarstat)
## marstat n
## 1: 10 176298
## 2: 50 156790
## 3: 11 105655
## 4: 30 90822
## 5: 20 61975
## 6: 40 21537
## 7: 12 3647
## 8: 99 2806
## 9: 13 7
# Preview the six most frequent marital-status codes.
head(topmarstat, n = 6L)
## # A tibble: 6 × 2
## marstat n
## <int> <int>
## 1 10 176298
## 2 50 156790
## 3 11 105655
## 4 30 90822
## 5 20 61975
## 6 40 21537
This is great, but what does the marstat code 10 represent?
# Relabel marital-status code 10 as "Married". Inserting a string into the
# integer column converts the whole column to character, so the codes that
# are not relabelled remain as digit strings (e.g. "50").
topmarstat[["marstat"]] <- replace(
  topmarstat[["marstat"]],
  topmarstat[["marstat"]] == 10L,
  "Married"
)
head(topmarstat)
## # A tibble: 6 × 2
## marstat n
## <chr> <int>
## 1 Married 176298
## 2 50 156790
## 3 11 105655
## 4 30 90822
## 5 20 61975
## 6 40 21537