library(tidyverse)
## Loading tidyverse: ggplot2
## Loading tidyverse: tibble
## Loading tidyverse: tidyr
## Loading tidyverse: readr
## Loading tidyverse: purrr
## Loading tidyverse: dplyr
## Conflicts with tidy packages ----------------------------------------------
## filter(): dplyr, stats
## lag():    dplyr, stats
library(haven)
library(dplyr)
library(data.table)
## -------------------------------------------------------------------------
## data.table + dplyr code now lives in dtplyr.
## Please library(dtplyr)!
## -------------------------------------------------------------------------
## 
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last
## The following object is masked from 'package:purrr':
## 
##     transpose
# Read the NHIS extract from the Stata file.
NHIS <- haven::read_dta("NHIS_v2.dta")

Does occupation industry type have an impact on alcohol consumption?

5 Variables

  1. indstrn204
  2. educ
  3. sex
  4. sexorien
  5. marstat

Insights

  1. Health Care and Social Assistance (16), Manufacturing (5), and Retail Trade (7) are the top 3 industries by count (n).
  2. There are slightly more females than males in the NHIS dataset.
  3. Married is the most common marital status (marstat) in the NHIS dataset.

# Keep only the five variables of interest, then preview the first rows.
NHISData <- NHIS %>%
  select(indstrn204, educ, sex, sexorien, marstat)
head(NHISData)
## # A tibble: 6 × 5
##   indstrn204      educ       sex  sexorien   marstat
##    <dbl+lbl> <dbl+lbl> <dbl+lbl> <dbl+lbl> <dbl+lbl>
## 1        NaN        18         1       NaN        50
## 2        NaN         9         2       NaN        11
## 3        NaN        16         2       NaN        50
## 4        NaN        16         2       NaN        50
## 5        NaN        10         1       NaN        11
## 6        NaN        20         2       NaN        11
# Tally rows per industry code, most frequent code first.
# count() only needs indstrn204, so no prior column selection is required;
# sort = TRUE orders the result by descending n.
topindustry <- count(NHISData, indstrn204, sort = TRUE)
head(topindustry)
## # A tibble: 6 × 2
##   indstrn204      n
##    <dbl+lbl>  <int>
## 1        NaN 226953
## 2         16  47672
## 3          5  44080
## 4          7  37855
## 5         15  34969
## 6         18  24151

There is a problem with the data: the most frequent value is NaN, which means the industry code is missing for those records.

# Drop the missing industry code: as.integer() turns NaN into NA,
# and na.omit() then removes the rows that contain NA.
topindustry <- topindustry %>%
  mutate(indstrn204 = as.integer(indstrn204)) %>%
  as.data.table()
topindustry1 <- na.omit(topindustry)
head(topindustry1)
##    indstrn204     n
## 1:         16 47672
## 2:          5 44080
## 3:          7 37855
## 4:         15 34969
## 5:         18 24151
## 6:          0 23220

Now there are no more missing values. I used R’s na.omit function to remove the rows that contained them.

# Tally rows per marital-status code (most frequent first) and store the
# code as a plain integer.
topmarstat <- NHISData %>%
  count(marstat, sort = TRUE) %>%
  mutate(marstat = as.integer(marstat))

# Print a data.table copy so every row is shown; the result is not assigned,
# so topmarstat itself stays a tibble.
as.data.table(topmarstat)
##    marstat      n
## 1:      10 176298
## 2:      50 156790
## 3:      11 105655
## 4:      30  90822
## 5:      20  61975
## 6:      40  21537
## 7:      12   3647
## 8:      99   2806
## 9:      13      7
head(topmarstat)
## # A tibble: 6 × 2
##   marstat      n
##     <int>  <int>
## 1      10 176298
## 2      50 156790
## 3      11 105655
## 4      30  90822
## 5      20  61975
## 6      40  21537

This is great, but what does the code 10 represent?

# Label marital-status code 10 as "Married". Inserting a string coerces the
# whole column to character, so the remaining codes become text.
topmarstat$marstat <- replace(
  topmarstat$marstat,
  topmarstat$marstat == 10,
  "Married"
)
head(topmarstat)
## # A tibble: 6 × 2
##   marstat      n
##     <chr>  <int>
## 1 Married 176298
## 2      50 156790
## 3      11 105655
## 4      30  90822
## 5      20  61975
## 6      40  21537