library(foreign)
library(gtools)
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.3     v purrr   0.3.4
## v tibble  3.0.5     v dplyr   1.0.3
## v tidyr   1.1.2     v stringr 1.4.0
## v readr   1.4.0     v forcats 0.5.0
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(questionr)
## Warning: package 'questionr' was built under R version 4.0.4
library(MASS)
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
library(DataExplorer)

In order to read xpt file, we use the foreign package

library(foreign)
BRFSS_Raw <- read.xport("LLCP2018.XPT")

The raw data contains 437436 rows and 275 columns.

We only keep variables that we need.

The data description is as follow

Record Identification (A)

A1. IYEAR: Interview Year

A2. IMONTH: Interview Month

A3. IDAY: Interview Day

A4. X_STATE: State FIPS Code

Demographics (B) B1. X_AGEG5YR : Reported age in five-year age categories calculated variable

B2. SEX1: Respondents Sex

B3. X_RACE: race/ethnicity value

B4. MARITAL : Marital Status

B5. EDUCA : Education Level

B6. EMPLOY1 : Employment Status

B7. INCOME1 : Income Level

B8. RENTHOM1 : Do you own or rent your home?

Urban status (C)

C1. X_METSTAT: Metropolitan Status

C2. X_URBSTAT: Urban/Rural Status

Lifestyle (D)

D1. X_SMOKER3 Four-level smoker status: Everyday smoker, Someday smoker, Former smoker, Non-smoker

D2. ALCDAY5: Days in past 30 had alcoholic beverage

D3. SLEPTIM1: How Much Time Do You Sleep

D4. EXERANY2: Exercise in Past 30 Days

Health Status (E)

E1. GENHLTH :Would you say that in general your health is:

Healthy Days (F)— Health Related Quality of Life

F1. PHYSHLTH : Now thinking about your physical health, which includes physical illness and injury, for how many days during the past 30 days was your physical health not good?

F2. MENTHLTH : Now thinking about your mental health, which includes stress, depression, and problems with emotions, for how many days during the past 30 days was your mental health not good?

F3. POORHLTH : During the past 30 days, for about how many days did poor physical or mental health keep you from doing your usual activities, such as self-care, work, or recreation?

Health Care Access (G)

G1. HLTHPLN1 : Do you have any kind of health care coverage, including health insurance, prepaid plans such as HMOs, or government plans such as Medicare, or Indian Health Service?)

G2. MEDCOST : Was there a time in the past 12 months when you needed to see a doctor but could not because of cost?

G3. CHECKUP1 : About how long has it been since you last visited a doctor for a routine checkup? [A routine checkup is a general physical exam, not an exam for a specific injury, illness, or condition.]

Chronic Health Conditions (H) H1. ASTHMA3 : Ever Told Had Asthma

H2. CHCCOPD1: (Ever told) you have chronic obstructive pulmonary disease, emphysema or chronic bronchitis?

H3. HAVARTH3 : (Ever told) you have some form of arthritis, rheumatoid arthritis, gout, lupus, or fibromyalgia? (Arthritis diagnoses include: rheumatism, polymyalgia rheumatica; osteoarthritis (not osteporosis); tendonitis, bursitis, bunion, tennis elbow; carpal tunnel syndrome, tarsal tunnel syndrome; joint infection, etc.)

H4. ADDEPEV2 (Ever told) you have a depressive disorder (including depression, major depression, dysthymia, or minor depression)?

H5. CHCKDNY1 : (Ever told) you have kidney disease? (Do NOT include kidney stones, bladder infection or incontinence.)

H6. DIABETE3: (Ever told) you have diabetes (If ´Yes´ and respondent is female, ask ´Was this only when you were pregnant?´. If Respondent says pre-diabetes or borderline diabetes, use response code 4.)

H7. CHCSCNCR : (Ever told) you had skin cancer?

H8. CHCOCNCR : (Ever told) you had any other types of cancer?

H9. CVDSTRK3 : (Ever told) you had a stroke.

Oral Health (I)

I1. LASTDEN4 : Including all types of dentists, such as orthodontists, oral surgeons, and all other dental specialists, as well as dental hygienists, how long has it been since you last visited a dentist or a dental clinic for any reason?

I2. RMVTETH4 : Not including teeth lost for injury or orthodontics, how many of your permanent teeth have been removed because of tooth decay or gum disease?

Obesity (J) J1. X_BMI5CAT : Four-categories of Body Mass Index (BMI)

MICHD (Outcome) (K) K1. X_MICHD : Respondents that have ever reported having coronary heart disease (CHD) or myocardial infarction (MI)

Defining object list of variables to be kept

BRFSS_VarList <- c("X_STATE", "IMONTH", "IDAY", "IYEAR", # Record Identification
                  "X_AGEG5YR","SEX1", "X_RACE", "MARITAL", "EDUCA", "INCOME2", "EMPLOY1", "RENTHOM1",# Demographics
                  "X_METSTAT","X_URBSTAT", # Urban Rural , Metropolitan, Non-metropolitan
                  "ALCDAY5", "X_SMOKER3", "EXERANY2", "SLEPTIM1", # Lifestyle
                  "GENHLTH", # Health Status
                  "MENTHLTH", "PHYSHLTH", "POORHLTH", # Healthy Days - HRQOL
                  "HLTHPLN1", "CHECKUP1", "MEDCOST", # Health Care Access
                  "RMVTETH4", "LASTDEN4", # Oral Health
                  "CVDSTRK3","DIABETE3", "HAVARTH3", "ASTHMA3", "CHCCOPD1", "ADDEPEV2", "CHCSCNCR", "CHCOCNCR", "CHCKDNY1", # Chronic Health Conditions
                  "X_BMI5CAT",# Obesity
                  "X_MICHD")# the outcome

Subseting by varlist

BRFSS_selected <- BRFSS_Raw[BRFSS_VarList]
str(BRFSS_selected)
## 'data.frame':    437436 obs. of  38 variables:
##  $ X_STATE  : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ IMONTH   : chr  "01" "01" "01" "01" ...
##  $ IDAY     : chr  "05" "12" "08" "03" ...
##  $ IYEAR    : chr  "2018" "2018" "2018" "2018" ...
##  $ X_AGEG5YR: num  13 3 12 10 5 13 12 6 8 6 ...
##  $ SEX1     : num  2 2 2 1 2 2 2 2 1 2 ...
##  $ X_RACE   : num  1 2 1 1 1 1 1 1 1 2 ...
##  $ MARITAL  : num  3 5 3 2 1 3 1 1 1 2 ...
##  $ EDUCA    : num  6 6 4 4 6 5 6 6 4 6 ...
##  $ INCOME2  : num  6 4 3 3 99 99 8 99 8 8 ...
##  $ EMPLOY1  : num  2 1 7 7 1 7 5 5 1 1 ...
##  $ RENTHOM1 : num  1 2 1 1 1 1 1 1 1 2 ...
##  $ X_METSTAT: num  2 1 2 1 2 1 1 1 2 1 ...
##  $ X_URBSTAT: num  2 1 1 1 1 1 1 1 2 1 ...
##  $ ALCDAY5  : num  888 202 888 888 888 888 888 201 101 888 ...
##  $ X_SMOKER3: num  4 1 4 4 4 4 4 4 3 4 ...
##  $ EXERANY2 : num  2 1 1 1 2 2 1 1 2 1 ...
##  $ SLEPTIM1 : num  7 5 7 6 7 6 8 7 6 7 ...
##  $ GENHLTH  : num  2 3 5 1 2 2 1 2 2 3 ...
##  $ MENTHLTH : num  88 88 88 88 88 88 88 88 88 88 ...
##  $ PHYSHLTH : num  30 88 10 88 88 88 88 88 88 5 ...
##  $ POORHLTH : num  30 NA 88 NA NA NA NA NA NA 88 ...
##  $ HLTHPLN1 : num  1 2 1 1 1 1 1 1 1 1 ...
##  $ CHECKUP1 : num  1 2 1 1 1 1 1 1 1 1 ...
##  $ MEDCOST  : num  2 1 2 2 2 2 2 2 2 2 ...
##  $ RMVTETH4 : num  1 8 7 1 8 7 8 8 8 8 ...
##  $ LASTDEN4 : num  1 2 1 3 1 1 1 1 1 1 ...
##  $ CVDSTRK3 : num  2 2 1 2 2 2 2 2 2 2 ...
##  $ DIABETE3 : num  3 3 1 3 3 4 3 3 1 3 ...
##  $ HAVARTH3 : num  1 2 2 2 2 2 2 2 2 2 ...
##  $ ASTHMA3  : num  2 2 2 2 2 1 2 2 2 2 ...
##  $ CHCCOPD1 : num  2 2 2 2 2 2 2 2 2 1 ...
##  $ ADDEPEV2 : num  2 2 1 2 2 2 2 2 2 2 ...
##  $ CHCSCNCR : num  1 2 2 2 2 2 2 2 2 2 ...
##  $ CHCOCNCR : num  2 2 2 2 2 2 2 2 2 2 ...
##  $ CHCKDNY1 : num  2 2 2 2 2 2 2 2 2 2 ...
##  $ X_BMI5CAT: num  2 4 3 3 NA 4 2 2 3 3 ...
##  $ X_MICHD  : num  2 2 2 2 2 1 2 2 2 2 ...

The selected data has 38 columns.

This step is mainly for dealing with recoding, missing value, and outliers.

library(DataExplorer)
introduce(BRFSS_selected)
##     rows columns discrete_columns continuous_columns all_missing_columns
## 1 437436      38                3                 35                   0
##   total_missing_values complete_rows total_observations memory_usage
## 1               281851        204754           16622568    132991392

There is 16622568 observations and 281851 missing values.

There are only 204755 complete rows.

plot_intro(BRFSS_selected)

To view missing value distribution for data

plot_missing(BRFSS_selected)

47% of rows in the column POORHLTH is missing.

library(pander)
## Warning: package 'pander' was built under R version 4.0.5
library(forcats)

Record Identification (A) A1. IYEAR

A2. IMONTH

A3. IDAY

A4. X_STATE

A1. IYEAR

BRFSS_selected  %>% 
  group_by(IYEAR) %>% 
  count() 
## # A tibble: 2 x 2
## # Groups:   IYEAR [2]
##   IYEAR      n
##   <chr>  <int>
## 1 2018  418474
## 2 2019   18962

There are 18962 interviews in 2019. We are only interested in interviews in 2018.

There for, all the 2019 records will be eleminated.

BRFSS_A1 <- BRFSS_selected %>% 
  filter(IYEAR == 2018)
BRFSS_A1 <- subset(BRFSS_A1, select = -c(IYEAR))

A2-IMONTH

BRFSS_A1 %>% 
  group_by(IMONTH) %>% 
  count() %>% 
  pander()
IMONTH n
01 15066
02 36716
03 39268
04 35296
05 33657
06 35775
07 37295
08 35999
09 35961
10 40429
11 37420
12 35592

Note: Less interviews were done in January. Recoding

BRFSS_A2 <- BRFSS_A1 %>% 
  mutate(MONTH = case_when(
    IMONTH == "01" ~ "Jan",
    IMONTH == "02" ~ "Fev",
    IMONTH == "03" ~ "Mar",
    IMONTH == "04" ~ "Apr",
    IMONTH == "05" ~ "May",
    IMONTH == "06" ~ "June",                
    IMONTH == "07" ~ "July",
    IMONTH == "08" ~ "Aug",
    IMONTH == "09" ~ "Sept",
    IMONTH == "10" ~ "Oct",
    IMONTH == "11" ~ "Nov",
    IMONTH == "12" ~ "Dec"))

Checking

BRFSS_A2 %>% 
  group_by(MONTH) %>%
  count() %>% 
  pander()
MONTH n
Apr 35296
Aug 35999
Dec 35592
Fev 36716
Jan 15066
July 37295
June 35775
Mar 39268
May 33657
Nov 37420
Oct 40429
Sept 35961

The result appears in the alphabetic order.

Now, we have to reorder these levels.

BRFSS_A2$MONTH <- factor(BRFSS_A2$MONTH,
                      levels = c("Jan", "Fev", "Mar", "Apr", 
                                 "May", "June", "July", "Aug",
                                 "Sept", "Oct", "Nov", "Dec"), ordered = TRUE)
BRFSS_A2 %>% 
  group_by(MONTH) %>%
  count() %>% 
  pander()
MONTH n
Jan 15066
Fev 36716
Mar 39268
Apr 35296
May 33657
June 35775
July 37295
Aug 35999
Sept 35961
Oct 40429
Nov 37420
Dec 35592
BRFSS_A2 <- subset(BRFSS_A2, select = -c(IMONTH))

A3. IDAY

BRFSS_A2 %>% 
  group_by(IDAY) %>% 
  count() %>% 
  pander()
IDAY n
01 11548
02 12494
03 13443
04 12836
05 15427
06 16713
07 16674
08 16012
09 15461
10 15954
11 15153
12 16435
13 15826
14 15602
15 15305
16 14819
17 14992
18 14583
19 15149
20 14379
21 13002
22 12487
23 12591
24 11641
25 11157
26 12414
27 11684
28 10370
29 9955
30 9408
31 4960
BRFSS_A3 <- BRFSS_A2

Demographics (B)

B1. X_AGEG5YR

B2. SEX1 Respondents Sex

B3. X_RACE

B4. MARITAL Marital Status

B5. EDUCA Education Level

B6. EMPLOY1 Employment Status

B7. INCOME1 Income Level

B1. X_AGEG5YR —> AGE

BRFSS_A3%>% 
  group_by(X_AGEG5YR) %>% 
  count() %>% 
  pander()
X_AGEG5YR n
1 24776
2 21237
3 23178
4 24865
5 24330
6 27446
7 33444
8 40294
9 45005
10 45300
11 39298
12 27650
13 33522
14 8129

There are 14 categories (Please look at the data dictionary for the description of each value/label). The label 14 is Don’t know/Refused/Missing (1.94%).

We will regroup the first 13 rows into 7 groups.

BRFSS_B1 <- BRFSS_A2 %>% 
  filter(X_AGEG5YR < 14) %>% 
  mutate(AGE = case_when(
    X_AGEG5YR == 1 ~ "18-29",
    X_AGEG5YR == 2 ~ "18-29",
    X_AGEG5YR == 3 ~ "30-39",
    X_AGEG5YR == 4 ~ "30-39",
    X_AGEG5YR == 5 ~ "40-49",
    X_AGEG5YR == 6 ~ "40-49",                
    X_AGEG5YR == 7 ~ "50-59",
    X_AGEG5YR == 8 ~ "50-59",
    X_AGEG5YR == 9 ~ "60-69",
    X_AGEG5YR == 10 ~ "60-69",
    X_AGEG5YR == 11 ~ "70-79",
    X_AGEG5YR == 12 ~ "70-79",
    X_AGEG5YR == 13 ~ "80+"))

Note: Function recode does not work

BRFSS_B1 %>% 
  group_by(AGE) %>% 
  count() %>% 
  pander()
AGE n
18-29 46013
30-39 48043
40-49 51776
50-59 73738
60-69 90305
70-79 66948
80+ 33522

Removing the column X_AGEG5YR

BRFSS_B1 <- subset(BRFSS_B1, select = -c(X_AGEG5YR))

B2. SEX1 —> GENDER

BRFSS_B1 %>% 
  group_by(SEX1) %>% 
  count()
## # A tibble: 4 x 2
## # Groups:   SEX1 [4]
##    SEX1      n
##   <dbl>  <int>
## 1     1 185889
## 2     2 223701
## 3     7    391
## 4     9    364

For the SEX attribute, we exclude any values of 7 (Don’t know/Not Sure, 0.093% of data) and 9 (refused, 0.089 % of data). Also, we change the column name into GENDER. (401/428964 = 0.093)

BRFSS_B2 <- BRFSS_B1 %>% 
  filter(SEX1 == 1 | SEX1 == 2) %>% 
  mutate(GENDER = case_when(
    SEX1 == 1 ~ "Male",
    SEX1 == 2 ~ "Female"))

Checking

BRFSS_B2 %>% 
  group_by(GENDER) %>% 
  count() %>% 
  pander()
GENDER n
Female 223701
Male 185889
BRFSS_B2 <- subset(BRFSS_B2, select = -c(SEX1))

B3. X_RACE —> RACE

BRFSS_B2 %>% 
  group_by(X_RACE) %>% 
  count() %>% 
  pander()
X_RACE n
1 307499
2 32980
3 7138
4 9233
5 1947
6 2940
7 8064
8 33378
9 6410
NA 1
BRFSS_B3 <- BRFSS_B2 %>% 
  filter(X_RACE < 9 ) %>% 
  mutate(RACE = case_when(
    X_RACE == 1 ~ "White",
    X_RACE == 2 ~ "Black",
    X_RACE == 3 ~ "AIAN",
    X_RACE == 4 ~ "Asian",
    X_RACE == 5 ~ "Others",
    X_RACE == 6 ~ "Others",                
    X_RACE == 7 ~ "Others",
    X_RACE == 8 ~ "Hispanic"))
BRFSS_B3 %>% 
  group_by(RACE) %>% 
  count() %>% 
  pander()
RACE n
AIAN 7138
Asian 9233
Black 32980
Hispanic 33378
Others 12951
White 307499
BRFSS_B3$RACE <- factor(BRFSS_B3$RACE,
                      levels = c("AIAN",
                                 "Asian",
                                 "Black",
                                 "Hispanic",
                                 "White",
                                 "Others"), ordered = TRUE)
BRFSS_B3  %>% 
  group_by(RACE) %>% 
  count() %>% 
  pander()
RACE n
AIAN 7138
Asian 9233
Black 32980
Hispanic 33378
White 307499
Others 12951
BRFSS_B3 <- subset(BRFSS_B3, select = -c(X_RACE))

B4. MARITAL attribute

BRFSS_B3 %>% 
  group_by(MARITAL) %>% 
  count() %>% 
  pander()
MARITAL n
1 206746
2 55641
3 47879
4 8720
5 68716
6 13848
9 1627
NA 2
BRFSS_B4 <- BRFSS_B3 %>% 
  filter(MARITAL < 9) %>% 
  mutate(
    RELP = case_when(
     MARITAL == 1 ~ "In relationship",
     MARITAL == 6 ~ "In relationship",
     MARITAL == 2 ~ "Living apart",
     MARITAL == 4 ~ "Living apart",
     MARITAL == 3 ~ "Widowed",
     MARITAL == 5 ~ "Never married")
    )

Checking

BRFSS_B4 %>% 
  group_by(RELP) %>% 
  count() %>% 
  pander()
RELP n
In relationship 220594
Living apart 64361
Never married 68716
Widowed 47879
BRFSS_B4$RELP <- factor(BRFSS_B4$RELP,
                      levels = c("In relationship",
                                 "Living apart",
                                 "Never married",
                                 "Widowed"), ordered = TRUE)
BRFSS_B4 %>% 
  group_by(RELP) %>% 
  count() %>% 
  pander()
RELP n
In relationship 220594
Living apart 64361
Never married 68716
Widowed 47879
BRFSS_B4 <- subset(BRFSS_B4, select = -c(MARITAL))

B5. EDUCA attribute ===> EDU

BRFSS_B4 %>% 
  group_by(EDUCA) %>% 
  count()
## # A tibble: 7 x 2
## # Groups:   EDUCA [7]
##   EDUCA      n
##   <dbl>  <int>
## 1     1    587
## 2     2   9483
## 3     3  19772
## 4     4 110163
## 5     5 110733
## 6     6 150154
## 7     9    658
BRFSS_B5 <- BRFSS_B4 %>%
  filter(EDUCA < 9) %>% 
  mutate(EDU = case_when(
           EDUCA == 1 ~ "<8",
           EDUCA == 2 ~ "<8",
           EDUCA == 3 ~ "9-12",
           EDUCA == 4 ~ "9-12",
           EDUCA == 5 ~ "12-15",
           EDUCA == 6 ~ "16+"))
BRFSS_B5 %>% 
  group_by(EDU) %>% 
  count()
## # A tibble: 4 x 2
## # Groups:   EDU [4]
##   EDU        n
##   <chr>  <int>
## 1 <8     10070
## 2 12-15 110733
## 3 16+   150154
## 4 9-12  129935
BRFSS_B5$EDU <- factor(BRFSS_B5$EDU,
                      levels = c("<8",
                                 "9-12",
                                 "12-15",
                                 "16+"), ordered = TRUE)
BRFSS_B5 %>% 
  group_by(EDU) %>% 
  count() %>% 
  pander()
EDU n
<8 10070
9-12 129935
12-15 110733
16+ 150154
BRFSS_B5 <- subset(BRFSS_B5, select = -c(EDUCA))

B6. EMPLOY1 ===> EMPL

Looking at Employment and age.

BRFSS_B5 %>% 
  group_by(EMPLOY1) %>% 
  count()
## # A tibble: 10 x 2
## # Groups:   EMPLOY1 [10]
##    EMPLOY1      n
##      <dbl>  <int>
##  1       1 165540
##  2       2  36094
##  3       3   7455
##  4       4   7783
##  5       5  19506
##  6       6  10717
##  7       7 120524
##  8       8  29906
##  9       9   2380
## 10      NA    987

The category Refused (9) represente 2380/400892 = 0.59% of the total number of respondents

BRFSS_B6 <- BRFSS_B5 %>% 
  filter(EMPLOY1 < 9) %>% 
  mutate(EMPL = case_when(
    EMPLOY1 == 1 ~ "Employed",
    EMPLOY1 == 2 ~ "Employed",
    EMPLOY1 == 3 ~ "Unemployed",
    EMPLOY1 == 4 ~ "Unemployed",
    EMPLOY1 == 5 ~ "Occupations",
    EMPLOY1 == 6 ~ "Occupations",
    EMPLOY1 == 7 ~ "Retired",
    EMPLOY1 == 8 ~ "Unemployed"))

occupation: studen and homemakers

Checking

BRFSS_B6 %>% 
  group_by(EMPL) %>% 
  count()
## # A tibble: 4 x 2
## # Groups:   EMPL [4]
##   EMPL             n
##   <chr>        <int>
## 1 Employed    201634
## 2 Occupations  30223
## 3 Retired     120524
## 4 Unemployed   45144
BRFSS_B6$EMPL <- factor(BRFSS_B6$EMPL,
                      levels = c("Employed",
                                 "Unemployed",
                                 "Retired",
                                 "Occupations"), ordered = TRUE)
BRFSS_B6 %>% 
  group_by(EMPL) %>% 
  count() %>% 
  pander()
EMPL n
Employed 201634
Unemployed 45144
Retired 120524
Occupations 30223
BRFSS_B6 <- subset(BRFSS_B6, select = -c(EMPLOY1))

B7. INCOME2 attribute

BRFSS_B6 %>% 
  group_by(INCOME2) %>% 
  count()
## # A tibble: 11 x 2
## # Groups:   INCOME2 [11]
##    INCOME2      n
##      <dbl>  <int>
##  1       1  14954
##  2       2  16579
##  3       3  23764
##  4       4  29677
##  5       5  35224
##  6       6  46109
##  7       7  54438
##  8       8 114372
##  9      77  29830
## 10      99  30016
## 11      NA   2562

77 - Don’t know/Not sure

99 - Refused

BRFSS_B7 <- BRFSS_B6 %>%
  filter(INCOME2 < 77) %>% 
  mutate(INCOME = case_when(
    INCOME2 == 1 ~ "<$20K",
    INCOME2 == 2 ~ "<$20K",
    INCOME2 == 3 ~ "<$20K",
    INCOME2 == 4 ~ "$20K-<$35K",
    INCOME2 == 5 ~ "$20K-<$35K",
    INCOME2 == 6 ~ "$35K-<$50K",
    INCOME2 == 7 ~ "$50K-<$75K",
    INCOME2 == 8 ~ "$75K+"))
BRFSS_B7 %>% 
  group_by(INCOME) %>% 
  count() %>% 
  pander()
INCOME n
$20K-<$35K 64901
$35K-<$50K 46109
$50K-<$75K 54438
$75K+ 114372
<$20K 55297
BRFSS_B7$INCOME <- factor(BRFSS_B7$INCOME,
                      levels = c("<$20K",
                                 "$20K-<$35K",
                                 "$35K-<$50K",
                                 "$50K-<$75K",
                                 "$75K+"), ordered = TRUE)
BRFSS_B7 %>% 
  group_by(INCOME) %>% 
  count() %>% 
  pander()
INCOME n
<$20K 55297
$20K-<$35K 64901
$35K-<$50K 46109
$50K-<$75K 54438
$75K+ 114372
BRFSS_B7 <- subset(BRFSS_B7, select = -c(INCOME2))

B8. RENTHOM1 ==> RENTHOME

Do you own or rent your home?

BRFSS_B7 %>% 
  group_by(RENTHOM1) %>% 
  count() %>% 
  pander()
RENTHOM1 n
1 238586
2 82289
3 13595
7 432
9 215
BRFSS_B8 <- BRFSS_B7 %>%
  filter(RENTHOM1 <= 3) %>% 
  mutate(RENTHOME = case_when(
    RENTHOM1 == 1 ~ "Own",
    RENTHOM1 == 2 ~ "Rent",
    RENTHOM1 == 3 ~ "Other"))
BRFSS_B8 %>% 
  group_by(RENTHOME) %>% 
  count() %>% 
  pander()
RENTHOME n
Other 13595
Own 238586
Rent 82289
BRFSS_B8 <- subset(BRFSS_B8, select = -c(RENTHOM1))

Urban Rural (C)

**C1. _METSTAT**

C2. X_URBSTAT —> Community

BRFSS_B8 %>% 
  group_by(X_METSTAT) %>% 
  count()
## # A tibble: 3 x 2
## # Groups:   X_METSTAT [3]
##   X_METSTAT      n
##       <dbl>  <int>
## 1         1 226904
## 2         2 103903
## 3        NA   3663

3663 records: Not define or missing

BRFSS_B8 %>% 
  group_by(X_URBSTAT) %>% 
  count()
## # A tibble: 3 x 2
## # Groups:   X_URBSTAT [3]
##   X_URBSTAT      n
##       <dbl>  <int>
## 1         1 280125
## 2         2  50682
## 3        NA   3663
BRFSS_C1 <- BRFSS_B8 %>% 
  filter(X_METSTAT == 1 | X_METSTAT == 2|X_URBSTAT== 1 | X_URBSTAT == 2) %>% 
  mutate(COMMUN = case_when(
    X_METSTAT == 1 & X_URBSTAT == 1 ~ "Urban",
    X_METSTAT == 2 & X_URBSTAT == 1 ~ "Suburban",
    X_URBSTAT == 2 ~ "Rural" ))
BRFSS_C1 %>% 
  group_by(COMMUN) %>% 
  count() %>% 
  pander()
COMMUN n
Rural 50682
Suburban 53221
Urban 226904
BRFSS_C1 <- subset(BRFSS_C1, select = -c(X_METSTAT, X_URBSTAT))

D.Lifestyle

D1. X_SMOKER3 Four-level smoker status: Everyday smoker, Someday smoker, Former smoker, Non-smoker

D2. ALCDAY5 Days in past 30 had alcoholic beverage

D3. SLEPTIM1 How Much Time Do You Sleep

D4. EXERANY2 Exercise in Past 30 Days

D1. X_SMOKER3 ===> SMOKESTAT

BRFSS_C1 %>% 
 group_by(X_SMOKER3) %>% 
  count()
## # A tibble: 5 x 2
## # Groups:   X_SMOKER3 [5]
##   X_SMOKER3      n
##       <dbl>  <int>
## 1         1  34756
## 2         2  13898
## 3         3  93590
## 4         4 181708
## 5         9   6855
BRFSS_D1 <- BRFSS_C1 %>% 
  filter(X_SMOKER3 < 9) %>% 
  mutate(SMOKESTAT = case_when(
    X_SMOKER3 == 1 ~ "Daily smoker",
    X_SMOKER3 == 2 ~ "Someday smoker",
    X_SMOKER3 == 3 ~ "Former smoker",
    X_SMOKER3 == 4 ~ "Never smoker"
  ))
BRFSS_D1 %>% 
  group_by(SMOKESTAT) %>% 
  count() %>% 
  pander()
SMOKESTAT n
Daily smoker 34756
Former smoker 93590
Never smoker 181708
Someday smoker 13898

Columns need to be reordered

BRFSS_D1$SMOKESTAT <- factor(BRFSS_D1$SMOKESTAT,
                      levels = c("Daily smoker",
                                 "Someday smoker",
                                 "Former smoker",
                                 "Never smoker"), ordered = TRUE)
BRFSS_D1 %>% 
  group_by(SMOKESTAT) %>% 
  count() %>% 
  pander()
SMOKESTAT n
Daily smoker 34756
Someday smoker 13898
Former smoker 93590
Never smoker 181708
BRFSS_D1 <- subset(BRFSS_D1, select = -c(X_SMOKER3))

D2. ALCDAY5

Should we create only two levels: drinker and non-drinker ? weakly drinker

How many drink per week you have

BRFSS_D1 %>% 
  group_by(ALCDAY5) %>% 
  count() %>% 
  pander()
ALCDAY5 n
101 14082
102 11485
103 8158
104 3928
105 3747
106 1256
107 6197
201 25700
202 19546
203 11411
204 9555
205 9115
206 3531
207 2468
208 2810
209 217
210 7438
211 46
212 1536
213 87
214 472
215 5866
216 247
217 71
218 207
219 11
220 5975
221 184
222 137
223 55
224 196
225 2603
226 134
227 207
228 723
229 290
230 13156
777 2170
888 146951
999 584
NA 1400

Collect only monthly drinker

888 : no drink

BRFSS_D2 <- BRFSS_D1 %>% 
  filter(ALCDAY5 < 777 | ALCDAY5 == 888) %>% 
  mutate(ALC = case_when(
    ALCDAY5 == 888 ~ 200,
    ALCDAY5 == 101 ~ 204,
    ALCDAY5 == 102 ~ 208,
    ALCDAY5 == 103 ~ 212,
    ALCDAY5 == 104 ~ 216,
    ALCDAY5 == 105 ~ 220,
    ALCDAY5 == 106 ~ 224,
    ALCDAY5 == 107 ~ 228,
    ALCDAY5 == 201 ~ 201,
    ALCDAY5 == 202 ~ 202,
    ALCDAY5 == 203 ~ 203,
    ALCDAY5 == 204 ~ 204,
    ALCDAY5 == 205 ~ 205,
    ALCDAY5 == 206 ~ 206,
    ALCDAY5 == 207 ~ 207,
    ALCDAY5 == 208 ~ 208,
    ALCDAY5 == 209 ~ 209,
    ALCDAY5 == 210 ~ 210,
    ALCDAY5 == 211 ~ 211,
    ALCDAY5 == 212 ~ 212,
    ALCDAY5 == 213 ~ 213,
    ALCDAY5 == 214 ~ 214,
    ALCDAY5 == 215 ~ 215,
    ALCDAY5 == 216 ~ 216,
    ALCDAY5 == 217 ~ 207,
    ALCDAY5 == 218 ~ 218,
    ALCDAY5 == 219 ~ 219,
    ALCDAY5 == 220 ~ 220,
    ALCDAY5 == 221 ~ 221,
    ALCDAY5 == 222 ~ 222,
    ALCDAY5 == 223 ~ 223,
    ALCDAY5 == 224 ~ 224,
    ALCDAY5 == 225 ~ 225,
    ALCDAY5 == 226 ~ 226,
    ALCDAY5 == 227 ~ 227,
    ALCDAY5 == 228 ~ 228,
    ALCDAY5 == 229 ~ 229,
    ALCDAY5 == 230 ~ 230))
BRFSS_D2 %>%
  group_by(ALC) %>% 
  count()
## # A tibble: 30 x 2
## # Groups:   ALC [30]
##      ALC      n
##    <dbl>  <int>
##  1   200 146951
##  2   201  25700
##  3   202  19546
##  4   203  11411
##  5   204  23637
##  6   205   9115
##  7   206   3531
##  8   207   2539
##  9   208  14295
## 10   209    217
## # ... with 20 more rows

Now we need to subtract the value of the whole column by 200

BRFSS_D2a <- BRFSS_D2 %>% 
  mutate(ALCMONTH = ALC - 200)

BRFSS_D2a %>%
  group_by(ALCMONTH) %>% 
  count()
## # A tibble: 30 x 2
## # Groups:   ALCMONTH [30]
##    ALCMONTH      n
##       <dbl>  <int>
##  1        0 146951
##  2        1  25700
##  3        2  19546
##  4        3  11411
##  5        4  23637
##  6        5   9115
##  7        6   3531
##  8        7   2539
##  9        8  14295
## 10        9    217
## # ... with 20 more rows
ggplot(data = BRFSS_D2a, aes(ALCMONTH)) + 
  geom_bar(stat= "count", 
                 fill= "blue", 
                 alpha = .75) + 
  coord_cartesian(clip = "off") +
  labs( x = "Number of days",
        y = "Frequency",
        subtitle = "How many days per month did you have at least one drink during the past 30 days?",
        caption = "Data source: BRFSS2018") + 
  scale_y_continuous(limits = c(0, 150000), expand = c(0, 0)) +
  scale_x_continuous(limits = c(-1, 31), expand = c(0, 0)) +
  theme(
       axis.text = element_text(size = 16),
       axis.title = element_text(size = 12,face="bold"),
       plot.title = element_text(color = "blue", size = 16, face = "bold"),
       plot.subtitle = element_text(color = "navyblue", size = 12),
       plot.caption = element_text(color = "grey",size = 16, face = "italic")
)

BRFSS_D2b <-  BRFSS_D2a %>% 
  mutate(ALCpa30 = if_else(ALCMONTH == 0, "0", 
                           if_else(ALCMONTH >= 1 & ALCMONTH < 10, "1-9","10-30")))
BRFSS_D2b %>% 
  group_by(ALCpa30) %>% 
  count() %>% 
  pander()
ALCpa30 n
0 146951
1-9 109991
10-30 62856
BRFSS_D2b <- subset(BRFSS_D2b, select = -c(ALCDAY5, ALC, ALCMONTH))

D3. SLEPTIM1 (Numeric variable)

On average, how many hours of sleep do you get in a 24-hour period?

BRFSS_D2b %>% 
  group_by(SLEPTIM1) %>% 
  count()
## # A tibble: 26 x 2
## # Groups:   SLEPTIM1 [26]
##    SLEPTIM1     n
##       <dbl> <int>
##  1        1   588
##  2        2   952
##  3        3  2251
##  4        4  9024
##  5        5 21089
##  6        6 70453
##  7        7 96923
##  8        8 91379
##  9        9 13913
## 10       10  7187
## # ... with 16 more rows

The categories IDK/INS (77) ans refused (99) counts for (3552+270)/389181 = 0.98 % of data == > remove

BRFSS_D3 <- BRFSS_D2b %>% 
  filter(SLEPTIM1 < 77)
BRFSS_D3 %>% 
  group_by(SLEPTIM1) %>% 
  count() %>% 
  pander()
SLEPTIM1 n
1 588
2 952
3 2251
4 9024
5 21089
6 70453
7 96923
8 91379
9 13913
10 7187
11 449
12 2223
13 109
14 272
15 226
16 249
17 23
18 124
19 1
20 78
21 8
22 12
23 8
24 32
BRFSS_D3 %>% 
  filter(SLEPTIM1 > 13) %>% 
  count()
##      n
## 1 1033

This number represent 1033/317573 = 0.32%

BRFSS_D3 %>% 
  filter(SLEPTIM1 < 4) %>% 
  count()
##      n
## 1 3791

This number represent 3791/317573 = 1.19% of respondents.

ggplot(data = BRFSS_D3, aes(SLEPTIM1)) + 
  geom_bar(stat= "count", 
                 fill= "blue", 
                 alpha = .75) +  
  coord_cartesian(clip = "off") +
  labs( x = "Number of hours",
        y = "Frequency",
        subtitle = "On average, how many hours of sleep do you get in a 24-hour period?",
        caption = "Data source: BRFSS2018") + 
  scale_y_continuous(limits = c(0, 100000), expand = c(0, 0)) +
  scale_x_continuous(limits = c(-1, 25), expand = c(0, 0)) +
  theme(
       axis.text = element_text(size = 18),
       axis.title = element_text(size =18 ,face="bold"),
       plot.subtitle = element_text(color = "navyblue", size = 18),
       plot.caption = element_text(color = "grey",size = 18, face = "italic")
)

boxplot(BRFSS_D3$SLEPTIM1,
        ylim= c(0,30),
        names = c("days"),
        col = c("blue"),
        ylab = "Duration (hours)", 
        border = c("red"),
        boxwex = 0.5)

Removing all the record where pp sleep more than 12h and less than 3 hours

BRFSS_D3a <- BRFSS_D3 %>% 
  filter(SLEPTIM1 < 13 & SLEPTIM1 > 2)
boxplot(BRFSS_D3a$SLEPTIM1,
        ylim= c(0,30),
        names = c("days"),
        col = c("blue"),
        main = "Distribution of sleep duration", 
        ylab = "Duration (s)", 
        border = c("red"),
        boxwex = 0.5)

ggplot(data = BRFSS_D3a, aes(SLEPTIM1)) + 
  geom_bar(stat= "count", 
                 fill= "blue", 
                 alpha = .75) +  
  coord_cartesian(clip = "off") +
  labs( x = "Number of hours",
        y = "Frequency",
        subtitle = "On average, how many hours of sleep do you get in a 24-hour period?",
        caption = "Data source: BRFSS2018") + 
  scale_y_continuous(limits = c(0, 100000), expand = c(0, 0)) +
  scale_x_continuous(limits = c(0, 15), expand = c(0, 0)) +
  theme(
       axis.text = element_text(size = 12),
       axis.title = element_text(size = ,face="bold"),
       plot.subtitle = element_text(color = "navyblue", size = 12),
       plot.caption = element_text(color = "grey",size = 12, face = "italic")
)

BRFSS_D3a %>% 
  summarise(mean_slepduratrion =  mean(SLEPTIM1),
            median_sleepduration = median(SLEPTIM1),
            sd_sleepduration =  sd(SLEPTIM1))
##   mean_slepduratrion median_sleepduration sd_sleepduration
## 1           7.015783                    7         1.317316

Testing the normal distribution

qqnorm(BRFSS_D3a$SLEPTIM1, pch = 1, frame = FALSE)
qqline(BRFSS_D3a$SLEPTIM1, col = "steelblue", lwd = 2)

ggplot(BRFSS_D3a, aes(x=SLEPTIM1)) + 
  geom_histogram(binwidth = 1,
                 color = "blue", 
                 fill = "white") +
  geom_vline(aes(xintercept = mean(SLEPTIM1)),
             color = "red", 
             linetype = "dashed",
             size = 1) +
  labs(x = "Number of hours",
       y = "Frequency",
       subtitle = "On average, how many hours of sleep do you get in a 24-hour period?",
       caption = "Data source: BRFSS2018") + 
  scale_y_continuous(limits = c(0, 110000), expand = c(0, 0)) +
  scale_x_continuous(limits = c(0, 15), expand = c(0, 0)) +
  theme(
       axis.text = element_text(size =14),
       axis.title = element_text(size = ,face="bold"),
       plot.title = element_text(color = "blue", size = 14, face = "bold"),
       plot.subtitle = element_text(color = "navyblue", size = 14),
       plot.caption = element_text(color = "grey",size = 14, face = "italic")
)
## Warning: Removed 2 rows containing missing values (geom_bar).

D4. EXERANY2

Exercise in Past 30 Days

BRFSS_D3a %>% 
  group_by(EXERANY2) %>% 
  count()
## # A tibble: 4 x 2
## # Groups:   EXERANY2 [4]
##   EXERANY2      n
##      <dbl>  <int>
## 1        1 240386
## 2        2  74238
## 3        7    227
## 4        9     40

The categories IDK/INS (77) ans refused (99) counts for (40+227)/314891 = 0.12 % of data ==> remove

BRFSS_D4 <- BRFSS_D3a %>% 
  filter(EXERANY2 == 1 | EXERANY2 == 2) %>% 
  mutate(EXERpa30 = case_when(
    EXERANY2 == 1 ~ "Yes",
    EXERANY2 == 2 ~ "No"
    ))
BRFSS_D4 %>% 
  group_by(EXERpa30) %>% 
  count() %>% 
  pander()
EXERpa30 n
No 74238
Yes 240386
BRFSS_D4 <- subset(BRFSS_D4, select = -c(EXERANY2))

E.Health Status

E1. GENHLTH

BRFSS_E1 <- BRFSS_D4  %>% 
  filter(GENHLTH < 7) 

BRFSS_E1$GENHLTH[BRFSS_E1$GENHLTH == 1] <- "Excellent"
BRFSS_E1$GENHLTH[BRFSS_E1$GENHLTH == 2] <- "Very Good"
BRFSS_E1$GENHLTH[BRFSS_E1$GENHLTH == 3] <- "Good"
BRFSS_E1$GENHLTH[BRFSS_E1$GENHLTH == 4] <- "Fair"
BRFSS_E1$GENHLTH[BRFSS_E1$GENHLTH == 5] <- "Poor"
BRFSS_E1 %>%
  group_by(GENHLTH) %>% 
  count()
## # A tibble: 5 x 2
## # Groups:   GENHLTH [5]
##   GENHLTH        n
##   <chr>      <int>
## 1 Excellent  52772
## 2 Fair       41412
## 3 Good       98141
## 4 Poor       14961
## 5 Very Good 106871
BRFSS_E1$GENHLTH <- factor(BRFSS_E1$GENHLTH,
                                levels =  c("Poor",
                                            "Fair",
                                            "Good",
                                            "Very Good",
                                            "Excellent"), ordered = TRUE)
BRFSS_E1 %>%
  group_by(GENHLTH) %>% 
  count() %>% 
  pander()
GENHLTH n
Poor 14961
Fair 41412
Good 98141
Very Good 106871
Excellent 52772

F.Healthy Days (Health Related Quality of Life, HRQL)

F1. PHYSHLTH (Number of Days Physical Health Not Good)

F2. MENTHLTH (Number of Days Mental Health Not Good)

F3. POORHLTH (Poor Physical or Mental Health)- note 205475 recored are missing

E1. PHYSHLTH

Now thinking about your physical health, which includes physical illness and injury, for how many days during the past 30 days was your physical health not good?

BRFSS_E1 %>% 
  group_by(PHYSHLTH) %>% 
  count()
## # A tibble: 34 x 2
## # Groups:   PHYSHLTH [34]
##    PHYSHLTH     n
##       <dbl> <int>
##  1        1 13842
##  2        2 18297
##  3        3 10596
##  4        4  5547
##  5        5  9454
##  6        6  1607
##  7        7  5505
##  8        8  1052
##  9        9   276
## 10       10  6964
## # ... with 24 more rows
BRFSS_F1 <- BRFSS_E1 %>% 
  filter(PHYSHLTH < 77 | PHYSHLTH == 88)
BRFSS_F1$PHYSHLTH[BRFSS_F1$PHYSHLTH == 88] <- 0
BRFSS_F1 %>% 
  group_by(PHYSHLTH) %>% 
  count()
## # A tibble: 31 x 2
## # Groups:   PHYSHLTH [31]
##    PHYSHLTH      n
##       <dbl>  <int>
##  1        0 195129
##  2        1  13842
##  3        2  18297
##  4        3  10596
##  5        4   5547
##  6        5   9454
##  7        6   1607
##  8        7   5505
##  9        8   1052
## 10        9    276
## # ... with 21 more rows
ggplot(data = BRFSS_F1, aes(PHYSHLTH)) + 
  geom_bar(stat= "count", 
                 fill= "purple", 
                 alpha = .75) +  
  coord_cartesian(clip = "off") +
  labs( x = "Number of days",
        y = "Frequency",
        subtitle = "How many days during the past 30 days was your physical health not good?",
        caption = "Data source: BRFSS2018") + 
  scale_y_continuous(limits = c(0, 200000), expand = c(0, 0)) +
  scale_x_continuous(limits = c(-1, 31), expand = c(0, 0)) +
  theme(
       axis.text = element_text(size = 18),
       axis.title = element_text(size = ,face="bold"),
       plot.caption = element_text(color = "grey",size = 18, face = "italic")
)

Once again, 10, 15, 20 and 30 seems to be a magic number (as we have seen in the drinking varaible ALC5)

boxplot(BRFSS_F1$PHYSHLTH,
        ylim= c(0,30),
        names = c("days"),
        col = c("blue"),
        main = "Distribution of days", 
        ylab = "Duration (s)", 
        border = c("red"),
        boxwex = 0.5)

regrouping

BRFSS_F1a <-  BRFSS_F1 %>% 
  mutate(PHYSHLTHpa30 = if_else(PHYSHLTH == 0, "0", 
                           if_else(PHYSHLTH >= 1 & PHYSHLTH < 10, "1-9","10-30")))
BRFSS_F1a %>% 
  group_by(PHYSHLTHpa30) %>% 
  count() %>% 
  pander()
PHYSHLTHpa30 n
0 195129
1-9 66176
10-30 48681
BRFSS_F1a <- subset(BRFSS_F1a, select = -c(PHYSHLTH))

F2. MENTHLTH

(Number of Days Mental Health Not Good)

Now thinking about your mental health, which includes stress, depression, and problems with emotions, for how many days during the past 30 days was your mental health not good?

BRFSS_F1a %>% 
  group_by(MENTHLTH) %>% 
  count()
## # A tibble: 33 x 2
## # Groups:   MENTHLTH [33]
##    MENTHLTH     n
##       <dbl> <int>
##  1        1 10546
##  2        2 16040
##  3        3  9669
##  4        4  4957
##  5        5 11826
##  6        6  1255
##  7        7  4646
##  8        8   968
##  9        9   137
## 10       10  8723
## # ... with 23 more rows
BRFSS_F2 <- BRFSS_F1a %>%
  filter(MENTHLTH < 77 | MENTHLTH == 88)

Converting 88 to 0

BRFSS_F2$MENTHLTH[BRFSS_F2$MENTHLTH == 88] <- 0

Checking

BRFSS_F2 %>% 
  group_by(MENTHLTH) %>% 
  count()
## # A tibble: 31 x 2
## # Groups:   MENTHLTH [31]
##    MENTHLTH      n
##       <dbl>  <int>
##  1        0 203554
##  2        1  10546
##  3        2  16040
##  4        3   9669
##  5        4   4957
##  6        5  11826
##  7        6   1255
##  8        7   4646
##  9        8    968
## 10        9    137
## # ... with 21 more rows
ggplot(data = BRFSS_F2, aes(MENTHLTH)) + 
  geom_bar(stat= "count", 
                 fill= "red", 
                 alpha = .75) +  
  coord_cartesian(clip = "off") +
  labs( x = "Number of days",
        y = "Frequency",
        subtitle = "How many days during the past 30 days was your mental health not good?",
        caption = "Data source: BRFSS2018") + 
  scale_y_continuous(limits = c(0, 250000), expand = c(0, 0)) +
  scale_x_continuous(limits = c(-1, 31), expand = c(0, 0)) +
  theme(
       axis.text = element_text(size = 18),
       plot.title = element_text(color = "blue", size = 20, face = "bold"),
       plot.subtitle = element_text(color = "navyblue", size = 18),
       plot.caption = element_text(color = "grey",size = 18, face = "italic")
)

ggplot(data = BRFSS_F2, aes(MENTHLTH)) + 
  geom_bar(stat= "count", 
                 fill= "red", 
                 alpha = .75) +  
  coord_cartesian(clip = "off") +
  labs( x = "Number of days",
        y = "Frequency",
        subtitle = "How many days during the past 30 days was your mental health not good?",
        caption = "Data source: BRFSS2018") + 
  scale_y_continuous(limits = c(0, 20000), expand = c(0, 0)) +
  scale_x_continuous(limits = c(1, 31), expand = c(0, 0)) +
  theme(
       axis.text = element_text(size = 18),
       plot.title = element_text(color = "blue", size = 20, face = "bold"),
       plot.subtitle = element_text(color = "navyblue", size = 18),
       plot.caption = element_text(color = "grey",size = 18, face = "italic")
)
## Warning: Removed 203554 rows containing non-finite values (stat_count).
## Warning: Removed 1 rows containing missing values (geom_bar).

Regrouping

BRFSS_F2a <-  BRFSS_F2 %>% 
  mutate(MENTHLTHpa30 = if_else(MENTHLTH == 0, "0", 
                           if_else(MENTHLTH >= 1 & MENTHLTH < 10, "1-9","10-30")))
BRFSS_F2a %>% 
  group_by(MENTHLTHpa30) %>% 
  count() %>% 
  pander()
MENTHLTHpa30 n
0 203554
1-9 60044
10-30 43905
BRFSS_F2a <- subset(BRFSS_F2a, select = -c(MENTHLTH))

F3. POORHLTH

BRFSS_F2a %>% 
  group_by(POORHLTH) %>% 
  count()
## # A tibble: 34 x 2
## # Groups:   POORHLTH [34]
##    POORHLTH     n
##       <dbl> <int>
##  1        1  8603
##  2        2  9562
##  3        3  6094
##  4        4  3368
##  5        5  6546
##  6        6  1101
##  7        7  3198
##  8        8   810
##  9        9   181
## 10       10  5273
## # ... with 24 more rows
sum(is.na(BRFSS_F2a$POORHLTH))
## [1] 147173

We should not use this attribute.

BRFSS_F2a <- subset(BRFSS_F2a, select = -c(POORHLTH))

G. Health Care Access

G1. HLTHPLN1 (Have any health care coverage)

G2. MEDCOST (Could Not See Doctor Because of Cost)

G3. CHECKUP1

G1. HLTHPLN1

(Have any health care coverage) —> HLTHPLN

BRFSS_F2a %>% 
  group_by(HLTHPLN1) %>% 
  count()
## # A tibble: 4 x 2
## # Groups:   HLTHPLN1 [4]
##   HLTHPLN1      n
##      <dbl>  <int>
## 1        1 283678
## 2        2  23233
## 3        7    459
## 4        9    133
BRFSS_G1 <- BRFSS_F2a %>% 
  filter(HLTHPLN1 == 1 | HLTHPLN1 == 2) %>% 
  mutate(HLTHPLN = case_when(
    HLTHPLN1 == 1 ~ "Yes",
    HLTHPLN1 == 2 ~ "No"))
BRFSS_G1 %>% 
  group_by(HLTHPLN) %>% 
  count()
## # A tibble: 2 x 2
## # Groups:   HLTHPLN [2]
##   HLTHPLN      n
##   <chr>    <int>
## 1 No       23233
## 2 Yes     283678
BRFSS_G1 <- subset(BRFSS_G1, select = -c(HLTHPLN1))

G2.MEDCOST

(Could Not See Doctor Because of Cost) —> MEDICOST

BRFSS_G1 %>% 
  group_by(MEDCOST) %>% 
  count()
## # A tibble: 5 x 2
## # Groups:   MEDCOST [5]
##   MEDCOST      n
##     <dbl>  <int>
## 1       1  30826
## 2       2 275050
## 3       7    359
## 4       9     47
## 5      NA    629
BRFSS_G2 <- BRFSS_G1 %>% 
  filter(MEDCOST == 1 | MEDCOST == 2) %>% 
  mutate(MEDICOST= case_when(
    MEDCOST == 1 ~ "Yes",
    MEDCOST == 2 ~ "No"))
BRFSS_G2 %>% 
  group_by(MEDICOST) %>% 
  count()
## # A tibble: 2 x 2
## # Groups:   MEDICOST [2]
##   MEDICOST      n
##   <chr>     <int>
## 1 No       275050
## 2 Yes       30826
BRFSS_G2 <- subset(BRFSS_G2, select = -c(MEDCOST))

G3. CHECKUP1 —> CHECKUP

BRFSS_G2 %>% 
  group_by(CHECKUP1) %>% 
  count()
## # A tibble: 7 x 2
## # Groups:   CHECKUP1 [7]
##   CHECKUP1      n
##      <dbl>  <int>
## 1        1 243032
## 2        2  29751
## 3        3  15305
## 4        4  14276
## 5        7   1958
## 6        8   1488
## 7        9     66
BRFSS_G3 <- BRFSS_G2 %>% 
  filter(CHECKUP1 < 5 | CHECKUP1 == 8) %>% 
  mutate(CHECKUP = case_when(CHECKUP1 == 1 ~ "<1",
                             CHECKUP1 == 2 ~ "1-<2",
                             CHECKUP1 == 3 ~ "2-<5",
                             CHECKUP1 == 4 ~ "5+",
                             CHECKUP1 == 8 ~ "5+"
                             ))
BRFSS_G3%>% 
  group_by(CHECKUP) %>% 
  count() %>% 
  pander()
CHECKUP n
<1 243032
1-<2 29751
2-<5 15305
5+ 15764
BRFSS_G3 <- subset(BRFSS_G3, select = -c(CHECKUP1))

Chronic Health Conditions (G)

H1. ASTHMA3

H2. CHCCOPD1 (Ever told) you have chronic obstructive pulmonary disease, emphysema or chronic bronchitis?

H3. HAVARTH3 Told Have Arthritis

H4. ADDEPEV2 Ever told you had a depressive disorder

H5. CHCKDNY1 (Ever told) you have kidney disease?

H6. DIABETE3 (Ever told) you have diabetes)

H7A. CHCSCNCR (Ever told) you had skin cancer? YES/NO

H7B. CHCOCNCR (Ever told) you had any other types of cancer? YES/NO

H8. CVDSTRK3

H1. ASTHMA3 —> ASTHMA

BRFSS_G3 %>% 
  group_by(ASTHMA3) %>% 
  count() %>% 
  pander
ASTHMA3 n
1 42582
2 260647
7 598
9 25
BRFSS_H1 <- BRFSS_G3 %>%
  filter(ASTHMA3 < 7) %>% 
  mutate(ASTHMA = case_when(
    ASTHMA3 == 1 ~ "Yes",
    ASTHMA3 == 2 ~ "No"))
BRFSS_H1  %>% 
  group_by(ASTHMA) %>% 
  count() %>% 
  pander()
ASTHMA n
No 260647
Yes 42582
BRFSS_H1 <- subset(BRFSS_H1, select = -c(ASTHMA3))

H2. CHCCOPD1—> PULMOND

BRFSS_H1 %>% 
  group_by(CHCCOPD1) %>% 
  count()
## # A tibble: 4 x 2
## # Groups:   CHCCOPD1 [4]
##   CHCCOPD1      n
##      <dbl>  <int>
## 1        1  24720
## 2        2 277594
## 3        7    902
## 4        9     13
BRFSS_H2 <- BRFSS_H1 %>% 
  filter(CHCCOPD1 < 7) %>% 
  mutate(PULMOND = case_when(CHCCOPD1 == 1 ~ "Yes",
                             CHCCOPD1 == 2 ~ "No"))
BRFSS_H2 <- subset(BRFSS_H2, select = -c(CHCCOPD1))

H3. HAVARTH3

Told Have Arthritis —> ARTHRITIS

ARTHRITIS (Ever told) you have some form of arthritis, rheumatoid arthritis, gout, lupus, or fibromyalgia? (Arthritis diagnoses include: rheumatism, polymyalgia rheumatica; osteoarthritis (not osteporosis); tendonitis, bursitis, bunion, tennis elbow; carpal tunnel syndrome, tarsal tunnel syndrome; joint infection, etc.)

BRFSS_H2 %>% 
  group_by(HAVARTH3) %>% 
  count()
## # A tibble: 4 x 2
## # Groups:   HAVARTH3 [4]
##   HAVARTH3      n
##      <dbl>  <int>
## 1        1 101580
## 2        2 199636
## 3        7   1086
## 4        9     12
BRFSS_H3 <- BRFSS_H2 %>%
  filter(HAVARTH3 < 7) %>% 
  mutate(ARTHRITIS = case_when(
    HAVARTH3 == 1 ~ "Yes",
    HAVARTH3 == 2 ~ "No"
  ))
BRFSS_H3 %>% 
  group_by(ARTHRITIS) %>% 
  count() %>% 
  pander()
ARTHRITIS n
No 199636
Yes 101580
BRFSS_H3 <- subset(BRFSS_H3, select = -c(HAVARTH3))

H4. ADDEPEV2

Ever told you had a depressive disorder —> DEPRESSION

BRFSS_H3 %>% 
  group_by(ADDEPEV2) %>% 
  count()
## # A tibble: 4 x 2
## # Groups:   ADDEPEV2 [4]
##   ADDEPEV2      n
##      <dbl>  <int>
## 1        1  56919
## 2        2 243543
## 3        7    694
## 4        9     60
BRFSS_H4 <- BRFSS_H3 %>% 
  filter(ADDEPEV2 < 7) %>% 
  mutate(DEPRESSION = case_when(ADDEPEV2 == 1 ~ "Yes",
                                ADDEPEV2 == 2 ~ "No"))
BRFSS_H4  %>% 
  group_by(DEPRESSION) %>% 
  count() %>% 
  pander ()
DEPRESSION n
No 243543
Yes 56919
BRFSS_H4 <- subset(BRFSS_H4, select = -c(ADDEPEV2))

H5. CHCKDNY1

(Ever told) you have kidney disease? —> KIDNEY

BRFSS_H4 %>% 
  group_by(CHCKDNY1) %>% 
  count()
## # A tibble: 4 x 2
## # Groups:   CHCKDNY1 [4]
##   CHCKDNY1      n
##      <dbl>  <int>
## 1        1  10977
## 2        2 288826
## 3        7    649
## 4        9     10
BRFSS_H5 <- BRFSS_H4 %>% 
  filter(CHCKDNY1 < 7) %>% 
  mutate(KIDNEY = case_when(CHCKDNY1 == 1 ~ "Yes",
                            CHCKDNY1 == 2 ~ "No"))
BRFSS_H5 %>% 
  group_by(KIDNEY) %>% 
  count() %>% 
  pander()
KIDNEY n
No 288826
Yes 10977
BRFSS_H5  <- subset(BRFSS_H5, select = -c(CHCKDNY1))

H6. DIABETE3

(Ever told) you have diabetes) —> DIABETE

BRFSS_H5 %>% 
  group_by(DIABETE3) %>% 
  count()
## # A tibble: 6 x 2
## # Groups:   DIABETE3 [6]
##   DIABETE3      n
##      <dbl>  <int>
## 1        1  40118
## 2        2   2669
## 3        3 251355
## 4        4   5432
## 5        7    219
## 6        9     10
BRFSS_H6 <- BRFSS_H5 %>% 
  filter(DIABETE3 < 7) %>% 
  mutate(DIABETE = case_when(
    (DIABETE3 == 1 | DIABETE3 == 2) ~ "Yes",
    (DIABETE3 == 3 | DIABETE3 == 4) ~ "No"
  ))
BRFSS_H6 %>% 
  group_by(DIABETE) %>% 
  count()
## # A tibble: 2 x 2
## # Groups:   DIABETE [2]
##   DIABETE      n
##   <chr>    <int>
## 1 No      256787
## 2 Yes      42787
BRFSS_H6 <- subset(BRFSS_H6, select = -c(DIABETE3))

G7. CHCSCNCR + CHCOCNCR —> CANCER

CHCSCNCR (Ever told) you had skin cancer? YES/NO

CHCOCNCR (Ever told) you had any other types of cancer? YES/NO

BRFSS_H6 %>% 
  group_by(CHCSCNCR) %>% 
  count()
## # A tibble: 4 x 2
## # Groups:   CHCSCNCR [4]
##   CHCSCNCR      n
##      <dbl>  <int>
## 1        1  28173
## 2        2 270829
## 3        7    560
## 4        9     12
BRFSS_H6 %>% 
  group_by(CHCOCNCR) %>% 
  count()
## # A tibble: 4 x 2
## # Groups:   CHCOCNCR [4]
##   CHCOCNCR      n
##      <dbl>  <int>
## 1        1  28799
## 2        2 270320
## 3        7    428
## 4        9     27
BRFSS_H7 <- BRFSS_H6 %>%
  filter(CHCSCNCR < 7 & CHCOCNCR < 7) %>% 
  mutate(CANCER =  case_when(
    CHCSCNCR == 1 & CHCOCNCR == 1 ~ "Yes",
    CHCSCNCR == 1 & CHCOCNCR == 2 ~ "Yes",
    CHCSCNCR == 2 & CHCOCNCR == 1 ~ "Yes",
    CHCSCNCR == 2 & CHCOCNCR == 2 ~ "No"
  ))
BRFSS_H7 %>% 
  group_by(CANCER) %>% 
  count() %>% 
  pander()
CANCER n
No 248039
Yes 50530

(So far) There are 16.92 % of respondents who have at least one cancer.

BRFSS_H7 <- subset(BRFSS_H7, select = -c(CHCSCNCR,CHCOCNCR))

H8.CVDSTRK3 (Ever told) you had a stroke

BRFSS_H7 %>% 
  group_by(CVDSTRK3) %>% 
  count()
## # A tibble: 4 x 2
## # Groups:   CVDSTRK3 [4]
##   CVDSTRK3      n
##      <dbl>  <int>
## 1        1  11904
## 2        2 286174
## 3        7    484
## 4        9      7
BRFSS_H8 <- BRFSS_H7 %>%
  filter(CVDSTRK3 < 7) %>% 
  mutate(STROKE =  case_when(
    CVDSTRK3  == 1 ~ "Yes",
    CVDSTRK3  == 2 ~ "No"
  ))
BRFSS_H8 %>% 
  group_by(STROKE) %>% 
  count() %>% 
  pander()
STROKE n
No 286174
Yes 11904
BRFSS_H8 <- subset(BRFSS_H8, select = -c(CVDSTRK3))

I. Oral Health

I1.LASTDEN4 (Last Visited Dentist or Dental Clinic)

I2.RMVTETH4 (Number of Permanent Teeth Removed)

I1. RMVTETH4 —> RMVTETH

Not including teeth lost for injury or orthodontics, how many of your permanent teeth have been removed because of tooth decay or gum disease?

BRFSS_H8 %>% 
  group_by(RMVTETH4) %>% 
  count()
## # A tibble: 7 x 2
## # Groups:   RMVTETH4 [7]
##   RMVTETH4      n
##      <dbl>  <int>
## 1        1  87317
## 2        2  34351
## 3        3  18417
## 4        7   4233
## 5        8 153600
## 6        9    159
## 7       NA      1

The category “Refused” has 299 records ==> removed these records

BRFSS_I1 <- BRFSS_H8 %>% 
  filter(RMVTETH4 == 8 | RMVTETH4 < 4) %>% 
  mutate(RMVTETH = case_when(
    RMVTETH4 == 1 ~ "1-5",
    RMVTETH4 == 2 ~ ">= 6",
    RMVTETH4 == 3 ~ "All",
    RMVTETH4 == 8 ~ "0"
  ))
BRFSS_I1 %>% 
  group_by(RMVTETH) %>%
  count()
## # A tibble: 4 x 2
## # Groups:   RMVTETH [4]
##   RMVTETH      n
##   <chr>    <int>
## 1 >= 6     34351
## 2 0       153600
## 3 1-5      87317
## 4 All      18417
BRFSS_I1$RMVTETH <- factor(BRFSS_I1$RMVTETH,
  levels= c("0",
            "1-5",
            ">= 6",
            "All"), ordered = TRUE)

BRFSS_I1%>% 
  group_by(RMVTETH) %>%
  count() %>% 
  pander()
RMVTETH n
0 153600
1-5 87317
>= 6 34351
All 18417
BRFSS_I1 <- subset(BRFSS_I1, select = -c(RMVTETH4))

I2. LASTDEN4

BRFSS_I1 %>% 
  group_by(LASTDEN4) %>%
  count()
## # A tibble: 7 x 2
## # Groups:   LASTDEN4 [7]
##   LASTDEN4      n
##      <dbl>  <int>
## 1        1 203716
## 2        2  30679
## 3        3  25735
## 4        4  30912
## 5        7   1409
## 6        8   1164
## 7        9     70
BRFSS_I2 <- BRFSS_I1 %>% 
  filter(LASTDEN4 < 7| LASTDEN4 == 8) %>% 
  mutate(LASTDENTV = case_when(
    LASTDEN4 == 1 ~ "<1",
    LASTDEN4 == 2 ~ "1-<2",
    LASTDEN4 == 3 ~ "2-<5",
    LASTDEN4 == 4 ~ "5+",
    LASTDEN4 == 8 ~ "5+"
  ))

BRFSS_I2 %>% 
  group_by(LASTDENTV) %>% 
  count()
## # A tibble: 4 x 2
## # Groups:   LASTDENTV [4]
##   LASTDENTV      n
##   <chr>      <int>
## 1 <1        203716
## 2 1-<2       30679
## 3 2-<5       25735
## 4 5+         32076
BRFSS_I2$LASTDENTV <- factor(BRFSS_I2$LASTDENTV,
                                levels =  c("<1",
                                            "1-<2",
                                            "2-<5",
                                            "5+"), ordered = TRUE)
BRFSS_I2%>% 
  group_by(LASTDENTV) %>% 
  count() %>% 
  pander()
LASTDENTV n
<1 203716
1-<2 30679
2-<5 25735
5+ 32076
BRFSS_I2 <- subset(BRFSS_I2, select = -c(LASTDEN4))

J. Obesity

J1. X_BMI5CAT—> BMICAT

Four-categories of Body Mass Index (BMI)

1 = underweight, 2 = normal, 3 = overweight, 4 = obese, null = missing.

BRFSS_I2 %>% 
  group_by(X_BMI5CAT) %>% 
  count()
## # A tibble: 5 x 2
## # Groups:   X_BMI5CAT [5]
##   X_BMI5CAT      n
##       <dbl>  <int>
## 1         1   4037
## 2         2  83434
## 3         3 101763
## 4         4  91768
## 5        NA  11204
BRFSS_J1 <- BRFSS_I2 %>% 
  filter(X_BMI5CAT < 5) %>% 
  mutate (BMICAT = case_when(X_BMI5CAT == 1 ~ "Underweight",
                          X_BMI5CAT == 2 ~ "Normal",
                          X_BMI5CAT == 3 ~ "Overweight",
                          X_BMI5CAT == 4 ~ "Obese"))
BRFSS_J1 %>% 
  group_by(BMICAT) %>% 
  count() %>% 
  pander()
BMICAT n
Normal 83434
Obese 91768
Overweight 101763
Underweight 4037
BRFSS_J1$BMICAT <- factor(BRFSS_J1$BMICAT,
                                levels =  c("Underweight",
                                            "Normal",
                                            "Overweight",
                                            "Obese"), ordered = TRUE)
BRFSS_J1 %>% 
  group_by(BMICAT) %>% 
  count() %>% 
  pander()
BMICAT n
Underweight 4037
Normal 83434
Overweight 101763
Obese 91768
BRFSS_J1 <- subset(BRFSS_J1, select = -c(X_BMI5CAT))

MICHD (Outcome) (K)

K. X_MICHD The outcome —> MICHD

BRFSS_J1  %>% 
  group_by(X_MICHD) %>% 
  count()
## # A tibble: 3 x 2
## # Groups:   X_MICHD [3]
##   X_MICHD      n
##     <dbl>  <int>
## 1       1  24594
## 2       2 254978
## 3      NA   1430
BRFSS_K <- BRFSS_J1 %>% 
  filter(X_MICHD < 3) %>% 
  mutate(MICHD =  case_when(X_MICHD == 1 ~ "Yes",
                            X_MICHD == 2 ~ "No"))
BRFSS_K %>% 
  group_by(MICHD) %>% 
  count() %>% 
  pander()
MICHD n
No 254978
Yes 24594

24713/(24713+255253) = 8.8%

BRFSS_K <- subset(BRFSS_K, select = -c(X_MICHD))
str(BRFSS_K)
## 'data.frame':    279572 obs. of  34 variables:
##  $ X_STATE     : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ IDAY        : chr  "05" "12" "03" "10" ...
##  $ SLEPTIM1    : num  7 5 6 8 6 7 10 6 6 6 ...
##  $ GENHLTH     : Ord.factor w/ 5 levels "Poor"<"Fair"<..: 4 3 5 5 4 3 5 3 4 2 ...
##  $ MONTH       : Ord.factor w/ 12 levels "Jan"<"Fev"<"Mar"<..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ AGE         : chr  "80+" "30-39" "60-69" "70-79" ...
##  $ GENDER      : chr  "Female" "Female" "Male" "Female" ...
##  $ RACE        : Ord.factor w/ 6 levels "AIAN"<"Asian"<..: 5 3 5 5 5 3 5 5 5 5 ...
##  $ RELP        : Ord.factor w/ 4 levels "In relationship"<..: 4 3 2 1 1 2 1 1 1 4 ...
##  $ EDU         : Ord.factor w/ 4 levels "<8"<"9-12"<"12-15"<..: 4 4 2 4 2 4 2 3 3 2 ...
##  $ EMPL        : Ord.factor w/ 4 levels "Employed"<"Unemployed"<..: 1 1 3 4 1 1 3 1 2 3 ...
##  $ INCOME      : Ord.factor w/ 5 levels "<$20K"<"$20K-<$35K"<..: 3 2 1 5 5 5 2 3 2 2 ...
##  $ RENTHOME    : chr  "Own" "Rent" "Own" "Own" ...
##  $ COMMUN      : chr  "Rural" "Urban" "Urban" "Urban" ...
##  $ SMOKESTAT   : Ord.factor w/ 4 levels "Daily smoker"<..: 4 1 4 4 3 4 3 4 3 4 ...
##  $ ALCpa30     : chr  "0" "1-9" "0" "0" ...
##  $ EXERpa30    : chr  "No" "Yes" "Yes" "Yes" ...
##  $ PHYSHLTHpa30: chr  "10-30" "0" "0" "0" ...
##  $ MENTHLTHpa30: chr  "0" "0" "0" "0" ...
##  $ HLTHPLN     : chr  "Yes" "No" "Yes" "Yes" ...
##  $ MEDICOST    : chr  "No" "Yes" "No" "No" ...
##  $ CHECKUP     : chr  "<1" "1-<2" "<1" "<1" ...
##  $ ASTHMA      : chr  "No" "No" "No" "No" ...
##  $ PULMOND     : chr  "No" "No" "No" "No" ...
##  $ ARTHRITIS   : chr  "Yes" "No" "No" "No" ...
##  $ DEPRESSION  : chr  "No" "No" "No" "No" ...
##  $ KIDNEY      : chr  "No" "No" "No" "No" ...
##  $ DIABETE     : chr  "No" "No" "No" "No" ...
##  $ CANCER      : chr  "Yes" "No" "No" "No" ...
##  $ STROKE      : chr  "No" "No" "No" "No" ...
##  $ RMVTETH     : Ord.factor w/ 4 levels "0"<"1-5"<">= 6"<..: 2 1 2 1 1 1 3 1 3 2 ...
##  $ LASTDENTV   : Ord.factor w/ 4 levels "<1"<"1-<2"<"2-<5"<..: 1 2 3 1 1 1 1 1 1 2 ...
##  $ BMICAT      : Ord.factor w/ 4 levels "Underweight"<..: 2 4 3 2 3 3 3 4 3 4 ...
##  $ MICHD       : chr  "No" "No" "No" "No" ...

Merging with table states

statesUS <- read.csv("state-geocodes-v2015.csv", header=TRUE, sep=",")
str(statesUS)
## 'data.frame':    51 obs. of  3 variables:
##  $ ï..FIPS: int  9 23 25 33 44 50 34 36 42 17 ...
##  $ Name   : chr  "Connecticut" "Maine" "Massachusetts" "New Hampshire" ...
##  $ Region : chr  "Northeast" "Northeast" "Northeast" "Northeast" ...
colnames(statesUS)
## [1] "ï..FIPS" "Name"    "Region"
joined_BRFSS <- left_join(BRFSS_K, statesUS, 
              by = c("X_STATE" = "ï..FIPS"))
str(joined_BRFSS)
## 'data.frame':    279572 obs. of  36 variables:
##  $ X_STATE     : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ IDAY        : chr  "05" "12" "03" "10" ...
##  $ SLEPTIM1    : num  7 5 6 8 6 7 10 6 6 6 ...
##  $ GENHLTH     : Ord.factor w/ 5 levels "Poor"<"Fair"<..: 4 3 5 5 4 3 5 3 4 2 ...
##  $ MONTH       : Ord.factor w/ 12 levels "Jan"<"Fev"<"Mar"<..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ AGE         : chr  "80+" "30-39" "60-69" "70-79" ...
##  $ GENDER      : chr  "Female" "Female" "Male" "Female" ...
##  $ RACE        : Ord.factor w/ 6 levels "AIAN"<"Asian"<..: 5 3 5 5 5 3 5 5 5 5 ...
##  $ RELP        : Ord.factor w/ 4 levels "In relationship"<..: 4 3 2 1 1 2 1 1 1 4 ...
##  $ EDU         : Ord.factor w/ 4 levels "<8"<"9-12"<"12-15"<..: 4 4 2 4 2 4 2 3 3 2 ...
##  $ EMPL        : Ord.factor w/ 4 levels "Employed"<"Unemployed"<..: 1 1 3 4 1 1 3 1 2 3 ...
##  $ INCOME      : Ord.factor w/ 5 levels "<$20K"<"$20K-<$35K"<..: 3 2 1 5 5 5 2 3 2 2 ...
##  $ RENTHOME    : chr  "Own" "Rent" "Own" "Own" ...
##  $ COMMUN      : chr  "Rural" "Urban" "Urban" "Urban" ...
##  $ SMOKESTAT   : Ord.factor w/ 4 levels "Daily smoker"<..: 4 1 4 4 3 4 3 4 3 4 ...
##  $ ALCpa30     : chr  "0" "1-9" "0" "0" ...
##  $ EXERpa30    : chr  "No" "Yes" "Yes" "Yes" ...
##  $ PHYSHLTHpa30: chr  "10-30" "0" "0" "0" ...
##  $ MENTHLTHpa30: chr  "0" "0" "0" "0" ...
##  $ HLTHPLN     : chr  "Yes" "No" "Yes" "Yes" ...
##  $ MEDICOST    : chr  "No" "Yes" "No" "No" ...
##  $ CHECKUP     : chr  "<1" "1-<2" "<1" "<1" ...
##  $ ASTHMA      : chr  "No" "No" "No" "No" ...
##  $ PULMOND     : chr  "No" "No" "No" "No" ...
##  $ ARTHRITIS   : chr  "Yes" "No" "No" "No" ...
##  $ DEPRESSION  : chr  "No" "No" "No" "No" ...
##  $ KIDNEY      : chr  "No" "No" "No" "No" ...
##  $ DIABETE     : chr  "No" "No" "No" "No" ...
##  $ CANCER      : chr  "Yes" "No" "No" "No" ...
##  $ STROKE      : chr  "No" "No" "No" "No" ...
##  $ RMVTETH     : Ord.factor w/ 4 levels "0"<"1-5"<">= 6"<..: 2 1 2 1 1 1 3 1 3 2 ...
##  $ LASTDENTV   : Ord.factor w/ 4 levels "<1"<"1-<2"<"2-<5"<..: 1 2 3 1 1 1 1 1 1 2 ...
##  $ BMICAT      : Ord.factor w/ 4 levels "Underweight"<..: 2 4 3 2 3 3 3 4 3 4 ...
##  $ MICHD       : chr  "No" "No" "No" "No" ...
##  $ Name        : chr  "Alabama" "Alabama" "Alabama" "Alabama" ...
##  $ Region      : chr  "South" "South" "South" "South" ...
joined_BRFSS <- subset(joined_BRFSS, select = -c(X_STATE))
joined_BRFSS <- subset(joined_BRFSS, select = -c(Name))
colnames(joined_BRFSS)
##  [1] "IDAY"         "SLEPTIM1"     "GENHLTH"      "MONTH"        "AGE"         
##  [6] "GENDER"       "RACE"         "RELP"         "EDU"          "EMPL"        
## [11] "INCOME"       "RENTHOME"     "COMMUN"       "SMOKESTAT"    "ALCpa30"     
## [16] "EXERpa30"     "PHYSHLTHpa30" "MENTHLTHpa30" "HLTHPLN"      "MEDICOST"    
## [21] "CHECKUP"      "ASTHMA"       "PULMOND"      "ARTHRITIS"    "DEPRESSION"  
## [26] "KIDNEY"       "DIABETE"      "CANCER"       "STROKE"       "RMVTETH"     
## [31] "LASTDENTV"    "BMICAT"       "MICHD"        "Region"
joined_BRFSS <- joined_BRFSS %>% 
  dplyr::select(IDAY,MONTH, Region, 
         AGE, GENDER, RACE, RELP, EDU,EMPL,INCOME,RENTHOME, COMMUN,
         ALCpa30, SMOKESTAT, EXERpa30, SLEPTIM1,
         GENHLTH, 
         MENTHLTHpa30, PHYSHLTHpa30, 
         HLTHPLN, CHECKUP, MEDICOST,
         ARTHRITIS,ASTHMA,CANCER,DEPRESSION, DIABETE,KIDNEY,PULMOND, STROKE, 
         LASTDENTV,RMVTETH,
         BMICAT,
         MICHD) 
str(joined_BRFSS)
## 'data.frame':    279572 obs. of  34 variables:
##  $ IDAY        : chr  "05" "12" "03" "10" ...
##  $ MONTH       : Ord.factor w/ 12 levels "Jan"<"Fev"<"Mar"<..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ Region      : chr  "South" "South" "South" "South" ...
##  $ AGE         : chr  "80+" "30-39" "60-69" "70-79" ...
##  $ GENDER      : chr  "Female" "Female" "Male" "Female" ...
##  $ RACE        : Ord.factor w/ 6 levels "AIAN"<"Asian"<..: 5 3 5 5 5 3 5 5 5 5 ...
##  $ RELP        : Ord.factor w/ 4 levels "In relationship"<..: 4 3 2 1 1 2 1 1 1 4 ...
##  $ EDU         : Ord.factor w/ 4 levels "<8"<"9-12"<"12-15"<..: 4 4 2 4 2 4 2 3 3 2 ...
##  $ EMPL        : Ord.factor w/ 4 levels "Employed"<"Unemployed"<..: 1 1 3 4 1 1 3 1 2 3 ...
##  $ INCOME      : Ord.factor w/ 5 levels "<$20K"<"$20K-<$35K"<..: 3 2 1 5 5 5 2 3 2 2 ...
##  $ RENTHOME    : chr  "Own" "Rent" "Own" "Own" ...
##  $ COMMUN      : chr  "Rural" "Urban" "Urban" "Urban" ...
##  $ ALCpa30     : chr  "0" "1-9" "0" "0" ...
##  $ SMOKESTAT   : Ord.factor w/ 4 levels "Daily smoker"<..: 4 1 4 4 3 4 3 4 3 4 ...
##  $ EXERpa30    : chr  "No" "Yes" "Yes" "Yes" ...
##  $ SLEPTIM1    : num  7 5 6 8 6 7 10 6 6 6 ...
##  $ GENHLTH     : Ord.factor w/ 5 levels "Poor"<"Fair"<..: 4 3 5 5 4 3 5 3 4 2 ...
##  $ MENTHLTHpa30: chr  "0" "0" "0" "0" ...
##  $ PHYSHLTHpa30: chr  "10-30" "0" "0" "0" ...
##  $ HLTHPLN     : chr  "Yes" "No" "Yes" "Yes" ...
##  $ CHECKUP     : chr  "<1" "1-<2" "<1" "<1" ...
##  $ MEDICOST    : chr  "No" "Yes" "No" "No" ...
##  $ ARTHRITIS   : chr  "Yes" "No" "No" "No" ...
##  $ ASTHMA      : chr  "No" "No" "No" "No" ...
##  $ CANCER      : chr  "Yes" "No" "No" "No" ...
##  $ DEPRESSION  : chr  "No" "No" "No" "No" ...
##  $ DIABETE     : chr  "No" "No" "No" "No" ...
##  $ KIDNEY      : chr  "No" "No" "No" "No" ...
##  $ PULMOND     : chr  "No" "No" "No" "No" ...
##  $ STROKE      : chr  "No" "No" "No" "No" ...
##  $ LASTDENTV   : Ord.factor w/ 4 levels "<1"<"1-<2"<"2-<5"<..: 1 2 3 1 1 1 1 1 1 2 ...
##  $ RMVTETH     : Ord.factor w/ 4 levels "0"<"1-5"<">= 6"<..: 2 1 2 1 1 1 3 1 3 2 ...
##  $ BMICAT      : Ord.factor w/ 4 levels "Underweight"<..: 2 4 3 2 3 3 3 4 3 4 ...
##  $ MICHD       : chr  "No" "No" "No" "No" ...
write.csv(joined_BRFSS,file = "BRFSS2018-V3.csv", row.names= FALSE)