library(foreign)
library(gtools)
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.3 v purrr 0.3.4
## v tibble 3.0.5 v dplyr 1.0.3
## v tidyr 1.1.2 v stringr 1.4.0
## v readr 1.4.0 v forcats 0.5.0
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(questionr)
## Warning: package 'questionr' was built under R version 4.0.4
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
library(DataExplorer)
In order to read xpt file, we use the foreign package
library(foreign)
BRFSS_Raw <- read.xport("LLCP2018.XPT")
The raw data contains 437436 rows and 275 columns.
We only keep variables that we need.
The data description is as follow
Record Identification (A)
A1. IYEAR: Interview Year
A2. IMONTH: Interview Month
A3. IDAY: Interview Day
A4. X_STATE: State FIPS Code
Demographics (B) B1. X_AGEG5YR : Reported age in five-year age categories calculated variable
B2. SEX1: Respondents Sex
B3. X_RACE: race/ethnicity value
B4. MARITAL : Marital Status
B5. EDUCA : Education Level
B6. EMPLOY1 : Employment Status
B7. INCOME1 : Income Level
B8. RENTHOM1 : Do you own or rent your home?
Urban status (C)
C1. X_METSTAT: Metropolitan Status
C2. X_URBSTAT: Urban/Rural Status
Lifestyle (D)
D1. X_SMOKER3 Four-level smoker status: Everyday smoker, Someday smoker, Former smoker, Non-smoker
D2. ALCDAY5: Days in past 30 had alcoholic beverage
D3. SLEPTIM1: How Much Time Do You Sleep
D4. EXERANY2: Exercise in Past 30 Days
Health Status (E)
E1. GENHLTH :Would you say that in general your health is:
Healthy Days (F)— Health Related Quality of Life
F1. PHYSHLTH : Now thinking about your physical health, which includes physical illness and injury, for how many days during the past 30 days was your physical health not good?
F2. MENTHLTH : Now thinking about your mental health, which includes stress, depression, and problems with emotions, for how many days during the past 30 days was your mental health not good?
F3. POORHLTH : During the past 30 days, for about how many days did poor physical or mental health keep you from doing your usual activities, such as self-care, work, or recreation?
Health Care Access (G)
G1. HLTHPLN1 : Do you have any kind of health care coverage, including health insurance, prepaid plans such as HMOs, or government plans such as Medicare, or Indian Health Service?)
G2. MEDCOST : Was there a time in the past 12 months when you needed to see a doctor but could not because of cost?
G3. CHECKUP1 : About how long has it been since you last visited a doctor for a routine checkup? [A routine checkup is a general physical exam, not an exam for a specific injury, illness, or condition.]
Chronic Health Conditions (H) H1. ASTHMA3 : Ever Told Had Asthma
H2. CHCCOPD1: (Ever told) you have chronic obstructive pulmonary disease, emphysema or chronic bronchitis?
H3. HAVARTH3 : (Ever told) you have some form of arthritis, rheumatoid arthritis, gout, lupus, or fibromyalgia? (Arthritis diagnoses include: rheumatism, polymyalgia rheumatica; osteoarthritis (not osteporosis); tendonitis, bursitis, bunion, tennis elbow; carpal tunnel syndrome, tarsal tunnel syndrome; joint infection, etc.)
H4. ADDEPEV2 (Ever told) you have a depressive disorder (including depression, major depression, dysthymia, or minor depression)?
H5. CHCKDNY1 : (Ever told) you have kidney disease? (Do NOT include kidney stones, bladder infection or incontinence.)
H6. DIABETE3: (Ever told) you have diabetes (If ´Yes´ and respondent is female, ask ´Was this only when you were pregnant?´. If Respondent says pre-diabetes or borderline diabetes, use response code 4.)
H7. CHCSCNCR : (Ever told) you had skin cancer?
H8. CHCOCNCR : (Ever told) you had any other types of cancer?
H9. CVDSTRK3 : (Ever told) you had a stroke.
Oral Health (I)
I1. LASTDEN4 : Including all types of dentists, such as orthodontists, oral surgeons, and all other dental specialists, as well as dental hygienists, how long has it been since you last visited a dentist or a dental clinic for any reason?
I2. RMVTETH4 : Not including teeth lost for injury or orthodontics, how many of your permanent teeth have been removed because of tooth decay or gum disease?
Obesity (J) J1. X_BMI5CAT : Four-categories of Body Mass Index (BMI)
MICHD (Outcome) (K) K1. X_MICHD : Respondents that have ever reported having coronary heart disease (CHD) or myocardial infarction (MI)
Defining object list of variables to be kept
BRFSS_VarList <- c("X_STATE", "IMONTH", "IDAY", "IYEAR", # Record Identification
"X_AGEG5YR","SEX1", "X_RACE", "MARITAL", "EDUCA", "INCOME2", "EMPLOY1", "RENTHOM1",# Demographics
"X_METSTAT","X_URBSTAT", # Urban Rural , Metropolitan, Non-metropolitan
"ALCDAY5", "X_SMOKER3", "EXERANY2", "SLEPTIM1", # Lifestyle
"GENHLTH", # Health Status
"MENTHLTH", "PHYSHLTH", "POORHLTH", # Healthy Days - HRQOL
"HLTHPLN1", "CHECKUP1", "MEDCOST", # Health Care Access
"RMVTETH4", "LASTDEN4", # Oral Health
"CVDSTRK3","DIABETE3", "HAVARTH3", "ASTHMA3", "CHCCOPD1", "ADDEPEV2", "CHCSCNCR", "CHCOCNCR", "CHCKDNY1", # Chronic Health Conditions
"X_BMI5CAT",# Obesity
"X_MICHD")# the outcome
Subseting by varlist
BRFSS_selected <- BRFSS_Raw[BRFSS_VarList]
str(BRFSS_selected)
## 'data.frame': 437436 obs. of 38 variables:
## $ X_STATE : num 1 1 1 1 1 1 1 1 1 1 ...
## $ IMONTH : chr "01" "01" "01" "01" ...
## $ IDAY : chr "05" "12" "08" "03" ...
## $ IYEAR : chr "2018" "2018" "2018" "2018" ...
## $ X_AGEG5YR: num 13 3 12 10 5 13 12 6 8 6 ...
## $ SEX1 : num 2 2 2 1 2 2 2 2 1 2 ...
## $ X_RACE : num 1 2 1 1 1 1 1 1 1 2 ...
## $ MARITAL : num 3 5 3 2 1 3 1 1 1 2 ...
## $ EDUCA : num 6 6 4 4 6 5 6 6 4 6 ...
## $ INCOME2 : num 6 4 3 3 99 99 8 99 8 8 ...
## $ EMPLOY1 : num 2 1 7 7 1 7 5 5 1 1 ...
## $ RENTHOM1 : num 1 2 1 1 1 1 1 1 1 2 ...
## $ X_METSTAT: num 2 1 2 1 2 1 1 1 2 1 ...
## $ X_URBSTAT: num 2 1 1 1 1 1 1 1 2 1 ...
## $ ALCDAY5 : num 888 202 888 888 888 888 888 201 101 888 ...
## $ X_SMOKER3: num 4 1 4 4 4 4 4 4 3 4 ...
## $ EXERANY2 : num 2 1 1 1 2 2 1 1 2 1 ...
## $ SLEPTIM1 : num 7 5 7 6 7 6 8 7 6 7 ...
## $ GENHLTH : num 2 3 5 1 2 2 1 2 2 3 ...
## $ MENTHLTH : num 88 88 88 88 88 88 88 88 88 88 ...
## $ PHYSHLTH : num 30 88 10 88 88 88 88 88 88 5 ...
## $ POORHLTH : num 30 NA 88 NA NA NA NA NA NA 88 ...
## $ HLTHPLN1 : num 1 2 1 1 1 1 1 1 1 1 ...
## $ CHECKUP1 : num 1 2 1 1 1 1 1 1 1 1 ...
## $ MEDCOST : num 2 1 2 2 2 2 2 2 2 2 ...
## $ RMVTETH4 : num 1 8 7 1 8 7 8 8 8 8 ...
## $ LASTDEN4 : num 1 2 1 3 1 1 1 1 1 1 ...
## $ CVDSTRK3 : num 2 2 1 2 2 2 2 2 2 2 ...
## $ DIABETE3 : num 3 3 1 3 3 4 3 3 1 3 ...
## $ HAVARTH3 : num 1 2 2 2 2 2 2 2 2 2 ...
## $ ASTHMA3 : num 2 2 2 2 2 1 2 2 2 2 ...
## $ CHCCOPD1 : num 2 2 2 2 2 2 2 2 2 1 ...
## $ ADDEPEV2 : num 2 2 1 2 2 2 2 2 2 2 ...
## $ CHCSCNCR : num 1 2 2 2 2 2 2 2 2 2 ...
## $ CHCOCNCR : num 2 2 2 2 2 2 2 2 2 2 ...
## $ CHCKDNY1 : num 2 2 2 2 2 2 2 2 2 2 ...
## $ X_BMI5CAT: num 2 4 3 3 NA 4 2 2 3 3 ...
## $ X_MICHD : num 2 2 2 2 2 1 2 2 2 2 ...
The selected data has 38 columns.
This step is mainly for dealing with recoding, missing value, and outliers.
library(DataExplorer)
introduce(BRFSS_selected)
## rows columns discrete_columns continuous_columns all_missing_columns
## 1 437436 38 3 35 0
## total_missing_values complete_rows total_observations memory_usage
## 1 281851 204754 16622568 132991392
There is 16622568 observations and 281851 missing values.
There are only 204755 complete rows.
plot_intro(BRFSS_selected)
To view missing value distribution for data
plot_missing(BRFSS_selected)
47% of rows in the column POORHLTH is missing.
library(pander)
## Warning: package 'pander' was built under R version 4.0.5
library(forcats)
Record Identification (A) A1. IYEAR
A2. IMONTH
A3. IDAY
A4. X_STATE
A1. IYEAR
BRFSS_selected %>%
group_by(IYEAR) %>%
count()
## # A tibble: 2 x 2
## # Groups: IYEAR [2]
## IYEAR n
## <chr> <int>
## 1 2018 418474
## 2 2019 18962
There are 18962 interviews in 2019. We are only interested in interviews in 2018.
There for, all the 2019 records will be eleminated.
BRFSS_A1 <- BRFSS_selected %>%
filter(IYEAR == 2018)
BRFSS_A1 <- subset(BRFSS_A1, select = -c(IYEAR))
A2-IMONTH
BRFSS_A1 %>%
group_by(IMONTH) %>%
count() %>%
pander()
IMONTH | n |
---|---|
01 | 15066 |
02 | 36716 |
03 | 39268 |
04 | 35296 |
05 | 33657 |
06 | 35775 |
07 | 37295 |
08 | 35999 |
09 | 35961 |
10 | 40429 |
11 | 37420 |
12 | 35592 |
Note: Less interviews were done in January. Recoding
BRFSS_A2 <- BRFSS_A1 %>%
mutate(MONTH = case_when(
IMONTH == "01" ~ "Jan",
IMONTH == "02" ~ "Fev",
IMONTH == "03" ~ "Mar",
IMONTH == "04" ~ "Apr",
IMONTH == "05" ~ "May",
IMONTH == "06" ~ "June",
IMONTH == "07" ~ "July",
IMONTH == "08" ~ "Aug",
IMONTH == "09" ~ "Sept",
IMONTH == "10" ~ "Oct",
IMONTH == "11" ~ "Nov",
IMONTH == "12" ~ "Dec"))
Checking
BRFSS_A2 %>%
group_by(MONTH) %>%
count() %>%
pander()
MONTH | n |
---|---|
Apr | 35296 |
Aug | 35999 |
Dec | 35592 |
Fev | 36716 |
Jan | 15066 |
July | 37295 |
June | 35775 |
Mar | 39268 |
May | 33657 |
Nov | 37420 |
Oct | 40429 |
Sept | 35961 |
The result appears in the alphabetic order.
Now, we have to reorder these levels.
BRFSS_A2$MONTH <- factor(BRFSS_A2$MONTH,
levels = c("Jan", "Fev", "Mar", "Apr",
"May", "June", "July", "Aug",
"Sept", "Oct", "Nov", "Dec"), ordered = TRUE)
BRFSS_A2 %>%
group_by(MONTH) %>%
count() %>%
pander()
MONTH | n |
---|---|
Jan | 15066 |
Fev | 36716 |
Mar | 39268 |
Apr | 35296 |
May | 33657 |
June | 35775 |
July | 37295 |
Aug | 35999 |
Sept | 35961 |
Oct | 40429 |
Nov | 37420 |
Dec | 35592 |
BRFSS_A2 <- subset(BRFSS_A2, select = -c(IMONTH))
A3. IDAY
BRFSS_A2 %>%
group_by(IDAY) %>%
count() %>%
pander()
IDAY | n |
---|---|
01 | 11548 |
02 | 12494 |
03 | 13443 |
04 | 12836 |
05 | 15427 |
06 | 16713 |
07 | 16674 |
08 | 16012 |
09 | 15461 |
10 | 15954 |
11 | 15153 |
12 | 16435 |
13 | 15826 |
14 | 15602 |
15 | 15305 |
16 | 14819 |
17 | 14992 |
18 | 14583 |
19 | 15149 |
20 | 14379 |
21 | 13002 |
22 | 12487 |
23 | 12591 |
24 | 11641 |
25 | 11157 |
26 | 12414 |
27 | 11684 |
28 | 10370 |
29 | 9955 |
30 | 9408 |
31 | 4960 |
BRFSS_A3 <- BRFSS_A2
Demographics (B)
B1. X_AGEG5YR
B2. SEX1 Respondents Sex
B3. X_RACE
B4. MARITAL Marital Status
B5. EDUCA Education Level
B6. EMPLOY1 Employment Status
B7. INCOME1 Income Level
B1. X_AGEG5YR —> AGE
BRFSS_A3%>%
group_by(X_AGEG5YR) %>%
count() %>%
pander()
X_AGEG5YR | n |
---|---|
1 | 24776 |
2 | 21237 |
3 | 23178 |
4 | 24865 |
5 | 24330 |
6 | 27446 |
7 | 33444 |
8 | 40294 |
9 | 45005 |
10 | 45300 |
11 | 39298 |
12 | 27650 |
13 | 33522 |
14 | 8129 |
There are 14 categories (Please look at the data dictionary for the description of each value/label). The label 14 is Don’t know/Refused/Missing (1.94%).
We will regroup the first 13 rows into 7 groups.
BRFSS_B1 <- BRFSS_A2 %>%
filter(X_AGEG5YR < 14) %>%
mutate(AGE = case_when(
X_AGEG5YR == 1 ~ "18-29",
X_AGEG5YR == 2 ~ "18-29",
X_AGEG5YR == 3 ~ "30-39",
X_AGEG5YR == 4 ~ "30-39",
X_AGEG5YR == 5 ~ "40-49",
X_AGEG5YR == 6 ~ "40-49",
X_AGEG5YR == 7 ~ "50-59",
X_AGEG5YR == 8 ~ "50-59",
X_AGEG5YR == 9 ~ "60-69",
X_AGEG5YR == 10 ~ "60-69",
X_AGEG5YR == 11 ~ "70-79",
X_AGEG5YR == 12 ~ "70-79",
X_AGEG5YR == 13 ~ "80+"))
Note: Function recode does not work
BRFSS_B1 %>%
group_by(AGE) %>%
count() %>%
pander()
AGE | n |
---|---|
18-29 | 46013 |
30-39 | 48043 |
40-49 | 51776 |
50-59 | 73738 |
60-69 | 90305 |
70-79 | 66948 |
80+ | 33522 |
Removing the column X_AGEG5YR
BRFSS_B1 <- subset(BRFSS_B1, select = -c(X_AGEG5YR))
B2. SEX1 —> GENDER
BRFSS_B1 %>%
group_by(SEX1) %>%
count()
## # A tibble: 4 x 2
## # Groups: SEX1 [4]
## SEX1 n
## <dbl> <int>
## 1 1 185889
## 2 2 223701
## 3 7 391
## 4 9 364
For the SEX attribute, we exclude any values of 7 (Don’t know/Not Sure, 0.093% of data) and 9 (refused, 0.089 % of data). Also, we change the column name into GENDER. (401/428964 = 0.093)
BRFSS_B2 <- BRFSS_B1 %>%
filter(SEX1 == 1 | SEX1 == 2) %>%
mutate(GENDER = case_when(
SEX1 == 1 ~ "Male",
SEX1 == 2 ~ "Female"))
Checking
BRFSS_B2 %>%
group_by(GENDER) %>%
count() %>%
pander()
GENDER | n |
---|---|
Female | 223701 |
Male | 185889 |
BRFSS_B2 <- subset(BRFSS_B2, select = -c(SEX1))
B3. X_RACE —> RACE
BRFSS_B2 %>%
group_by(X_RACE) %>%
count() %>%
pander()
X_RACE | n |
---|---|
1 | 307499 |
2 | 32980 |
3 | 7138 |
4 | 9233 |
5 | 1947 |
6 | 2940 |
7 | 8064 |
8 | 33378 |
9 | 6410 |
NA | 1 |
BRFSS_B3 <- BRFSS_B2 %>%
filter(X_RACE < 9 ) %>%
mutate(RACE = case_when(
X_RACE == 1 ~ "White",
X_RACE == 2 ~ "Black",
X_RACE == 3 ~ "AIAN",
X_RACE == 4 ~ "Asian",
X_RACE == 5 ~ "Others",
X_RACE == 6 ~ "Others",
X_RACE == 7 ~ "Others",
X_RACE == 8 ~ "Hispanic"))
BRFSS_B3 %>%
group_by(RACE) %>%
count() %>%
pander()
RACE | n |
---|---|
AIAN | 7138 |
Asian | 9233 |
Black | 32980 |
Hispanic | 33378 |
Others | 12951 |
White | 307499 |
BRFSS_B3$RACE <- factor(BRFSS_B3$RACE,
levels = c("AIAN",
"Asian",
"Black",
"Hispanic",
"White",
"Others"), ordered = TRUE)
BRFSS_B3 %>%
group_by(RACE) %>%
count() %>%
pander()
RACE | n |
---|---|
AIAN | 7138 |
Asian | 9233 |
Black | 32980 |
Hispanic | 33378 |
White | 307499 |
Others | 12951 |
BRFSS_B3 <- subset(BRFSS_B3, select = -c(X_RACE))
B4. MARITAL attribute
BRFSS_B3 %>%
group_by(MARITAL) %>%
count() %>%
pander()
MARITAL | n |
---|---|
1 | 206746 |
2 | 55641 |
3 | 47879 |
4 | 8720 |
5 | 68716 |
6 | 13848 |
9 | 1627 |
NA | 2 |
BRFSS_B4 <- BRFSS_B3 %>%
filter(MARITAL < 9) %>%
mutate(
RELP = case_when(
MARITAL == 1 ~ "In relationship",
MARITAL == 6 ~ "In relationship",
MARITAL == 2 ~ "Living apart",
MARITAL == 4 ~ "Living apart",
MARITAL == 3 ~ "Widowed",
MARITAL == 5 ~ "Never married")
)
Checking
BRFSS_B4 %>%
group_by(RELP) %>%
count() %>%
pander()
RELP | n |
---|---|
In relationship | 220594 |
Living apart | 64361 |
Never married | 68716 |
Widowed | 47879 |
BRFSS_B4$RELP <- factor(BRFSS_B4$RELP,
levels = c("In relationship",
"Living apart",
"Never married",
"Widowed"), ordered = TRUE)
BRFSS_B4 %>%
group_by(RELP) %>%
count() %>%
pander()
RELP | n |
---|---|
In relationship | 220594 |
Living apart | 64361 |
Never married | 68716 |
Widowed | 47879 |
BRFSS_B4 <- subset(BRFSS_B4, select = -c(MARITAL))
B5. EDUCA attribute ===> EDU
BRFSS_B4 %>%
group_by(EDUCA) %>%
count()
## # A tibble: 7 x 2
## # Groups: EDUCA [7]
## EDUCA n
## <dbl> <int>
## 1 1 587
## 2 2 9483
## 3 3 19772
## 4 4 110163
## 5 5 110733
## 6 6 150154
## 7 9 658
BRFSS_B5 <- BRFSS_B4 %>%
filter(EDUCA < 9) %>%
mutate(EDU = case_when(
EDUCA == 1 ~ "<8",
EDUCA == 2 ~ "<8",
EDUCA == 3 ~ "9-12",
EDUCA == 4 ~ "9-12",
EDUCA == 5 ~ "12-15",
EDUCA == 6 ~ "16+"))
BRFSS_B5 %>%
group_by(EDU) %>%
count()
## # A tibble: 4 x 2
## # Groups: EDU [4]
## EDU n
## <chr> <int>
## 1 <8 10070
## 2 12-15 110733
## 3 16+ 150154
## 4 9-12 129935
BRFSS_B5$EDU <- factor(BRFSS_B5$EDU,
levels = c("<8",
"9-12",
"12-15",
"16+"), ordered = TRUE)
BRFSS_B5 %>%
group_by(EDU) %>%
count() %>%
pander()
EDU | n |
---|---|
<8 | 10070 |
9-12 | 129935 |
12-15 | 110733 |
16+ | 150154 |
BRFSS_B5 <- subset(BRFSS_B5, select = -c(EDUCA))
B6. EMPLOY1 ===> EMPL
Looking at Employment and age.
BRFSS_B5 %>%
group_by(EMPLOY1) %>%
count()
## # A tibble: 10 x 2
## # Groups: EMPLOY1 [10]
## EMPLOY1 n
## <dbl> <int>
## 1 1 165540
## 2 2 36094
## 3 3 7455
## 4 4 7783
## 5 5 19506
## 6 6 10717
## 7 7 120524
## 8 8 29906
## 9 9 2380
## 10 NA 987
The category Refused (9) represente 2380/400892 = 0.59% of the total number of respondents
BRFSS_B6 <- BRFSS_B5 %>%
filter(EMPLOY1 < 9) %>%
mutate(EMPL = case_when(
EMPLOY1 == 1 ~ "Employed",
EMPLOY1 == 2 ~ "Employed",
EMPLOY1 == 3 ~ "Unemployed",
EMPLOY1 == 4 ~ "Unemployed",
EMPLOY1 == 5 ~ "Occupations",
EMPLOY1 == 6 ~ "Occupations",
EMPLOY1 == 7 ~ "Retired",
EMPLOY1 == 8 ~ "Unemployed"))
occupation: studen and homemakers
Checking
BRFSS_B6 %>%
group_by(EMPL) %>%
count()
## # A tibble: 4 x 2
## # Groups: EMPL [4]
## EMPL n
## <chr> <int>
## 1 Employed 201634
## 2 Occupations 30223
## 3 Retired 120524
## 4 Unemployed 45144
BRFSS_B6$EMPL <- factor(BRFSS_B6$EMPL,
levels = c("Employed",
"Unemployed",
"Retired",
"Occupations"), ordered = TRUE)
BRFSS_B6 %>%
group_by(EMPL) %>%
count() %>%
pander()
EMPL | n |
---|---|
Employed | 201634 |
Unemployed | 45144 |
Retired | 120524 |
Occupations | 30223 |
BRFSS_B6 <- subset(BRFSS_B6, select = -c(EMPLOY1))
B7. INCOME2 attribute
BRFSS_B6 %>%
group_by(INCOME2) %>%
count()
## # A tibble: 11 x 2
## # Groups: INCOME2 [11]
## INCOME2 n
## <dbl> <int>
## 1 1 14954
## 2 2 16579
## 3 3 23764
## 4 4 29677
## 5 5 35224
## 6 6 46109
## 7 7 54438
## 8 8 114372
## 9 77 29830
## 10 99 30016
## 11 NA 2562
77 - Don’t know/Not sure
99 - Refused
BRFSS_B7 <- BRFSS_B6 %>%
filter(INCOME2 < 77) %>%
mutate(INCOME = case_when(
INCOME2 == 1 ~ "<$20K",
INCOME2 == 2 ~ "<$20K",
INCOME2 == 3 ~ "<$20K",
INCOME2 == 4 ~ "$20K-<$35K",
INCOME2 == 5 ~ "$20K-<$35K",
INCOME2 == 6 ~ "$35K-<$50K",
INCOME2 == 7 ~ "$50K-<$75K",
INCOME2 == 8 ~ "$75K+"))
BRFSS_B7 %>%
group_by(INCOME) %>%
count() %>%
pander()
INCOME | n |
---|---|
$20K-<$35K | 64901 |
$35K-<$50K | 46109 |
$50K-<$75K | 54438 |
$75K+ | 114372 |
<$20K | 55297 |
BRFSS_B7$INCOME <- factor(BRFSS_B7$INCOME,
levels = c("<$20K",
"$20K-<$35K",
"$35K-<$50K",
"$50K-<$75K",
"$75K+"), ordered = TRUE)
BRFSS_B7 %>%
group_by(INCOME) %>%
count() %>%
pander()
INCOME | n |
---|---|
<$20K | 55297 |
$20K-<$35K | 64901 |
$35K-<$50K | 46109 |
$50K-<$75K | 54438 |
$75K+ | 114372 |
BRFSS_B7 <- subset(BRFSS_B7, select = -c(INCOME2))
B8. RENTHOM1 ==> RENTHOME
Do you own or rent your home?
BRFSS_B7 %>%
group_by(RENTHOM1) %>%
count() %>%
pander()
RENTHOM1 | n |
---|---|
1 | 238586 |
2 | 82289 |
3 | 13595 |
7 | 432 |
9 | 215 |
BRFSS_B8 <- BRFSS_B7 %>%
filter(RENTHOM1 <= 3) %>%
mutate(RENTHOME = case_when(
RENTHOM1 == 1 ~ "Own",
RENTHOM1 == 2 ~ "Rent",
RENTHOM1 == 3 ~ "Other"))
BRFSS_B8 %>%
group_by(RENTHOME) %>%
count() %>%
pander()
RENTHOME | n |
---|---|
Other | 13595 |
Own | 238586 |
Rent | 82289 |
BRFSS_B8 <- subset(BRFSS_B8, select = -c(RENTHOM1))
Urban Rural (C)
**C1. _METSTAT**
C2. X_URBSTAT —> Community
BRFSS_B8 %>%
group_by(X_METSTAT) %>%
count()
## # A tibble: 3 x 2
## # Groups: X_METSTAT [3]
## X_METSTAT n
## <dbl> <int>
## 1 1 226904
## 2 2 103903
## 3 NA 3663
3663 records: Not define or missing
BRFSS_B8 %>%
group_by(X_URBSTAT) %>%
count()
## # A tibble: 3 x 2
## # Groups: X_URBSTAT [3]
## X_URBSTAT n
## <dbl> <int>
## 1 1 280125
## 2 2 50682
## 3 NA 3663
BRFSS_C1 <- BRFSS_B8 %>%
filter(X_METSTAT == 1 | X_METSTAT == 2|X_URBSTAT== 1 | X_URBSTAT == 2) %>%
mutate(COMMUN = case_when(
X_METSTAT == 1 & X_URBSTAT == 1 ~ "Urban",
X_METSTAT == 2 & X_URBSTAT == 1 ~ "Suburban",
X_URBSTAT == 2 ~ "Rural" ))
BRFSS_C1 %>%
group_by(COMMUN) %>%
count() %>%
pander()
COMMUN | n |
---|---|
Rural | 50682 |
Suburban | 53221 |
Urban | 226904 |
BRFSS_C1 <- subset(BRFSS_C1, select = -c(X_METSTAT, X_URBSTAT))
D.Lifestyle
D1. X_SMOKER3 Four-level smoker status: Everyday smoker, Someday smoker, Former smoker, Non-smoker
D2. ALCDAY5 Days in past 30 had alcoholic beverage
D3. SLEPTIM1 How Much Time Do You Sleep
D4. EXERANY2 Exercise in Past 30 Days
D1. X_SMOKER3 ===> SMOKESTAT
BRFSS_C1 %>%
group_by(X_SMOKER3) %>%
count()
## # A tibble: 5 x 2
## # Groups: X_SMOKER3 [5]
## X_SMOKER3 n
## <dbl> <int>
## 1 1 34756
## 2 2 13898
## 3 3 93590
## 4 4 181708
## 5 9 6855
BRFSS_D1 <- BRFSS_C1 %>%
filter(X_SMOKER3 < 9) %>%
mutate(SMOKESTAT = case_when(
X_SMOKER3 == 1 ~ "Daily smoker",
X_SMOKER3 == 2 ~ "Someday smoker",
X_SMOKER3 == 3 ~ "Former smoker",
X_SMOKER3 == 4 ~ "Never smoker"
))
BRFSS_D1 %>%
group_by(SMOKESTAT) %>%
count() %>%
pander()
SMOKESTAT | n |
---|---|
Daily smoker | 34756 |
Former smoker | 93590 |
Never smoker | 181708 |
Someday smoker | 13898 |
Columns need to be reordered
BRFSS_D1$SMOKESTAT <- factor(BRFSS_D1$SMOKESTAT,
levels = c("Daily smoker",
"Someday smoker",
"Former smoker",
"Never smoker"), ordered = TRUE)
BRFSS_D1 %>%
group_by(SMOKESTAT) %>%
count() %>%
pander()
SMOKESTAT | n |
---|---|
Daily smoker | 34756 |
Someday smoker | 13898 |
Former smoker | 93590 |
Never smoker | 181708 |
BRFSS_D1 <- subset(BRFSS_D1, select = -c(X_SMOKER3))
D2. ALCDAY5
Should we create only two levels: drinker and non-drinker ? weakly drinker
How many drink per week you have
BRFSS_D1 %>%
group_by(ALCDAY5) %>%
count() %>%
pander()
ALCDAY5 | n |
---|---|
101 | 14082 |
102 | 11485 |
103 | 8158 |
104 | 3928 |
105 | 3747 |
106 | 1256 |
107 | 6197 |
201 | 25700 |
202 | 19546 |
203 | 11411 |
204 | 9555 |
205 | 9115 |
206 | 3531 |
207 | 2468 |
208 | 2810 |
209 | 217 |
210 | 7438 |
211 | 46 |
212 | 1536 |
213 | 87 |
214 | 472 |
215 | 5866 |
216 | 247 |
217 | 71 |
218 | 207 |
219 | 11 |
220 | 5975 |
221 | 184 |
222 | 137 |
223 | 55 |
224 | 196 |
225 | 2603 |
226 | 134 |
227 | 207 |
228 | 723 |
229 | 290 |
230 | 13156 |
777 | 2170 |
888 | 146951 |
999 | 584 |
NA | 1400 |
Collect only monthly drinker
888 : no drink
BRFSS_D2 <- BRFSS_D1 %>%
filter(ALCDAY5 < 777 | ALCDAY5 == 888) %>%
mutate(ALC = case_when(
ALCDAY5 == 888 ~ 200,
ALCDAY5 == 101 ~ 204,
ALCDAY5 == 102 ~ 208,
ALCDAY5 == 103 ~ 212,
ALCDAY5 == 104 ~ 216,
ALCDAY5 == 105 ~ 220,
ALCDAY5 == 106 ~ 224,
ALCDAY5 == 107 ~ 228,
ALCDAY5 == 201 ~ 201,
ALCDAY5 == 202 ~ 202,
ALCDAY5 == 203 ~ 203,
ALCDAY5 == 204 ~ 204,
ALCDAY5 == 205 ~ 205,
ALCDAY5 == 206 ~ 206,
ALCDAY5 == 207 ~ 207,
ALCDAY5 == 208 ~ 208,
ALCDAY5 == 209 ~ 209,
ALCDAY5 == 210 ~ 210,
ALCDAY5 == 211 ~ 211,
ALCDAY5 == 212 ~ 212,
ALCDAY5 == 213 ~ 213,
ALCDAY5 == 214 ~ 214,
ALCDAY5 == 215 ~ 215,
ALCDAY5 == 216 ~ 216,
ALCDAY5 == 217 ~ 207,
ALCDAY5 == 218 ~ 218,
ALCDAY5 == 219 ~ 219,
ALCDAY5 == 220 ~ 220,
ALCDAY5 == 221 ~ 221,
ALCDAY5 == 222 ~ 222,
ALCDAY5 == 223 ~ 223,
ALCDAY5 == 224 ~ 224,
ALCDAY5 == 225 ~ 225,
ALCDAY5 == 226 ~ 226,
ALCDAY5 == 227 ~ 227,
ALCDAY5 == 228 ~ 228,
ALCDAY5 == 229 ~ 229,
ALCDAY5 == 230 ~ 230))
BRFSS_D2 %>%
group_by(ALC) %>%
count()
## # A tibble: 30 x 2
## # Groups: ALC [30]
## ALC n
## <dbl> <int>
## 1 200 146951
## 2 201 25700
## 3 202 19546
## 4 203 11411
## 5 204 23637
## 6 205 9115
## 7 206 3531
## 8 207 2539
## 9 208 14295
## 10 209 217
## # ... with 20 more rows
Now we need to subtract the value of the whole column by 200
BRFSS_D2a <- BRFSS_D2 %>%
mutate(ALCMONTH = ALC - 200)
BRFSS_D2a %>%
group_by(ALCMONTH) %>%
count()
## # A tibble: 30 x 2
## # Groups: ALCMONTH [30]
## ALCMONTH n
## <dbl> <int>
## 1 0 146951
## 2 1 25700
## 3 2 19546
## 4 3 11411
## 5 4 23637
## 6 5 9115
## 7 6 3531
## 8 7 2539
## 9 8 14295
## 10 9 217
## # ... with 20 more rows
ggplot(data = BRFSS_D2a, aes(ALCMONTH)) +
geom_bar(stat= "count",
fill= "blue",
alpha = .75) +
coord_cartesian(clip = "off") +
labs( x = "Number of days",
y = "Frequency",
subtitle = "How many days per month did you have at least one drink during the past 30 days?",
caption = "Data source: BRFSS2018") +
scale_y_continuous(limits = c(0, 150000), expand = c(0, 0)) +
scale_x_continuous(limits = c(-1, 31), expand = c(0, 0)) +
theme(
axis.text = element_text(size = 16),
axis.title = element_text(size = 12,face="bold"),
plot.title = element_text(color = "blue", size = 16, face = "bold"),
plot.subtitle = element_text(color = "navyblue", size = 12),
plot.caption = element_text(color = "grey",size = 16, face = "italic")
)
BRFSS_D2b <- BRFSS_D2a %>%
mutate(ALCpa30 = if_else(ALCMONTH == 0, "0",
if_else(ALCMONTH >= 1 & ALCMONTH < 10, "1-9","10-30")))
BRFSS_D2b %>%
group_by(ALCpa30) %>%
count() %>%
pander()
ALCpa30 | n |
---|---|
0 | 146951 |
1-9 | 109991 |
10-30 | 62856 |
BRFSS_D2b <- subset(BRFSS_D2b, select = -c(ALCDAY5, ALC, ALCMONTH))
D3. SLEPTIM1 (Numeric variable)
On average, how many hours of sleep do you get in a 24-hour period?
BRFSS_D2b %>%
group_by(SLEPTIM1) %>%
count()
## # A tibble: 26 x 2
## # Groups: SLEPTIM1 [26]
## SLEPTIM1 n
## <dbl> <int>
## 1 1 588
## 2 2 952
## 3 3 2251
## 4 4 9024
## 5 5 21089
## 6 6 70453
## 7 7 96923
## 8 8 91379
## 9 9 13913
## 10 10 7187
## # ... with 16 more rows
The categories IDK/INS (77) ans refused (99) counts for (3552+270)/389181 = 0.98 % of data == > remove
BRFSS_D3 <- BRFSS_D2b %>%
filter(SLEPTIM1 < 77)
BRFSS_D3 %>%
group_by(SLEPTIM1) %>%
count() %>%
pander()
SLEPTIM1 | n |
---|---|
1 | 588 |
2 | 952 |
3 | 2251 |
4 | 9024 |
5 | 21089 |
6 | 70453 |
7 | 96923 |
8 | 91379 |
9 | 13913 |
10 | 7187 |
11 | 449 |
12 | 2223 |
13 | 109 |
14 | 272 |
15 | 226 |
16 | 249 |
17 | 23 |
18 | 124 |
19 | 1 |
20 | 78 |
21 | 8 |
22 | 12 |
23 | 8 |
24 | 32 |
BRFSS_D3 %>%
filter(SLEPTIM1 > 13) %>%
count()
## n
## 1 1033
This number represent 1033/317573 = 0.32%
BRFSS_D3 %>%
filter(SLEPTIM1 < 4) %>%
count()
## n
## 1 3791
This number represent 3791/317573 = 1.19% of respondents.
ggplot(data = BRFSS_D3, aes(SLEPTIM1)) +
geom_bar(stat= "count",
fill= "blue",
alpha = .75) +
coord_cartesian(clip = "off") +
labs( x = "Number of hours",
y = "Frequency",
subtitle = "On average, how many hours of sleep do you get in a 24-hour period?",
caption = "Data source: BRFSS2018") +
scale_y_continuous(limits = c(0, 100000), expand = c(0, 0)) +
scale_x_continuous(limits = c(-1, 25), expand = c(0, 0)) +
theme(
axis.text = element_text(size = 18),
axis.title = element_text(size =18 ,face="bold"),
plot.subtitle = element_text(color = "navyblue", size = 18),
plot.caption = element_text(color = "grey",size = 18, face = "italic")
)
boxplot(BRFSS_D3$SLEPTIM1,
ylim= c(0,30),
names = c("days"),
col = c("blue"),
ylab = "Duration (hours)",
border = c("red"),
boxwex = 0.5)
Removing all the record where pp sleep more than 12h and less than 3 hours
BRFSS_D3a <- BRFSS_D3 %>%
filter(SLEPTIM1 < 13 & SLEPTIM1 > 2)
boxplot(BRFSS_D3a$SLEPTIM1,
ylim= c(0,30),
names = c("days"),
col = c("blue"),
main = "Distribution of sleep duration",
ylab = "Duration (s)",
border = c("red"),
boxwex = 0.5)
ggplot(data = BRFSS_D3a, aes(SLEPTIM1)) +
geom_bar(stat= "count",
fill= "blue",
alpha = .75) +
coord_cartesian(clip = "off") +
labs( x = "Number of hours",
y = "Frequency",
subtitle = "On average, how many hours of sleep do you get in a 24-hour period?",
caption = "Data source: BRFSS2018") +
scale_y_continuous(limits = c(0, 100000), expand = c(0, 0)) +
scale_x_continuous(limits = c(0, 15), expand = c(0, 0)) +
theme(
axis.text = element_text(size = 12),
axis.title = element_text(size = ,face="bold"),
plot.subtitle = element_text(color = "navyblue", size = 12),
plot.caption = element_text(color = "grey",size = 12, face = "italic")
)
BRFSS_D3a %>%
summarise(mean_slepduratrion = mean(SLEPTIM1),
median_sleepduration = median(SLEPTIM1),
sd_sleepduration = sd(SLEPTIM1))
## mean_slepduratrion median_sleepduration sd_sleepduration
## 1 7.015783 7 1.317316
Testing the normal distribution
qqnorm(BRFSS_D3a$SLEPTIM1, pch = 1, frame = FALSE)
qqline(BRFSS_D3a$SLEPTIM1, col = "steelblue", lwd = 2)
ggplot(BRFSS_D3a, aes(x=SLEPTIM1)) +
geom_histogram(binwidth = 1,
color = "blue",
fill = "white") +
geom_vline(aes(xintercept = mean(SLEPTIM1)),
color = "red",
linetype = "dashed",
size = 1) +
labs(x = "Number of hours",
y = "Frequency",
subtitle = "On average, how many hours of sleep do you get in a 24-hour period?",
caption = "Data source: BRFSS2018") +
scale_y_continuous(limits = c(0, 110000), expand = c(0, 0)) +
scale_x_continuous(limits = c(0, 15), expand = c(0, 0)) +
theme(
axis.text = element_text(size =14),
axis.title = element_text(size = ,face="bold"),
plot.title = element_text(color = "blue", size = 14, face = "bold"),
plot.subtitle = element_text(color = "navyblue", size = 14),
plot.caption = element_text(color = "grey",size = 14, face = "italic")
)
## Warning: Removed 2 rows containing missing values (geom_bar).
D4. EXERANY2
Exercise in Past 30 Days
BRFSS_D3a %>%
group_by(EXERANY2) %>%
count()
## # A tibble: 4 x 2
## # Groups: EXERANY2 [4]
## EXERANY2 n
## <dbl> <int>
## 1 1 240386
## 2 2 74238
## 3 7 227
## 4 9 40
The categories IDK/INS (77) ans refused (99) counts for (40+227)/314891 = 0.12 % of data ==> remove
BRFSS_D4 <- BRFSS_D3a %>%
filter(EXERANY2 == 1 | EXERANY2 == 2) %>%
mutate(EXERpa30 = case_when(
EXERANY2 == 1 ~ "Yes",
EXERANY2 == 2 ~ "No"
))
BRFSS_D4 %>%
group_by(EXERpa30) %>%
count() %>%
pander()
EXERpa30 | n |
---|---|
No | 74238 |
Yes | 240386 |
BRFSS_D4 <- subset(BRFSS_D4, select = -c(EXERANY2))
E.Health Status
E1. GENHLTH
BRFSS_E1 <- BRFSS_D4 %>%
filter(GENHLTH < 7)
BRFSS_E1$GENHLTH[BRFSS_E1$GENHLTH == 1] <- "Excellent"
BRFSS_E1$GENHLTH[BRFSS_E1$GENHLTH == 2] <- "Very Good"
BRFSS_E1$GENHLTH[BRFSS_E1$GENHLTH == 3] <- "Good"
BRFSS_E1$GENHLTH[BRFSS_E1$GENHLTH == 4] <- "Fair"
BRFSS_E1$GENHLTH[BRFSS_E1$GENHLTH == 5] <- "Poor"
BRFSS_E1 %>%
group_by(GENHLTH) %>%
count()
## # A tibble: 5 x 2
## # Groups: GENHLTH [5]
## GENHLTH n
## <chr> <int>
## 1 Excellent 52772
## 2 Fair 41412
## 3 Good 98141
## 4 Poor 14961
## 5 Very Good 106871
BRFSS_E1$GENHLTH <- factor(BRFSS_E1$GENHLTH,
levels = c("Poor",
"Fair",
"Good",
"Very Good",
"Excellent"), ordered = TRUE)
BRFSS_E1 %>%
group_by(GENHLTH) %>%
count() %>%
pander()
GENHLTH | n |
---|---|
Poor | 14961 |
Fair | 41412 |
Good | 98141 |
Very Good | 106871 |
Excellent | 52772 |
F.Healthy Days (Health Related Quality of Life, HRQL)
F1. PHYSHLTH (Number of Days Physical Health Not Good)
F2. MENTHLTH (Number of Days Mental Health Not Good)
F3. POORHLTH (Poor Physical or Mental Health)- note 205475 recored are missing
E1. PHYSHLTH
Now thinking about your physical health, which includes physical illness and injury, for how many days during the past 30 days was your physical health not good?
BRFSS_E1 %>%
group_by(PHYSHLTH) %>%
count()
## # A tibble: 34 x 2
## # Groups: PHYSHLTH [34]
## PHYSHLTH n
## <dbl> <int>
## 1 1 13842
## 2 2 18297
## 3 3 10596
## 4 4 5547
## 5 5 9454
## 6 6 1607
## 7 7 5505
## 8 8 1052
## 9 9 276
## 10 10 6964
## # ... with 24 more rows
BRFSS_F1 <- BRFSS_E1 %>%
filter(PHYSHLTH < 77 | PHYSHLTH == 88)
BRFSS_F1$PHYSHLTH[BRFSS_F1$PHYSHLTH == 88] <- 0
BRFSS_F1 %>%
group_by(PHYSHLTH) %>%
count()
## # A tibble: 31 x 2
## # Groups: PHYSHLTH [31]
## PHYSHLTH n
## <dbl> <int>
## 1 0 195129
## 2 1 13842
## 3 2 18297
## 4 3 10596
## 5 4 5547
## 6 5 9454
## 7 6 1607
## 8 7 5505
## 9 8 1052
## 10 9 276
## # ... with 21 more rows
ggplot(data = BRFSS_F1, aes(PHYSHLTH)) +
geom_bar(stat= "count",
fill= "purple",
alpha = .75) +
coord_cartesian(clip = "off") +
labs( x = "Number of days",
y = "Frequency",
subtitle = "How many days during the past 30 days was your physical health not good?",
caption = "Data source: BRFSS2018") +
scale_y_continuous(limits = c(0, 200000), expand = c(0, 0)) +
scale_x_continuous(limits = c(-1, 31), expand = c(0, 0)) +
theme(
axis.text = element_text(size = 18),
axis.title = element_text(size = ,face="bold"),
plot.caption = element_text(color = "grey",size = 18, face = "italic")
)
Once again, 10, 15, 20 and 30 seems to be a magic number (as we have seen in the drinking varaible ALC5)
boxplot(BRFSS_F1$PHYSHLTH,
ylim= c(0,30),
names = c("days"),
col = c("blue"),
main = "Distribution of days",
ylab = "Duration (s)",
border = c("red"),
boxwex = 0.5)
regrouping
BRFSS_F1a <- BRFSS_F1 %>%
mutate(PHYSHLTHpa30 = if_else(PHYSHLTH == 0, "0",
if_else(PHYSHLTH >= 1 & PHYSHLTH < 10, "1-9","10-30")))
BRFSS_F1a %>%
group_by(PHYSHLTHpa30) %>%
count() %>%
pander()
PHYSHLTHpa30 | n |
---|---|
0 | 195129 |
1-9 | 66176 |
10-30 | 48681 |
BRFSS_F1a <- subset(BRFSS_F1a, select = -c(PHYSHLTH))
F2. MENTHLTH
(Number of Days Mental Health Not Good)
Now thinking about your mental health, which includes stress, depression, and problems with emotions, for how many days during the past 30 days was your mental health not good?
BRFSS_F1a %>%
group_by(MENTHLTH) %>%
count()
## # A tibble: 33 x 2
## # Groups: MENTHLTH [33]
## MENTHLTH n
## <dbl> <int>
## 1 1 10546
## 2 2 16040
## 3 3 9669
## 4 4 4957
## 5 5 11826
## 6 6 1255
## 7 7 4646
## 8 8 968
## 9 9 137
## 10 10 8723
## # ... with 23 more rows
BRFSS_F2 <- BRFSS_F1a %>%
filter(MENTHLTH < 77 | MENTHLTH == 88)
Converting 88 to 0
BRFSS_F2$MENTHLTH[BRFSS_F2$MENTHLTH == 88] <- 0
Checking
BRFSS_F2 %>%
group_by(MENTHLTH) %>%
count()
## # A tibble: 31 x 2
## # Groups: MENTHLTH [31]
## MENTHLTH n
## <dbl> <int>
## 1 0 203554
## 2 1 10546
## 3 2 16040
## 4 3 9669
## 5 4 4957
## 6 5 11826
## 7 6 1255
## 8 7 4646
## 9 8 968
## 10 9 137
## # ... with 21 more rows
ggplot(data = BRFSS_F2, aes(MENTHLTH)) +
geom_bar(stat= "count",
fill= "red",
alpha = .75) +
coord_cartesian(clip = "off") +
labs( x = "Number of days",
y = "Frequency",
subtitle = "How many days during the past 30 days was your mental health not good?",
caption = "Data source: BRFSS2018") +
scale_y_continuous(limits = c(0, 250000), expand = c(0, 0)) +
scale_x_continuous(limits = c(-1, 31), expand = c(0, 0)) +
theme(
axis.text = element_text(size = 18),
plot.title = element_text(color = "blue", size = 20, face = "bold"),
plot.subtitle = element_text(color = "navyblue", size = 18),
plot.caption = element_text(color = "grey",size = 18, face = "italic")
)
ggplot(data = BRFSS_F2, aes(MENTHLTH)) +
geom_bar(stat= "count",
fill= "red",
alpha = .75) +
coord_cartesian(clip = "off") +
labs( x = "Number of days",
y = "Frequency",
subtitle = "How many days during the past 30 days was your mental health not good?",
caption = "Data source: BRFSS2018") +
scale_y_continuous(limits = c(0, 20000), expand = c(0, 0)) +
scale_x_continuous(limits = c(1, 31), expand = c(0, 0)) +
theme(
axis.text = element_text(size = 18),
plot.title = element_text(color = "blue", size = 20, face = "bold"),
plot.subtitle = element_text(color = "navyblue", size = 18),
plot.caption = element_text(color = "grey",size = 18, face = "italic")
)
## Warning: Removed 203554 rows containing non-finite values (stat_count).
## Warning: Removed 1 rows containing missing values (geom_bar).
Regrouping
BRFSS_F2a <- BRFSS_F2 %>%
mutate(MENTHLTHpa30 = if_else(MENTHLTH == 0, "0",
if_else(MENTHLTH >= 1 & MENTHLTH < 10, "1-9","10-30")))
BRFSS_F2a %>%
group_by(MENTHLTHpa30) %>%
count() %>%
pander()
MENTHLTHpa30 | n |
---|---|
0 | 203554 |
1-9 | 60044 |
10-30 | 43905 |
BRFSS_F2a <- subset(BRFSS_F2a, select = -c(MENTHLTH))
F3. POORHLTH
BRFSS_F2a %>%
group_by(POORHLTH) %>%
count()
## # A tibble: 34 x 2
## # Groups: POORHLTH [34]
## POORHLTH n
## <dbl> <int>
## 1 1 8603
## 2 2 9562
## 3 3 6094
## 4 4 3368
## 5 5 6546
## 6 6 1101
## 7 7 3198
## 8 8 810
## 9 9 181
## 10 10 5273
## # ... with 24 more rows
sum(is.na(BRFSS_F2a$POORHLTH))
## [1] 147173
We should not use this attribute.
BRFSS_F2a <- subset(BRFSS_F2a, select = -c(POORHLTH))
G. Health Care Access
G1. HLTHPLN1 (Have any health care coverage)
G2. MEDCOST (Could Not See Doctor Because of Cost)
G3. CHECKUP1
G1. HLTHPLN1
(Have any health care coverage) —> HLTHPLN
BRFSS_F2a %>%
group_by(HLTHPLN1) %>%
count()
## # A tibble: 4 x 2
## # Groups: HLTHPLN1 [4]
## HLTHPLN1 n
## <dbl> <int>
## 1 1 283678
## 2 2 23233
## 3 7 459
## 4 9 133
BRFSS_G1 <- BRFSS_F2a %>%
filter(HLTHPLN1 == 1 | HLTHPLN1 == 2) %>%
mutate(HLTHPLN = case_when(
HLTHPLN1 == 1 ~ "Yes",
HLTHPLN1 == 2 ~ "No"))
BRFSS_G1 %>%
group_by(HLTHPLN) %>%
count()
## # A tibble: 2 x 2
## # Groups: HLTHPLN [2]
## HLTHPLN n
## <chr> <int>
## 1 No 23233
## 2 Yes 283678
BRFSS_G1 <- subset(BRFSS_G1, select = -c(HLTHPLN1))
G2.MEDCOST
(Could Not See Doctor Because of Cost) —> MEDICOST
BRFSS_G1 %>%
group_by(MEDCOST) %>%
count()
## # A tibble: 5 x 2
## # Groups: MEDCOST [5]
## MEDCOST n
## <dbl> <int>
## 1 1 30826
## 2 2 275050
## 3 7 359
## 4 9 47
## 5 NA 629
BRFSS_G2 <- BRFSS_G1 %>%
filter(MEDCOST == 1 | MEDCOST == 2) %>%
mutate(MEDICOST= case_when(
MEDCOST == 1 ~ "Yes",
MEDCOST == 2 ~ "No"))
BRFSS_G2 %>%
group_by(MEDICOST) %>%
count()
## # A tibble: 2 x 2
## # Groups: MEDICOST [2]
## MEDICOST n
## <chr> <int>
## 1 No 275050
## 2 Yes 30826
BRFSS_G2 <- subset(BRFSS_G2, select = -c(MEDCOST))
G3. CHECKUP1 —> CHECKUP
BRFSS_G2 %>%
group_by(CHECKUP1) %>%
count()
## # A tibble: 7 x 2
## # Groups: CHECKUP1 [7]
## CHECKUP1 n
## <dbl> <int>
## 1 1 243032
## 2 2 29751
## 3 3 15305
## 4 4 14276
## 5 7 1958
## 6 8 1488
## 7 9 66
BRFSS_G3 <- BRFSS_G2 %>%
filter(CHECKUP1 < 5 | CHECKUP1 == 8) %>%
mutate(CHECKUP = case_when(CHECKUP1 == 1 ~ "<1",
CHECKUP1 == 2 ~ "1-<2",
CHECKUP1 == 3 ~ "2-<5",
CHECKUP1 == 4 ~ "5+",
CHECKUP1 == 8 ~ "5+"
))
BRFSS_G3%>%
group_by(CHECKUP) %>%
count() %>%
pander()
CHECKUP | n |
---|---|
<1 | 243032 |
1-<2 | 29751 |
2-<5 | 15305 |
5+ | 15764 |
BRFSS_G3 <- subset(BRFSS_G3, select = -c(CHECKUP1))
Chronic Health Conditions (G)
H1. ASTHMA3
H2. CHCCOPD1 (Ever told) you have chronic obstructive pulmonary disease, emphysema or chronic bronchitis?
H3. HAVARTH3 Told Have Arthritis
H4. ADDEPEV2 Ever told you had a depressive disorder
H5. CHCKDNY1 (Ever told) you have kidney disease?
H6. DIABETE3 (Ever told) you have diabetes)
H7A. CHCSCNCR (Ever told) you had skin cancer? YES/NO
H7B. CHCOCNCR (Ever told) you had any other types of cancer? YES/NO
H8. CVDSTRK3
H1. ASTHMA3 —> ASTHMA
BRFSS_G3 %>%
group_by(ASTHMA3) %>%
count() %>%
pander
ASTHMA3 | n |
---|---|
1 | 42582 |
2 | 260647 |
7 | 598 |
9 | 25 |
BRFSS_H1 <- BRFSS_G3 %>%
filter(ASTHMA3 < 7) %>%
mutate(ASTHMA = case_when(
ASTHMA3 == 1 ~ "Yes",
ASTHMA3 == 2 ~ "No"))
BRFSS_H1 %>%
group_by(ASTHMA) %>%
count() %>%
pander()
ASTHMA | n |
---|---|
No | 260647 |
Yes | 42582 |
BRFSS_H1 <- subset(BRFSS_H1, select = -c(ASTHMA3))
H2. CHCCOPD1—> PULMOND
BRFSS_H1 %>%
group_by(CHCCOPD1) %>%
count()
## # A tibble: 4 x 2
## # Groups: CHCCOPD1 [4]
## CHCCOPD1 n
## <dbl> <int>
## 1 1 24720
## 2 2 277594
## 3 7 902
## 4 9 13
BRFSS_H2 <- BRFSS_H1 %>%
filter(CHCCOPD1 < 7) %>%
mutate(PULMOND = case_when(CHCCOPD1 == 1 ~ "Yes",
CHCCOPD1 == 2 ~ "No"))
BRFSS_H2 <- subset(BRFSS_H2, select = -c(CHCCOPD1))
H3. HAVARTH3
Told Have Arthritis —> ARTHRITIS
ARTHRITIS (Ever told) you have some form of arthritis, rheumatoid arthritis, gout, lupus, or fibromyalgia? (Arthritis diagnoses include: rheumatism, polymyalgia rheumatica; osteoarthritis (not osteporosis); tendonitis, bursitis, bunion, tennis elbow; carpal tunnel syndrome, tarsal tunnel syndrome; joint infection, etc.)
BRFSS_H2 %>%
group_by(HAVARTH3) %>%
count()
## # A tibble: 4 x 2
## # Groups: HAVARTH3 [4]
## HAVARTH3 n
## <dbl> <int>
## 1 1 101580
## 2 2 199636
## 3 7 1086
## 4 9 12
BRFSS_H3 <- BRFSS_H2 %>%
filter(HAVARTH3 < 7) %>%
mutate(ARTHRITIS = case_when(
HAVARTH3 == 1 ~ "Yes",
HAVARTH3 == 2 ~ "No"
))
BRFSS_H3 %>%
group_by(ARTHRITIS) %>%
count() %>%
pander()
ARTHRITIS | n |
---|---|
No | 199636 |
Yes | 101580 |
BRFSS_H3 <- subset(BRFSS_H3, select = -c(HAVARTH3))
H4. ADDEPEV2
Ever told you had a depressive disorder —> DEPRESSION
BRFSS_H3 %>%
group_by(ADDEPEV2) %>%
count()
## # A tibble: 4 x 2
## # Groups: ADDEPEV2 [4]
## ADDEPEV2 n
## <dbl> <int>
## 1 1 56919
## 2 2 243543
## 3 7 694
## 4 9 60
BRFSS_H4 <- BRFSS_H3 %>%
filter(ADDEPEV2 < 7) %>%
mutate(DEPRESSION = case_when(ADDEPEV2 == 1 ~ "Yes",
ADDEPEV2 == 2 ~ "No"))
BRFSS_H4 %>%
group_by(DEPRESSION) %>%
count() %>%
pander ()
DEPRESSION | n |
---|---|
No | 243543 |
Yes | 56919 |
BRFSS_H4 <- subset(BRFSS_H4, select = -c(ADDEPEV2))
H5. CHCKDNY1
(Ever told) you have kidney disease? —> KIDNEY
BRFSS_H4 %>%
group_by(CHCKDNY1) %>%
count()
## # A tibble: 4 x 2
## # Groups: CHCKDNY1 [4]
## CHCKDNY1 n
## <dbl> <int>
## 1 1 10977
## 2 2 288826
## 3 7 649
## 4 9 10
BRFSS_H5 <- BRFSS_H4 %>%
filter(CHCKDNY1 < 7) %>%
mutate(KIDNEY = case_when(CHCKDNY1 == 1 ~ "Yes",
CHCKDNY1 == 2 ~ "No"))
BRFSS_H5 %>%
group_by(KIDNEY) %>%
count() %>%
pander()
KIDNEY | n |
---|---|
No | 288826 |
Yes | 10977 |
BRFSS_H5 <- subset(BRFSS_H5, select = -c(CHCKDNY1))
H6. DIABETE3
(Ever told) you have diabetes) —> DIABETE
BRFSS_H5 %>%
group_by(DIABETE3) %>%
count()
## # A tibble: 6 x 2
## # Groups: DIABETE3 [6]
## DIABETE3 n
## <dbl> <int>
## 1 1 40118
## 2 2 2669
## 3 3 251355
## 4 4 5432
## 5 7 219
## 6 9 10
BRFSS_H6 <- BRFSS_H5 %>%
filter(DIABETE3 < 7) %>%
mutate(DIABETE = case_when(
(DIABETE3 == 1 | DIABETE3 == 2) ~ "Yes",
(DIABETE3 == 3 | DIABETE3 == 4) ~ "No"
))
BRFSS_H6 %>%
group_by(DIABETE) %>%
count()
## # A tibble: 2 x 2
## # Groups: DIABETE [2]
## DIABETE n
## <chr> <int>
## 1 No 256787
## 2 Yes 42787
BRFSS_H6 <- subset(BRFSS_H6, select = -c(DIABETE3))
G7. CHCSCNCR + CHCOCNCR —> CANCER
CHCSCNCR (Ever told) you had skin cancer? YES/NO
CHCOCNCR (Ever told) you had any other types of cancer? YES/NO
BRFSS_H6 %>%
group_by(CHCSCNCR) %>%
count()
## # A tibble: 4 x 2
## # Groups: CHCSCNCR [4]
## CHCSCNCR n
## <dbl> <int>
## 1 1 28173
## 2 2 270829
## 3 7 560
## 4 9 12
BRFSS_H6 %>%
group_by(CHCOCNCR) %>%
count()
## # A tibble: 4 x 2
## # Groups: CHCOCNCR [4]
## CHCOCNCR n
## <dbl> <int>
## 1 1 28799
## 2 2 270320
## 3 7 428
## 4 9 27
BRFSS_H7 <- BRFSS_H6 %>%
filter(CHCSCNCR < 7 & CHCOCNCR < 7) %>%
mutate(CANCER = case_when(
CHCSCNCR == 1 & CHCOCNCR == 1 ~ "Yes",
CHCSCNCR == 1 & CHCOCNCR == 2 ~ "Yes",
CHCSCNCR == 2 & CHCOCNCR == 1 ~ "Yes",
CHCSCNCR == 2 & CHCOCNCR == 2 ~ "No"
))
BRFSS_H7 %>%
group_by(CANCER) %>%
count() %>%
pander()
CANCER | n |
---|---|
No | 248039 |
Yes | 50530 |
(So far) There are 16.92 % of respondents who have at least one cancer.
BRFSS_H7 <- subset(BRFSS_H7, select = -c(CHCSCNCR,CHCOCNCR))
H8.CVDSTRK3 (Ever told) you had a stroke
BRFSS_H7 %>%
group_by(CVDSTRK3) %>%
count()
## # A tibble: 4 x 2
## # Groups: CVDSTRK3 [4]
## CVDSTRK3 n
## <dbl> <int>
## 1 1 11904
## 2 2 286174
## 3 7 484
## 4 9 7
BRFSS_H8 <- BRFSS_H7 %>%
filter(CVDSTRK3 < 7) %>%
mutate(STROKE = case_when(
CVDSTRK3 == 1 ~ "Yes",
CVDSTRK3 == 2 ~ "No"
))
BRFSS_H8 %>%
group_by(STROKE) %>%
count() %>%
pander()
STROKE | n |
---|---|
No | 286174 |
Yes | 11904 |
BRFSS_H8 <- subset(BRFSS_H8, select = -c(CVDSTRK3))
I. Oral Health
I1.LASTDEN4 (Last Visited Dentist or Dental Clinic)
I2.RMVTETH4 (Number of Permanent Teeth Removed)
I1. RMVTETH4 —> RMVTETH
Not including teeth lost for injury or orthodontics, how many of your permanent teeth have been removed because of tooth decay or gum disease?
BRFSS_H8 %>%
group_by(RMVTETH4) %>%
count()
## # A tibble: 7 x 2
## # Groups: RMVTETH4 [7]
## RMVTETH4 n
## <dbl> <int>
## 1 1 87317
## 2 2 34351
## 3 3 18417
## 4 7 4233
## 5 8 153600
## 6 9 159
## 7 NA 1
The category “Refused” has 299 records ==> removed these records
BRFSS_I1 <- BRFSS_H8 %>%
filter(RMVTETH4 == 8 | RMVTETH4 < 4) %>%
mutate(RMVTETH = case_when(
RMVTETH4 == 1 ~ "1-5",
RMVTETH4 == 2 ~ ">= 6",
RMVTETH4 == 3 ~ "All",
RMVTETH4 == 8 ~ "0"
))
BRFSS_I1 %>%
group_by(RMVTETH) %>%
count()
## # A tibble: 4 x 2
## # Groups: RMVTETH [4]
## RMVTETH n
## <chr> <int>
## 1 >= 6 34351
## 2 0 153600
## 3 1-5 87317
## 4 All 18417
BRFSS_I1$RMVTETH <- factor(BRFSS_I1$RMVTETH,
levels= c("0",
"1-5",
">= 6",
"All"), ordered = TRUE)
BRFSS_I1%>%
group_by(RMVTETH) %>%
count() %>%
pander()
RMVTETH | n |
---|---|
0 | 153600 |
1-5 | 87317 |
>= 6 | 34351 |
All | 18417 |
BRFSS_I1 <- subset(BRFSS_I1, select = -c(RMVTETH4))
I2. LASTDEN4
BRFSS_I1 %>%
group_by(LASTDEN4) %>%
count()
## # A tibble: 7 x 2
## # Groups: LASTDEN4 [7]
## LASTDEN4 n
## <dbl> <int>
## 1 1 203716
## 2 2 30679
## 3 3 25735
## 4 4 30912
## 5 7 1409
## 6 8 1164
## 7 9 70
BRFSS_I2 <- BRFSS_I1 %>%
filter(LASTDEN4 < 7| LASTDEN4 == 8) %>%
mutate(LASTDENTV = case_when(
LASTDEN4 == 1 ~ "<1",
LASTDEN4 == 2 ~ "1-<2",
LASTDEN4 == 3 ~ "2-<5",
LASTDEN4 == 4 ~ "5+",
LASTDEN4 == 8 ~ "5+"
))
BRFSS_I2 %>%
group_by(LASTDENTV) %>%
count()
## # A tibble: 4 x 2
## # Groups: LASTDENTV [4]
## LASTDENTV n
## <chr> <int>
## 1 <1 203716
## 2 1-<2 30679
## 3 2-<5 25735
## 4 5+ 32076
BRFSS_I2$LASTDENTV <- factor(BRFSS_I2$LASTDENTV,
levels = c("<1",
"1-<2",
"2-<5",
"5+"), ordered = TRUE)
BRFSS_I2%>%
group_by(LASTDENTV) %>%
count() %>%
pander()
LASTDENTV | n |
---|---|
<1 | 203716 |
1-<2 | 30679 |
2-<5 | 25735 |
5+ | 32076 |
BRFSS_I2 <- subset(BRFSS_I2, select = -c(LASTDEN4))
J. Obesity
J1. X_BMI5CAT—> BMICAT
Four-categories of Body Mass Index (BMI)
1 = underweight, 2 = normal, 3 = overweight, 4 = obese, null = missing.
BRFSS_I2 %>%
group_by(X_BMI5CAT) %>%
count()
## # A tibble: 5 x 2
## # Groups: X_BMI5CAT [5]
## X_BMI5CAT n
## <dbl> <int>
## 1 1 4037
## 2 2 83434
## 3 3 101763
## 4 4 91768
## 5 NA 11204
BRFSS_J1 <- BRFSS_I2 %>%
filter(X_BMI5CAT < 5) %>%
mutate (BMICAT = case_when(X_BMI5CAT == 1 ~ "Underweight",
X_BMI5CAT == 2 ~ "Normal",
X_BMI5CAT == 3 ~ "Overweight",
X_BMI5CAT == 4 ~ "Obese"))
BRFSS_J1 %>%
group_by(BMICAT) %>%
count() %>%
pander()
BMICAT | n |
---|---|
Normal | 83434 |
Obese | 91768 |
Overweight | 101763 |
Underweight | 4037 |
BRFSS_J1$BMICAT <- factor(BRFSS_J1$BMICAT,
levels = c("Underweight",
"Normal",
"Overweight",
"Obese"), ordered = TRUE)
BRFSS_J1 %>%
group_by(BMICAT) %>%
count() %>%
pander()
BMICAT | n |
---|---|
Underweight | 4037 |
Normal | 83434 |
Overweight | 101763 |
Obese | 91768 |
BRFSS_J1 <- subset(BRFSS_J1, select = -c(X_BMI5CAT))
MICHD (Outcome) (K)
K. X_MICHD The outcome —> MICHD
BRFSS_J1 %>%
group_by(X_MICHD) %>%
count()
## # A tibble: 3 x 2
## # Groups: X_MICHD [3]
## X_MICHD n
## <dbl> <int>
## 1 1 24594
## 2 2 254978
## 3 NA 1430
BRFSS_K <- BRFSS_J1 %>%
filter(X_MICHD < 3) %>%
mutate(MICHD = case_when(X_MICHD == 1 ~ "Yes",
X_MICHD == 2 ~ "No"))
BRFSS_K %>%
group_by(MICHD) %>%
count() %>%
pander()
MICHD | n |
---|---|
No | 254978 |
Yes | 24594 |
24713/(24713+255253) = 8.8%
BRFSS_K <- subset(BRFSS_K, select = -c(X_MICHD))
str(BRFSS_K)
## 'data.frame': 279572 obs. of 34 variables:
## $ X_STATE : num 1 1 1 1 1 1 1 1 1 1 ...
## $ IDAY : chr "05" "12" "03" "10" ...
## $ SLEPTIM1 : num 7 5 6 8 6 7 10 6 6 6 ...
## $ GENHLTH : Ord.factor w/ 5 levels "Poor"<"Fair"<..: 4 3 5 5 4 3 5 3 4 2 ...
## $ MONTH : Ord.factor w/ 12 levels "Jan"<"Fev"<"Mar"<..: 1 1 1 1 1 1 1 1 1 1 ...
## $ AGE : chr "80+" "30-39" "60-69" "70-79" ...
## $ GENDER : chr "Female" "Female" "Male" "Female" ...
## $ RACE : Ord.factor w/ 6 levels "AIAN"<"Asian"<..: 5 3 5 5 5 3 5 5 5 5 ...
## $ RELP : Ord.factor w/ 4 levels "In relationship"<..: 4 3 2 1 1 2 1 1 1 4 ...
## $ EDU : Ord.factor w/ 4 levels "<8"<"9-12"<"12-15"<..: 4 4 2 4 2 4 2 3 3 2 ...
## $ EMPL : Ord.factor w/ 4 levels "Employed"<"Unemployed"<..: 1 1 3 4 1 1 3 1 2 3 ...
## $ INCOME : Ord.factor w/ 5 levels "<$20K"<"$20K-<$35K"<..: 3 2 1 5 5 5 2 3 2 2 ...
## $ RENTHOME : chr "Own" "Rent" "Own" "Own" ...
## $ COMMUN : chr "Rural" "Urban" "Urban" "Urban" ...
## $ SMOKESTAT : Ord.factor w/ 4 levels "Daily smoker"<..: 4 1 4 4 3 4 3 4 3 4 ...
## $ ALCpa30 : chr "0" "1-9" "0" "0" ...
## $ EXERpa30 : chr "No" "Yes" "Yes" "Yes" ...
## $ PHYSHLTHpa30: chr "10-30" "0" "0" "0" ...
## $ MENTHLTHpa30: chr "0" "0" "0" "0" ...
## $ HLTHPLN : chr "Yes" "No" "Yes" "Yes" ...
## $ MEDICOST : chr "No" "Yes" "No" "No" ...
## $ CHECKUP : chr "<1" "1-<2" "<1" "<1" ...
## $ ASTHMA : chr "No" "No" "No" "No" ...
## $ PULMOND : chr "No" "No" "No" "No" ...
## $ ARTHRITIS : chr "Yes" "No" "No" "No" ...
## $ DEPRESSION : chr "No" "No" "No" "No" ...
## $ KIDNEY : chr "No" "No" "No" "No" ...
## $ DIABETE : chr "No" "No" "No" "No" ...
## $ CANCER : chr "Yes" "No" "No" "No" ...
## $ STROKE : chr "No" "No" "No" "No" ...
## $ RMVTETH : Ord.factor w/ 4 levels "0"<"1-5"<">= 6"<..: 2 1 2 1 1 1 3 1 3 2 ...
## $ LASTDENTV : Ord.factor w/ 4 levels "<1"<"1-<2"<"2-<5"<..: 1 2 3 1 1 1 1 1 1 2 ...
## $ BMICAT : Ord.factor w/ 4 levels "Underweight"<..: 2 4 3 2 3 3 3 4 3 4 ...
## $ MICHD : chr "No" "No" "No" "No" ...
Merging with table states
statesUS <- read.csv("state-geocodes-v2015.csv", header=TRUE, sep=",")
str(statesUS)
## 'data.frame': 51 obs. of 3 variables:
## $ ï..FIPS: int 9 23 25 33 44 50 34 36 42 17 ...
## $ Name : chr "Connecticut" "Maine" "Massachusetts" "New Hampshire" ...
## $ Region : chr "Northeast" "Northeast" "Northeast" "Northeast" ...
colnames(statesUS)
## [1] "ï..FIPS" "Name" "Region"
joined_BRFSS <- left_join(BRFSS_K, statesUS,
by = c("X_STATE" = "ï..FIPS"))
str(joined_BRFSS)
## 'data.frame': 279572 obs. of 36 variables:
## $ X_STATE : num 1 1 1 1 1 1 1 1 1 1 ...
## $ IDAY : chr "05" "12" "03" "10" ...
## $ SLEPTIM1 : num 7 5 6 8 6 7 10 6 6 6 ...
## $ GENHLTH : Ord.factor w/ 5 levels "Poor"<"Fair"<..: 4 3 5 5 4 3 5 3 4 2 ...
## $ MONTH : Ord.factor w/ 12 levels "Jan"<"Fev"<"Mar"<..: 1 1 1 1 1 1 1 1 1 1 ...
## $ AGE : chr "80+" "30-39" "60-69" "70-79" ...
## $ GENDER : chr "Female" "Female" "Male" "Female" ...
## $ RACE : Ord.factor w/ 6 levels "AIAN"<"Asian"<..: 5 3 5 5 5 3 5 5 5 5 ...
## $ RELP : Ord.factor w/ 4 levels "In relationship"<..: 4 3 2 1 1 2 1 1 1 4 ...
## $ EDU : Ord.factor w/ 4 levels "<8"<"9-12"<"12-15"<..: 4 4 2 4 2 4 2 3 3 2 ...
## $ EMPL : Ord.factor w/ 4 levels "Employed"<"Unemployed"<..: 1 1 3 4 1 1 3 1 2 3 ...
## $ INCOME : Ord.factor w/ 5 levels "<$20K"<"$20K-<$35K"<..: 3 2 1 5 5 5 2 3 2 2 ...
## $ RENTHOME : chr "Own" "Rent" "Own" "Own" ...
## $ COMMUN : chr "Rural" "Urban" "Urban" "Urban" ...
## $ SMOKESTAT : Ord.factor w/ 4 levels "Daily smoker"<..: 4 1 4 4 3 4 3 4 3 4 ...
## $ ALCpa30 : chr "0" "1-9" "0" "0" ...
## $ EXERpa30 : chr "No" "Yes" "Yes" "Yes" ...
## $ PHYSHLTHpa30: chr "10-30" "0" "0" "0" ...
## $ MENTHLTHpa30: chr "0" "0" "0" "0" ...
## $ HLTHPLN : chr "Yes" "No" "Yes" "Yes" ...
## $ MEDICOST : chr "No" "Yes" "No" "No" ...
## $ CHECKUP : chr "<1" "1-<2" "<1" "<1" ...
## $ ASTHMA : chr "No" "No" "No" "No" ...
## $ PULMOND : chr "No" "No" "No" "No" ...
## $ ARTHRITIS : chr "Yes" "No" "No" "No" ...
## $ DEPRESSION : chr "No" "No" "No" "No" ...
## $ KIDNEY : chr "No" "No" "No" "No" ...
## $ DIABETE : chr "No" "No" "No" "No" ...
## $ CANCER : chr "Yes" "No" "No" "No" ...
## $ STROKE : chr "No" "No" "No" "No" ...
## $ RMVTETH : Ord.factor w/ 4 levels "0"<"1-5"<">= 6"<..: 2 1 2 1 1 1 3 1 3 2 ...
## $ LASTDENTV : Ord.factor w/ 4 levels "<1"<"1-<2"<"2-<5"<..: 1 2 3 1 1 1 1 1 1 2 ...
## $ BMICAT : Ord.factor w/ 4 levels "Underweight"<..: 2 4 3 2 3 3 3 4 3 4 ...
## $ MICHD : chr "No" "No" "No" "No" ...
## $ Name : chr "Alabama" "Alabama" "Alabama" "Alabama" ...
## $ Region : chr "South" "South" "South" "South" ...
joined_BRFSS <- subset(joined_BRFSS, select = -c(X_STATE))
joined_BRFSS <- subset(joined_BRFSS, select = -c(Name))
colnames(joined_BRFSS)
## [1] "IDAY" "SLEPTIM1" "GENHLTH" "MONTH" "AGE"
## [6] "GENDER" "RACE" "RELP" "EDU" "EMPL"
## [11] "INCOME" "RENTHOME" "COMMUN" "SMOKESTAT" "ALCpa30"
## [16] "EXERpa30" "PHYSHLTHpa30" "MENTHLTHpa30" "HLTHPLN" "MEDICOST"
## [21] "CHECKUP" "ASTHMA" "PULMOND" "ARTHRITIS" "DEPRESSION"
## [26] "KIDNEY" "DIABETE" "CANCER" "STROKE" "RMVTETH"
## [31] "LASTDENTV" "BMICAT" "MICHD" "Region"
joined_BRFSS <- joined_BRFSS %>%
dplyr::select(IDAY,MONTH, Region,
AGE, GENDER, RACE, RELP, EDU,EMPL,INCOME,RENTHOME, COMMUN,
ALCpa30, SMOKESTAT, EXERpa30, SLEPTIM1,
GENHLTH,
MENTHLTHpa30, PHYSHLTHpa30,
HLTHPLN, CHECKUP, MEDICOST,
ARTHRITIS,ASTHMA,CANCER,DEPRESSION, DIABETE,KIDNEY,PULMOND, STROKE,
LASTDENTV,RMVTETH,
BMICAT,
MICHD)
str(joined_BRFSS)
## 'data.frame': 279572 obs. of 34 variables:
## $ IDAY : chr "05" "12" "03" "10" ...
## $ MONTH : Ord.factor w/ 12 levels "Jan"<"Fev"<"Mar"<..: 1 1 1 1 1 1 1 1 1 1 ...
## $ Region : chr "South" "South" "South" "South" ...
## $ AGE : chr "80+" "30-39" "60-69" "70-79" ...
## $ GENDER : chr "Female" "Female" "Male" "Female" ...
## $ RACE : Ord.factor w/ 6 levels "AIAN"<"Asian"<..: 5 3 5 5 5 3 5 5 5 5 ...
## $ RELP : Ord.factor w/ 4 levels "In relationship"<..: 4 3 2 1 1 2 1 1 1 4 ...
## $ EDU : Ord.factor w/ 4 levels "<8"<"9-12"<"12-15"<..: 4 4 2 4 2 4 2 3 3 2 ...
## $ EMPL : Ord.factor w/ 4 levels "Employed"<"Unemployed"<..: 1 1 3 4 1 1 3 1 2 3 ...
## $ INCOME : Ord.factor w/ 5 levels "<$20K"<"$20K-<$35K"<..: 3 2 1 5 5 5 2 3 2 2 ...
## $ RENTHOME : chr "Own" "Rent" "Own" "Own" ...
## $ COMMUN : chr "Rural" "Urban" "Urban" "Urban" ...
## $ ALCpa30 : chr "0" "1-9" "0" "0" ...
## $ SMOKESTAT : Ord.factor w/ 4 levels "Daily smoker"<..: 4 1 4 4 3 4 3 4 3 4 ...
## $ EXERpa30 : chr "No" "Yes" "Yes" "Yes" ...
## $ SLEPTIM1 : num 7 5 6 8 6 7 10 6 6 6 ...
## $ GENHLTH : Ord.factor w/ 5 levels "Poor"<"Fair"<..: 4 3 5 5 4 3 5 3 4 2 ...
## $ MENTHLTHpa30: chr "0" "0" "0" "0" ...
## $ PHYSHLTHpa30: chr "10-30" "0" "0" "0" ...
## $ HLTHPLN : chr "Yes" "No" "Yes" "Yes" ...
## $ CHECKUP : chr "<1" "1-<2" "<1" "<1" ...
## $ MEDICOST : chr "No" "Yes" "No" "No" ...
## $ ARTHRITIS : chr "Yes" "No" "No" "No" ...
## $ ASTHMA : chr "No" "No" "No" "No" ...
## $ CANCER : chr "Yes" "No" "No" "No" ...
## $ DEPRESSION : chr "No" "No" "No" "No" ...
## $ DIABETE : chr "No" "No" "No" "No" ...
## $ KIDNEY : chr "No" "No" "No" "No" ...
## $ PULMOND : chr "No" "No" "No" "No" ...
## $ STROKE : chr "No" "No" "No" "No" ...
## $ LASTDENTV : Ord.factor w/ 4 levels "<1"<"1-<2"<"2-<5"<..: 1 2 3 1 1 1 1 1 1 2 ...
## $ RMVTETH : Ord.factor w/ 4 levels "0"<"1-5"<">= 6"<..: 2 1 2 1 1 1 3 1 3 2 ...
## $ BMICAT : Ord.factor w/ 4 levels "Underweight"<..: 2 4 3 2 3 3 3 4 3 4 ...
## $ MICHD : chr "No" "No" "No" "No" ...
write.csv(joined_BRFSS,file = "BRFSS2018-V3.csv", row.names= FALSE)