Exploring the BRFSS data

Part 3: Exploratory data analysis

Research quesion 1:

q1 <- brfss2013 %>% select(genhlth, hlthpln1, income2) %>% str()

## 'data.frame':    491775 obs. of  3 variables:
##  $ genhlth : Factor w/ 5 levels "Excellent","Very good",..: 4 3 3 2 3 2 4 3 1 3 ...
##  $ hlthpln1: Factor w/ 2 levels "Yes","No": 1 1 1 1 1 1 1 1 1 1 ...
##  $ income2 : Factor w/ 8 levels "Less than $10,000",..: 7 8 8 7 6 8 NA 6 8 4 ...

All three variables here are categorical variables with multiple levels.

a1 <- as.data.frame(table(brfss2013$genhlth, brfss2013$hlthpln1))
a1 <- dcast(a1, Var1 ~ Var2, value.var="Freq")
a1

##        Var1    Yes    No
## 1 Excellent  77024  8126
## 2 Very good 144394 14168
## 3      Good 130269 19631
## 4      Fair  56882  9599
## 5      Poor  24389  3452

This dataframe shows the count of respondents in terms of their health status and healthcare coverage. I will convert these counts into proportions in order to visualize.

a1 <- a1 %>% mutate(Yes=round(prop.table(a1$Yes),2), 
                  No=round(prop.table(a1$No),2))
a1

##        Var1  Yes   No
## 1 Excellent 0.18 0.15
## 2 Very good 0.33 0.26
## 3      Good 0.30 0.36
## 4      Fair 0.13 0.17
## 5      Poor 0.06 0.06

a1 <- melt(a1, variable.name="Health_Plan", value.name="Percentage")

## Using Var1 as id variables

a1

##         Var1 Health_Plan Percentage
## 1  Excellent         Yes       0.18
## 2  Very good         Yes       0.33
## 3       Good         Yes       0.30
## 4       Fair         Yes       0.13
## 5       Poor         Yes       0.06
## 6  Excellent          No       0.15
## 7  Very good          No       0.26
## 8       Good          No       0.36
## 9       Fair          No       0.17
## 10      Poor          No       0.06

a1_plot <- ggplot(data = a1, aes(x=Health_Plan, y=Percentage))+geom_col(aes(fill=Var1), 
                                                           position = "fill")+
  scale_x_discrete()+coord_flip()
a1_plot +ggtitle("General health status by health care coverage") +
  geom_text(aes(label=scales::percent(Percentage), group=Var1), position = position_fill(vjust = 0.5), size=3)+theme(
    plot.title = element_text(hjust = 0.5, vjust = 3, size=16),
    axis.title.x = element_text(size = 14),
    axis.text.x = element_text(size = 12),
    axis.title.y = element_text(size = 14),
    axis.text.y = element_text(size = 12)
  )+scale_y_discrete(expand=c(0,0.02), labels=scales::percent)+labs(fill="Health status")

We see that over three-quarters (Good+Very good+Excellent) of the respondents reported having good health regardless of their healthcare coverage status.

Let’s explore the health care coverage in terms of income level.

a2 <- as.data.frame(table(brfss2013$hlthpln1, brfss2013$income2))
a2 <- dcast(a2, Var1 ~ Var2, value.var="Freq")
a2

##   Var1 Less than $10,000 Less than $15,000 Less than $20,000 Less than $25,000
## 1  Yes             18732             21143             26695             33312
## 2   No              6551              5558              8061              8295
##   Less than $35,000 Less than $50,000 Less than $75,000 $75,000 or more
## 1             41738             55575             61732          113023
## 2              7024              5824              3414            2771

names(a2)

## [1] "Var1"              "Less than $10,000" "Less than $15,000"
## [4] "Less than $20,000" "Less than $25,000" "Less than $35,000"
## [7] "Less than $50,000" "Less than $75,000" "$75,000 or more"

a2 <- a2 %>% rename(Less_than_10k = "Less than $10,000", 
                  Less_than_15k="Less than $15,000",
                  Less_than_20k="Less than $20,000",
                  Less_than_25k="Less than $25,000",
                  Less_than_35k="Less than $35,000",
                  Less_than_50k="Less than $50,000",
                  Less_than_75k="Less than $75,000",
                  k75k_or_more="$75,000 or more")
a2

##   Var1 Less_than_10k Less_than_15k Less_than_20k Less_than_25k Less_than_35k
## 1  Yes         18732         21143         26695         33312         41738
## 2   No          6551          5558          8061          8295          7024
##   Less_than_50k Less_than_75k k75k_or_more
## 1         55575         61732       113023
## 2          5824          3414         2771

The columns are renamed here.

a2 <- a2 %>% mutate(Less_than_10k = round(prop.table(Less_than_10k),2), 
                  Less_than_15k=round(prop.table(Less_than_15k),2),
                  Less_than_20k=round(prop.table(Less_than_20k),2),
                  Less_than_25k=round(prop.table(Less_than_25k),2),
                  Less_than_35k=round(prop.table(Less_than_35k),2),
                  Less_than_50k=round(prop.table(Less_than_50k),2),
                  Less_than_75k=round(prop.table(Less_than_75k),2),
                  k75k_or_more=round(prop.table(k75k_or_more),2)
                  )
a2

##   Var1 Less_than_10k Less_than_15k Less_than_20k Less_than_25k Less_than_35k
## 1  Yes          0.74          0.79          0.77           0.8          0.86
## 2   No          0.26          0.21          0.23           0.2          0.14
##   Less_than_50k Less_than_75k k75k_or_more
## 1          0.91          0.95         0.98
## 2          0.09          0.05         0.02

a2 <- melt(a2, variable.name="Income_level", value.name="Percentage")

## Using Var1 as id variables

Converted the frequency table into proportions.

a2_plot <- ggplot(data = a2, aes(x=Income_level, y=Percentage))+geom_col(aes(fill=Var1), 
                                                           position = "fill")+
  scale_x_discrete()+coord_flip()
a2_plot +ggtitle("Health care coverage by income level") +
  geom_text(aes(label=scales::percent(Percentage), group=Var1), position = position_fill(vjust = 0.5), size=3)+theme(
    plot.title = element_text(hjust = 0.5, vjust = 3, size=16),
    axis.title.x = element_text(size = 14),
    axis.text.x = element_text(size = 12),
    axis.title.y = element_text(size = 14),
    axis.text.y = element_text(size = 12)
  )+scale_y_discrete(expand=c(0,0.04), labels=scales::percent)+labs(fill="Healthcare plan")

We see that as the income level grows, so does the proportion of respondents having access to healthcare coverage with an exception in the \(15\)K-\(20\)K range. So, the correlation is evident.

Research quesion 2:

brfss2013$sleptim1 <- as.numeric(as.integer(brfss2013$sleptim1))
q2 <- brfss2013 %>% select(genhlth, sleptim1) %>% str()

## 'data.frame':    491775 obs. of  2 variables:
##  $ genhlth : Factor w/ 5 levels "Excellent","Very good",..: 4 3 3 2 3 2 4 3 1 3 ...
##  $ sleptim1: num  NA 6 9 8 6 8 7 6 8 8 ...

No wonder ‘sleep time’ is a numerical variable. Let’s have a look at some of the summary statistics.

b1<- brfss2013 %>% 
  filter(!(is.na(genhlth))) %>%
  filter(!(is.na(sleptim1))) %>%
  filter(!(sleptim1 > 16))

s1 <- b1%>% summarise(sleep_mean = mean(sleptim1), sleep_median = median(sleptim1), 
  sleep_sd = sd(sleptim1), sleep_min = min(sleptim1), sleepmax = max(sleptim1))
s1

##   sleep_mean sleep_median sleep_sd sleep_min sleepmax
## 1   7.042359            7  1.42982         0       16

For simplicity, I’ve removed data points representing sleep time more than \(16\) hours. The average sleep time is roughly \(7\) hours with a standard deviation of \(1.43\) hours.

box_plot <- ggplot(data = b1, aes(x=genhlth, y=sleptim1))+geom_jitter(aes(colour=genhlth))+geom_boxplot(alpha=0.85, na.rm=TRUE, 
                                                                                                               outlier.shape=NA) 
box_plot + xlab("General health status") +ylab("Sleep time")

## Warning in (function (kind = NULL, normal.kind = NULL, sample.kind = NULL) :
## non-uniform 'Rounding' sampler used

We can see that the median sleep time of the respondents who reported poor health is less than the other four groups. So, health status and sleep time seems to be correlated.

Research quesion 3:

q3 <- brfss2013 %>% select(bphigh4, diabete3, weight2, smokday2, exerany2) %>% str()

## 'data.frame':    491775 obs. of  5 variables:
##  $ bphigh4 : Factor w/ 4 levels "Yes","Yes, but female told only during pregnancy",..: 1 3 3 3 1 1 1 1 3 3 ...
##  $ diabete3: Factor w/ 4 levels "Yes","Yes, but female told only during pregnancy",..: 3 3 3 3 3 3 3 3 3 3 ...
##  $ weight2 : Factor w/ 570 levels "",".b","100",..: 154 30 63 31 169 128 9 1 139 73 ...
##  $ smokday2: Factor w/ 3 levels "Every day","Some days",..: 3 NA 2 NA 3 NA 3 1 NA NA ...
##  $ exerany2: Factor w/ 2 levels "Yes","No": 2 1 2 1 2 1 1 1 1 1 ...

head(brfss2013 %>% select(bphigh4, diabete3, weight2, smokday2, exerany2))

##   bphigh4 diabete3 weight2   smokday2 exerany2
## 1     Yes       No     250 Not at all       No
## 2      No       No     127       <NA>      Yes
## 3      No       No     160  Some days       No
## 4      No       No     128       <NA>      Yes
## 5     Yes       No     265 Not at all       No
## 6     Yes       No     225       <NA>      Yes

brfss2013$weight2 <- as.numeric(as.factor(brfss2013$weight2))

Changed weight to a numeric variable.

s2 <- brfss2013%>% summarise(wt_mean = mean(weight2), wt_median = median(weight2), 
  wt_sd = sd(weight2), wt_min = min(weight2), wt_max = max(weight2))
s2

##    wt_mean wt_median    wt_sd wt_min wt_max
## 1 80.21934        73 59.64982      1    570

It looks like not everybody reported their weights in pounds.

Let’s explore the relationship between high blood pressure and smoking habits.

c1 <- brfss2013 %>% 
  filter(!(is.na(smokday2)))

c2 <- as.data.frame(table(c1$smokday2, c1$bphigh4))
c2 <- dcast(c2, Var1 ~ Var2, value.var="Freq")
c2 <- c2[,-c(3,5)]
c2

##         Var1   Yes    No
## 1  Every day 20645 33489
## 2  Some days  8062 13005
## 3 Not at all 67848 67710

c2 <- c2 %>% mutate(Yes = round(prop.table(Yes),2), 
                  No=round(prop.table(No),2))
c2

##         Var1  Yes   No
## 1  Every day 0.21 0.29
## 2  Some days 0.08 0.11
## 3 Not at all 0.70 0.59

c2 <- melt(c2, variable.name="high.bp", value.name="Percentage")

## Using Var1 as id variables

c2

##         Var1 high.bp Percentage
## 1  Every day     Yes       0.21
## 2  Some days     Yes       0.08
## 3 Not at all     Yes       0.70
## 4  Every day      No       0.29
## 5  Some days      No       0.11
## 6 Not at all      No       0.59

c2_plot <- ggplot(data = c2, aes(x=high.bp, y=Percentage, fill=Var1))+geom_col(position = "dodge", colour="black")

c2_plot +ggtitle("High bloodpressure vs smoking habits") +
  geom_text(aes(label=scales::percent(Percentage)), vjust = 1.5, position = position_dodge(.9), size=5, colour="white")+theme(
    plot.title = element_text(hjust = 0.5, vjust = 3, size=16),
    axis.title.x = element_text(size = 14),
    axis.text.x = element_text(size = 12),
    axis.title.y = element_text(size = 14),
    axis.text.y = element_text(size = 12)
  )+scale_y_discrete(expand=c(0,0.04), labels=scales::percent)+xlab("High blood pressure") +labs(fill="Smoking")

Seventy percent (\(70\)%) of the respondents reporting high blood pressure do not smoke at all.

Diabetes vs Exercise

d1 <- brfss2013 %>% 
  filter(!(is.na(exerany2)))

d2 <- as.data.frame(table(d1$diabete3, d1$exerany2))
d2 <- dcast(d2, Var1 ~ Var2, value.var="Freq")
d2 <- d2[-c(2,4),]
d2

##   Var1    Yes    No
## 1  Yes  35003 23434
## 3   No 288417 97793

d2 <- d2 %>% mutate(Yes = round(prop.table(Yes),2), 
                  No=round(prop.table(No),2))
d2

##   Var1  Yes   No
## 1  Yes 0.11 0.19
## 3   No 0.89 0.81

d2 <- melt(d2, variable.name="Exercise", value.name="Percentage")

## Using Var1 as id variables

d2

##   Var1 Exercise Percentage
## 1  Yes      Yes       0.11
## 2   No      Yes       0.89
## 3  Yes       No       0.19
## 4   No       No       0.81

d2_plot <- ggplot(data = d2, aes(x=Exercise, y=Percentage, fill=Var1))+geom_col(position = "dodge", colour="black")

d2_plot +ggtitle("Diabetes vs Exercise") +
  geom_text(aes(label=scales::percent(Percentage)), vjust = 1.5, position = position_dodge(.9), size=5, colour="white")+theme(
    plot.title = element_text(hjust = 0.5, vjust = 3, size=16),
    axis.title.x = element_text(size = 14),
    axis.text.x = element_text(size = 12),
    axis.title.y = element_text(size = 14),
    axis.text.y = element_text(size = 12)
  )+scale_y_discrete(expand=c(0,0.04), labels=scales::percent)+xlab("Exercise")+labs(fill="Diabetes")

\(1\) out of \(10\) respondents who do physical activity reported having diabetes. On the other hand, \(1\) in \(5\) who do not do any physical activity reported having diabetes. So, the diabetes positivity rate among respondents who do not do physical activity is almost twice the rate of respondents who are physically active. Although the correlation between diabetes and exercise is somewhat obvious, we cannot conclude that lack of exercise causes diabetes.

Exploring the BRFSS data

Setup

Load packages

Load data

Part 1: Data

Part 2: Research questions

Part 3: Exploratory data analysis

Let’s explore the health care coverage in terms of income level.

Let’s explore the relationship between high blood pressure and smoking habits.

Diabetes vs Exercise