The Behavioral Risk Factor Surveillance System (BRFSS) is an annual health surveillance system that conducts telephone surveys on preventive health practices and risk factors linked to chronic diseases and injuries. The dataset contains 491,775 respondents aged 18 years and older from all 50 US states as well as the District of Columbia, Puerto Rico, Guam, American Samoa, the Federated States of Micronesia, and Palau.
Because random sampling was employed in the data collection, the statistical results should be generalisable to the entire population. To obtain a probability sample of all households, disproportionate stratified sampling (DSS) was used for the landline sample, and random sampling was used for cellular telephone numbers. Potential sources of bias include non-response bias, as some intended respondents may not possess a landline or cellphone, as well as incomplete responses, which may reduce the effective sample size.
The statistical results cannot establish causality, as respondents were not randomly assigned to treatment and control groups; they were simply asked to answer health-related questions in order to identify emerging health problems.
Research question 1: What are the health issues across all income levels in the United States?
Research question 2: Is there any association between people diagnosed with high blood pressure and age?
Research question 3: Is there correlation among blood pressure, blood cholesterol, heart attack, stroke, weight, smoking, alcohol consumption, and exercise?
Research question 1: What are the health issues across all income levels in the United States?
# Regroup the income factor levels: the three lowest labels are merged into
# "<$35,000", the next two are renamed, and the "Don’t know/Not sure/Missing"
# label is mapped to a level literally named "NA". The order of the list
# entries defines the order of the resulting levels.
levels(brfss2013$X_incomg) <- list(
  "<$35,000" = c(
    "Less than $15,000",
    "$15,000 to less than $25,000",
    "$25,000 to less than $35,000"
  ),
  "35,000-49,999" = "$35,000 to less than $50,000",
  "50,000+" = "$50,000 or more",
  "NA" = "Don’t know/Not sure/Missing"
)
# Build `df_out`: one row per income group and one column per condition,
# holding the percentage of "Yes" answers among the respondents of that
# income group who answered either "Yes" or "No".
X_incomg <- c("<$35,000", "35,000-49,999", "50,000+")
df_out <- data.frame(X_incomg)

# Column codes of the conditions to summarise.
columns <- c(
  "addepev2", "diabete3", "bphigh4", "toldhi2", "cvdinfr4", "cvdcrhd4",
  "cvdstrk3", "asthma3", "chcscncr", "chccopd1", "havarth3", "chckidny"
)

for (ii in columns) {
  print(ii)

  df_sel <- brfss2013 %>%
    select(X_incomg, answer = all_of(ii)) %>%
    # `%in%` tests every row against both values. Comparing with
    # `== c("Yes", "No")` recycles the pair along the rows, so each row is
    # checked against only one of the two values and roughly half of the
    # valid answers are silently dropped.
    filter(!is.na(X_incomg), X_incomg != "NA", answer %in% c("Yes", "No")) %>%
    count(X_incomg, answer) %>%
    group_by(X_incomg) %>%
    mutate(pct = n / sum(n) * 100) %>%
    ungroup() %>%
    filter(answer == "Yes") %>%
    # Plain character join key, so the join does not depend on factor levels.
    transmute(X_incomg = as.character(X_incomg), pct)

  # Name the percentage column after the condition it describes.
  names(df_sel)[names(df_sel) == "pct"] <- ii

  df_out <- df_out %>%
    left_join(df_sel, by = "X_incomg")
}
# Replace the column codes in `df_out` with readable disease names, keeping
# the income group column first.
df_out_name <- select(
  df_out,
  X_incomg,
  `Depressive Disorder` = addepev2,
  `Diabetes` = diabete3,
  `High blood pressure` = bphigh4,
  `High blood cholesterol` = toldhi2,
  `Heart attack` = cvdinfr4,
  `Heart disease` = cvdcrhd4,
  `Stroke` = cvdstrk3,
  `Asthma` = asthma3,
  `Skin cancer` = chcscncr,
  `Pulmonary disease` = chccopd1,
  `Arthritis` = havarth3,
  `Kidney disease` = chckidny
)

# Long format for plotting: one row per (income group, disease) pair.
plot_out <- pivot_longer(
  df_out_name,
  cols = !X_incomg,
  names_to = "disease",
  values_to = "percentage"
)
# Horizontal dodged bar chart of `plot_out`: the `percentage` column on the
# x axis, one group of bars per `disease`, filled by income group.
plot_out %>%
  ggplot(aes(x = percentage, y = disease)) +
  geom_col(
    aes(fill = X_incomg),
    colour = "black",
    position = position_dodge(width = .5),
    alpha = 1
  ) +
  theme_economist() +
  labs(
    title = "Health issues across all income levels in the US",
    # The subtitle deliberately keeps the trailing newline of the original
    # string literal.
    subtitle = "What are the health issues across all income levels in the United States?\n",
    caption = "Source: Behavioral Risk Factor Surveillance System 2013",
    fill = "Income Levels (USD)",
    x = "Percentage",
    y = "Types of Disease"
  )

Research question 2: Is there any association between people diagnosed with high blood pressure and age?
# Stacked bar chart: for each age group, the percentage of "Yes" and "No"
# answers to the high blood pressure question (`bphigh4`).
brfss2013 %>%
  # `%in%` keeps every row whose answer is "Yes" or "No". Comparing with
  # `== c("Yes", "No")` recycles the pair along the rows, so each row is
  # checked against only one of the two values and roughly half of the valid
  # answers are silently dropped.
  filter(
    bphigh4 %in% c("Yes", "No"),
    !is.na(X_age_g),
    X_age_g != "NA"
  ) %>%
  count(bphigh4, X_age_g) %>%
  group_by(X_age_g) %>%
  # Percentages are computed within each age group, so every bar sums to 100.
  mutate(X_age_g_pct2 = n / sum(n) * 100) %>%
  ungroup() %>%
  ggplot(aes(x = X_age_g, y = X_age_g_pct2)) +
  geom_col(aes(fill = bphigh4)) +
  theme_economist() +
  scale_fill_brewer(palette = "RdYlGn") +
  labs(
    title = "High blood pressure across age groups",
    subtitle = "Is there any association between people diagnosed with high blood pressure and age?",
    caption = "Source: Behavioral Risk Factor Surveillance System 2013",
    fill = "Have you EVER been told that you have high blood pressure",
    x = "",
    y = "Percentage"
  )

Research question 3: Is there correlation among blood pressure, blood cholesterol, heart attack, stroke, weight, smoking, alcohol consumption, and exercise?
# Correlation matrix of selected conditions and risk factors, drawn with
# corrplot.

# Keep only the columns of interest (in the order they appear in the dataset).
vars <- names(brfss2013) %in% c(
  "bphigh4", "weight2", "exerany2", "cvdstrk3",
  "drnk3ge5", "smoke100", "cvdinfr4", "toldhi2"
)
selected_df <- brfss2013[vars]

# Encode the yes/no questions as 1 for "Yes" and 0 for any other non-missing
# answer; missing answers stay NA.
yes_no_cols <- c(
  "bphigh4", "exerany2", "cvdstrk3", "cvdinfr4", "smoke100", "toldhi2"
)
selected_df[yes_no_cols] <- lapply(
  selected_df[yes_no_cols],
  function(answer) ifelse(answer == "Yes", 1, 0)
)

# as.numeric() applied directly to a factor returns the internal level codes,
# not the values shown in the labels, so a factor is converted via its labels.
# Labels that are not numbers become NA (with a coercion warning) and are
# removed by na.omit() below.
# NOTE(review): weight2 may also contain coded values (e.g. metric or
# "don't know" codes) - confirm against the BRFSS codebook.
weight_raw <- selected_df$weight2
if (is.factor(weight_raw)) {
  weight_raw <- as.character(weight_raw)
}
selected_df$weight2 <- as.numeric(weight_raw)

# cor() needs complete rows.
selected_df <- na.omit(selected_df)

# Readable names for the plot labels; this also fixes the variable order.
selected_df_name <- selected_df %>%
  select(
    "High blood pressure" = "bphigh4",
    "High blood cholesterol" = "toldhi2",
    "Heart attack" = "cvdinfr4",
    "Stroke" = "cvdstrk3",
    "Weight" = "weight2",
    "Smoking" = "smoke100",
    "Alcohol intake" = "drnk3ge5",
    "Exercise" = "exerany2"
  )

corr_matrix <- cor(selected_df_name)
corrplot(
  corr_matrix,
  method = "color",
  addCoef.col = "grey",
  type = "upper",
  number.cex = 0.75,
  tl.cex = 0.65
)