Descriptive analysis-Univariate and bivariate analysis

library(tidyverse)

## -- Attaching packages ----------------------------------------------------------------------------------------------------- tidyverse 1.3.0 --

## v ggplot2 3.3.0     v purrr   0.3.3
## v tibble  2.1.3     v dplyr   0.8.4
## v tidyr   1.0.2     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.4.0

## Warning: package 'ggplot2' was built under R version 3.6.3

## -- Conflicts -------------------------------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

library(ggplot2)
library(ggpol)
library(DataExplorer)
library(knitr)
library(corrplot)

## Warning: package 'corrplot' was built under R version 3.6.3

## corrplot 0.84 loaded

library(scales)

## 
## Attaching package: 'scales'

## The following object is masked from 'package:purrr':
## 
##     discard

## The following object is masked from 'package:readr':
## 
##     col_factor

BRFSS <- read.csv(file="C:/Users/user/Desktop/Capstone Project/Submission/R codes/BRFSS2018-V3.csv", header = TRUE, sep = ",")

plot_intro(BRFSS)

str(BRFSS)

## 'data.frame':    279572 obs. of  34 variables:
##  $ IDAY        : int  5 12 3 10 9 10 10 3 11 12 ...
##  $ MONTH       : Factor w/ 12 levels "Apr","Aug","Dec",..: 5 5 5 5 5 5 5 5 5 5 ...
##  $ Region      : Factor w/ 4 levels "Midwest ","Northeast",..: 3 3 3 3 3 3 3 3 3 3 ...
##  $ AGE         : Factor w/ 7 levels "18-29","30-39",..: 7 2 5 6 4 3 7 4 3 6 ...
##  $ GENDER      : Factor w/ 2 levels "Female","Male": 1 1 2 1 2 1 2 1 1 1 ...
##  $ RACE        : Factor w/ 6 levels "AIAN","Asian",..: 6 3 6 6 6 3 6 6 6 6 ...
##  $ RELP        : Factor w/ 4 levels "In relationship",..: 4 3 2 1 1 2 1 1 1 4 ...
##  $ EDU         : Factor w/ 4 levels "<8","12-15","16+",..: 3 3 4 3 4 3 4 2 2 4 ...
##  $ EMPL        : Factor w/ 4 levels "Employed","Occupations",..: 1 1 3 2 1 1 3 1 4 3 ...
##  $ INCOME      : Factor w/ 5 levels "$20K-<$35K","$35K-<$50K",..: 2 1 5 4 4 4 1 2 1 1 ...
##  $ RENTHOME    : Factor w/ 3 levels "Other","Own",..: 2 3 2 2 2 3 2 2 2 2 ...
##  $ COMMUN      : Factor w/ 3 levels "Rural","Suburban",..: 1 3 3 3 1 3 3 2 2 3 ...
##  $ ALCpa30     : Factor w/ 3 levels "0","1-9","10-30": 1 2 1 1 2 1 1 3 2 1 ...
##  $ SMOKESTAT   : Factor w/ 4 levels "Daily smoker",..: 3 1 3 3 2 3 2 3 2 3 ...
##  $ EXERpa30    : Factor w/ 2 levels "No","Yes": 1 2 2 2 1 2 1 2 2 1 ...
##  $ SLEPTIM1    : int  7 5 6 8 6 7 10 6 6 6 ...
##  $ GENHLTH     : Factor w/ 5 levels "Excellent","Fair",..: 5 3 1 1 5 3 1 3 5 2 ...
##  $ MENTHLTHpa30: Factor w/ 3 levels "0","1-9","10-30": 1 1 1 1 1 1 1 2 2 1 ...
##  $ PHYSHLTHpa30: Factor w/ 3 levels "0","1-9","10-30": 3 1 1 1 1 2 1 2 2 1 ...
##  $ HLTHPLN     : Factor w/ 2 levels "No","Yes": 2 1 2 2 2 2 2 2 1 2 ...
##  $ CHECKUP     : Factor w/ 4 levels "<1","1-<2","2-<5",..: 1 2 1 1 1 1 1 1 1 1 ...
##  $ MEDICOST    : Factor w/ 2 levels "No","Yes": 1 2 1 1 1 1 1 1 2 1 ...
##  $ ARTHRITIS   : Factor w/ 2 levels "No","Yes": 2 1 1 1 1 1 1 1 2 2 ...
##  $ ASTHMA      : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 2 1 1 ...
##  $ CANCER      : Factor w/ 2 levels "No","Yes": 2 1 1 1 1 1 1 2 1 1 ...
##  $ DEPRESSION  : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 2 1 ...
##  $ DIABETE     : Factor w/ 2 levels "No","Yes": 1 1 1 1 2 1 1 1 1 2 ...
##  $ KIDNEY      : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
##  $ PULMOND     : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 2 1 1 1 1 ...
##  $ STROKE      : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
##  $ LASTDENTV   : Factor w/ 4 levels "<1","1-<2","2-<5",..: 1 2 3 1 1 1 1 1 1 2 ...
##  $ RMVTETH     : Factor w/ 4 levels ">= 6","0","1-5",..: 3 2 3 2 2 2 1 2 1 3 ...
##  $ BMICAT      : Factor w/ 4 levels "Normal","Obese",..: 1 2 3 1 3 3 3 2 3 2 ...
##  $ MICHD       : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...

library(ggridges)
library(ggplot2)

1.2. Building Proportion Tables Example

MICHFreq <- table(BRFSS$MICHD)
MICHFreq

## 
##     No    Yes 
## 254978  24594

prop.table(MICHFreq)

## 
##         No        Yes 
## 0.91202982 0.08797018

Graphical data analysis

library(scales)
library(tidyverse)
library(pander)

Defining some sets of color

col2 <- c("#A9A9A9", "#FF8C00")
col3 <- c("#003f5c", "#bc5090","#ffa600")
col4 <- c("#FB6542","#3F681C","#4D585B", "#375E97")
col5 <- c("#BD3E85","#3F681C","#FFBB00", "#FB6542", "#375E97")
col6 <- c("#0072BB", "#FF4C3B","#FFD034","#51A39D","#06425C", "#C6C8CA")
col7 <- c("#0072BB", "#FF4C3B","#FFD034","#51A39D","#06425C", "#C6C8CA","#814374")

The outcome (MICHD)

The characteristics of the sample were examined by calculating frequencies of categorical variables and the mean value of a continuous variable (sleep duration is only a numeric attribute in the data).

The outcome of interest, MICHD, was defined as a “Yes” response to having been “ever diagnosed with” at least one of the following: myocardial infarction/heart attack, angina or coronary heart disease. Of the 279572 individuals included in this analysis, 24594 respondents had MICHD

BRFSS %>% 
  group_by(MICHD) %>% 
  dplyr::count() %>% 
  pander()

MICHD	n
No	254978
Yes	24594

df_MICHD <- BRFSS %>% 
  group_by(MICHD) %>% 
  dplyr::count() %>% 
  ungroup()%>% 
  arrange(desc(MICHD)) %>%
  mutate(Percentage = round(n/sum(n),4)*100,
         lab.pos = cumsum(Percentage)-0.5*Percentage)

ggplot(data = df_MICHD, 
       aes(x = "", 
           y = Percentage, 
           fill = MICHD))+
  geom_bar(stat = "identity")+
  coord_polar("y") +
  geom_text(aes(y = lab.pos, 
                label = paste(Percentage,"%", sep = "")), 
            col = "blue",
            size = 6) +
  theme( legend.box = "horizontal",
         legend.text = element_text(color = "black", size = 12))+
  theme_void() +
  scale_fill_manual(values = col2)

Region

library(treemapify)

BRFSS %>% 
  group_by(Region) %>%
  dplyr::count() %>% 
  pander()

Region	n
Midwest	76268
Northeast	57288
South	83856
West	62160

df_region <- BRFSS %>% 
  group_by(Region) %>% 
  dplyr::count() %>% 
  ungroup()%>% 
  arrange(desc(Region)) %>%
  mutate(percentage = round(n/sum(n),4)*100,
         lab.pos = cumsum(percentage)-.5*percentage)

ggplot(data = df_region, 
       aes(x = "", 
           y = percentage, 
           fill = Region))+
  geom_bar(stat = "identity")+
  coord_polar("y") +
  geom_text(aes(y = lab.pos, 
                label = paste(percentage,"%", sep = "")), col = "white", size = 5) +
  scale_fill_manual(values = col4) +
  theme_void() +
  theme(legend.title = element_text(color = "black", size = 14),
        legend.text = element_text(color = "black", size = 14))

Demographics (B) B1. AGE B2. GENDER B3. X_RACE B4. RELPT B5. EDU B6. EMPL B7. INCOME1 B8. RENTHOME

Defining theme and geomtext for all bar charts

theme_bar <- theme(axis.text = element_text(size = 12),
                   axis.title = element_text(size = 12, face = "bold"),
                   plot.caption = element_text(color = "grey44",
                                               size = 12,
                                               face = "italic"),
                   legend.text = element_text(colour="black", size = 12))

geomtext_bar <- geom_text(aes(label = paste0(Percentage,"%")), 
                               vjust = -0.3,
                               size = 4,
                               fontface = "bold") 


theme_point <- theme(axis.text = element_text(size = 12, color = "black"),
                    axis.title = element_text(size = 12, face = "bold"),
                    plot.caption = element_text(color = "grey", size = 12, face = "italic"),
                    legend.text = element_text(colour="black", size = 12),
                    legend.position="right")


themebar_stacked <-  theme(axis.text = element_text(size = 12, color = "black"),
                           axis.title = element_text(size = 12, face = "bold"),
                           plot.caption = element_text(color = "grey", size = 12, face = "italic"),
                           legend.text = element_text(colour="black", size = 12),
                           legend.position="right")

themebar_facet <-   theme(axis.text = element_text(size = 12, color = "black"),
        axis.title = element_text(size = 12, face= "bold"),
        plot.title = element_text(color = "black", size = 12, face = "bold.italic"),
        plot.caption = element_text(color = "grey", size = 12, face = "italic"),
        strip.text = element_text(size = 12, color = "black", face = "bold.italic"),
        strip.background = element_rect(fill = "slategray1"),
        legend.text = element_text(colour="black", size = 20),
        legend.position = "none")

B1.AGE

ggplot(BRFSS, aes(x = AGE)) +
  geom_bar(aes(y = (..count..)/sum(..count..)),
           fill = "royalblue4", 
           width = 0.75) +
  scale_y_continuous(labels = percent)+
  geom_text(size = 4.5,
            color= "black",
            aes(y = ((..count..)/sum(..count..)),
                label = scales::percent((..count..)/sum(..count..))),
            stat = "count",
            vjust = -0.3) +
  labs(x = "Age group",
       y = "Percent of respondents",
       caption = "Data source: BRFSS2018") + 
  theme_bar

Other way to make the same graph

BRFSS %>% 
  group_by(AGE) %>% 
  dplyr::summarise(Count =  n()) %>% 
  mutate(Agegroup = factor(AGE),
         Percentage = round(Count*100/sum(Count),2),
         label =  percent(Percentage/100)) %>% 
  ggplot(aes(x = AGE, 
           y = Percentage)) +
  geom_bar(stat = "identity",
           fill = "royalblue4",
           width = 0.80) +
  geomtext_bar  +
  labs(x = "Age group",
       y = "Percent of respondents (%)",
       caption = "Data source: BRFSS2018") +
  theme_bar

More than 60% of participants were older than 50 years

library(ggimage)
library(extrafont)

## Registering fonts with R

loadfonts(device = "win")

df1_age <- data.frame(grp = c("18-29","30-39","40-49","50-59","60-69","70-79","80+"),
                  y = 7:1, 
                  img = rep("./man.png",7), 
                  freq = c(10.52,12.32,13.71,19.05,22.61,15.30,6.48))

df2_age <- df1_age[rep(seq_len(nrow(df1_age)), df1_age$freq), 1:3]

df2_age <- df2_age %>% group_by(grp) %>% mutate(x = row_number())

ggplot(df2_age) + 
  geom_image(aes(x = x, 
                 y = y, 
                 image = img), 
             size = 0.045) + 
  theme_void() +
  geom_text(data = df1_age, 
            aes(y = y + 0.1, 
                label = paste(freq,"%", sep = "")), 
            x = 24,
            fontface = 2, 
            size = 4.5, 
            color = "blue") +
  geom_text(data = df1_age,
            aes(y = y + 0.2,
                label = grp), 
            x = -0.3, 
            fontface = 2, 
            size = 4.5, 
            color = "black") +
  xlim(-1,max(df1_age$freq)+2) + 
  ylim(0,max(df1_age$y)+1)

B2. GENDER

tbl_gender <- table(BRFSS$GENDER)
tbl_gender

## 
## Female   Male 
## 145904 133668

prop.table(tbl_gender)

## 
##    Female      Male 
## 0.5218835 0.4781165

library(png)
library(plotly)

## 
## Attaching package: 'plotly'

## The following object is masked from 'package:ggplot2':
## 
##     last_plot

## The following object is masked from 'package:stats':
## 
##     filter

## The following object is masked from 'package:graphics':
## 
##     layout

genderselection <- read.table(text="
  Gender Freq
     F   52
     M   48
", header=T)
pcts <- round(prop.table(genderselection$Freq)*100)

# Load png file with man and woman
img <- readPNG("manfemale.png")
h <- dim(img)[1]
w <- dim(img)[2]

# Find the rows where feet starts and head ends
pos1 <- which(apply(img[,,1], 1, function(y) any(y==1)))
mn1 <- min(pos1)
mx1 <- max(pos1)
pospctM <- round((mx1-mn1)*pcts[2]/100+mn1)
pospctF <- round((mx1-mn1)*pcts[1]/100+mn1)

# Fill bodies with a different color according to percentages
imgmtx <- img[h:1,,1]
whitemtx <- (imgmtx==1)
colmtx <- matrix(rep(FALSE,h*w),nrow=h)
midpt <- round(w/2)-10
colmtx[mx1:pospctM,1:midpt] <- TRUE
colmtx[mx1:pospctF,(midpt+1):w] <- TRUE
imgmtx[whitemtx & colmtx] <- 0.5

# Plot matrix using heatmap and print text
labs <- c(paste0(pcts[2], "% Males"),paste0(pcts[1], "% Females"))
ax <- list(ticks = '', 
           showticklabels = FALSE, 
           showgrid = FALSE, 
           zeroline = FALSE)
p <- plot_ly(z = imgmtx, 
             showscale = FALSE, 
             type = 'heatmap', 
             width = 500,  
             height = 500) %>%
     add_text(x = c(100,250), y = c(20,20), type = 'heatmap', mode = "text",
        text = labs, showlegend = FALSE, textfont = list(size = 20, color = "#FFFFFF"), inherit = FALSE) %>%
     layout(xaxis = ax,  yaxis = ax)  
p

B3. RACE

BRFSS$RACE <- factor(BRFSS$RACE,
                      levels = c("AIAN",
                                 "Asian",
                                 "Black",
                                 "Hispanic",
                                 "White",
                                 "Others"), ordered = TRUE)

BRFSS %>% 
  group_by(RACE) %>% 
  dplyr::summarise(Count =  n()) %>% 
  mutate(Agegroup = factor(RACE),
         Percentage = round(Count*100/sum(Count),2),
         label =  percent(Percentage/100)) %>% 
  ggplot(aes(x = RACE, 
           y = Percentage)) +
  geom_bar(stat = "identity",
           color = "blue", 
           fill = "royalblue4",
           width = 0.75) +
  geomtext_bar +
  labs(x = "Race",
       y = "Percent of respondents (%)",
       caption = "Data source: BRFSS2018") +
  theme_bar

The majority of participants are white, followed by black, with very few Asian or American Indians

Alternative graphs

library(treemapify)
library(treemap)

## Warning: package 'treemap' was built under R version 3.6.3

plotdata2 <- BRFSS %>%
  dplyr::count(RACE)

ggplot(plotdata2, 
       aes(fill = RACE, 
           area = n,
           label = RACE)) +
  
  geom_treemap() + 
  geom_treemap_text(colour = "dark blue",
                    place = "centre",
                    fontface = "italic") +
  labs(title = "Race of respondents") +
  
  scale_fill_brewer(palette = "RdYlBu")+
  theme(legend.position = "none")

Regroup AIAN, Asian, Multiracial and other into a group OTHERs

df2_race <- data.frame(
  RACE = c("White", "Black","Hispanic", "Others"),
  Percentage = c(78.96, 7.73, 6.69, 6.62))

df3_race <- df2_race %>%
  arrange(desc(RACE)) %>%
  mutate(lab.pos = cumsum(Percentage),
         centre =  lab.pos - 0.5*Percentage)

ggplot(data = df3_race, 
       aes(x = "", 
           y = Percentage, 
           fill = RACE))+
  geom_bar(stat = "identity")+
  coord_polar("y") +
  geom_text(aes(y = centre, 
                label = paste(Percentage,"%", sep = "")), 
            col = "white") +
  theme_void() +
  scale_fill_manual(values = col4)

library(ggforce) # for 'geom_arc_bar'

## 
## Attaching package: 'ggforce'

## The following objects are masked from 'package:ggpol':
## 
##     geom_circle, GeomCircle, stat_circle, StatCircle

BRFSS %>% 
  group_by(RACE) %>% 
  dplyr::count()

## # A tibble: 6 x 2
## # Groups:   RACE [6]
##   RACE          n
##   <ord>     <int>
## 1 AIAN       4652
## 2 Asian      5670
## 3 Black     21602
## 4 Hispanic  18717
## 5 White    220761
## 6 Others     8170

df3_race <- data.frame(
  RACE = c("White", "Black","Hispanic", "Others"),
  Value = c(220761, 21602   , 18717, 18492))

df3_race <- df3_race %>% 
  mutate(end = 2 * pi * cumsum(Value)/sum(Value),
         start = lag(end, default = 0),
         middle = 0.5 * (start + end),
         hjust = ifelse(middle > pi, 1, 0),
         vjust = ifelse(middle < pi/2 | middle > 3 * pi/2, 0, 1))

df3_race$Label <- paste(round(((df3_race$Value/sum(df3_race$Value))*100),2),"%")

ggplot(df3_race) + 
  geom_arc_bar(aes(x0 = 0, 
                   y0 = 0, 
                   r0 = 0, 
                   r = 1,
                   start = start, 
                   end = end, 
                   fill = RACE)) +
  scale_fill_manual(values = col4) +
  geom_text(aes(x = 1.05 * sin(middle), 
                y = 1.05 * cos(middle), 
                label = Label,
                hjust = hjust, 
                vjust = vjust)) +
  coord_fixed() +
  scale_x_continuous(limits = c(-1.5, 1.5),  # Adjust so labels are not cut off
                     name = "", breaks = NULL, labels = NULL) +
  scale_y_continuous(limits = c(-1.25,1.25),      # Adjust so labels are not cut off
                     name = "", breaks = NULL, labels = NULL)

xRace <- c(78.96, 7.72, 6.69, 6.63)
labels <- c("White", "Black","Hispanic", "Others")
pct <- xRace/sum(xRace)*100
lbls <- paste(labels, pct) # add percents to labels
lbls <- paste(lbls,"%",sep="") # ad % to labels
pie(xRace,labels = lbls, col= col4)

AIAN 1.66 Asian 2.03 Black 7.73 Hispanic 6.69 White 78.96 Others 2.92

xRace2 <- c(78.96, 7.73, 6.69,1.66, 2.03,  2.92)
labels2 <- c("White", "Black", "Hispanic","AIAN", "Asian",  "Others")
pct2 <- round(xRace2/sum(xRace2)*100,2)
lbls2 <- paste(labels2, pct2) # add percents to labels
lbls2 <- paste(lbls2,"%",sep="") # ad % to labels
pie(xRace,labels = lbls2, col= col6)

B5. EDUCATION(EDU)

BRFSS$EDU <- factor(BRFSS$EDU,
                      levels = c("<8",
                                 "9-12",
                                 "12-15",
                                 "16+"), ordered = TRUE)

BRFSS %>% 
  group_by(EDU) %>% 
  dplyr::summarise(Count =  n()) %>% 
  mutate(EDUgroup = factor(EDU),
         Percentage = round(Count*100/sum(Count),2),
         label =  percent(Percentage/100)) %>% 
  ggplot(aes(x = EDU , 
           y = Percentage)) +
  geom_bar(stat = "identity",
           color = "blue", 
           fill = "royalblue4",
           width = 0.5) +
  geomtext_bar +
  labs(x = "Years of education",
       y = "Percent of respondents (%)",
       caption = "Data source: BRFSS2018") +
  theme_bar

library(echarts4r)

## Warning: package 'echarts4r' was built under R version 3.6.3

## Welcome to echarts4r
## 
## Docs: echarts4r.john-coene.com

df2_edu <- data.frame(
  x = c("<8","9-12", "12-15", "16+"),
  y = c(1.64, 29.41, 27.99, 40.95))

df2_edu %>% 
  e_charts(x) %>% 
  e_pictorial(y, symbol = paste0("image://","https://1.bp.blogspot.com/-klwxpFekdEQ/XOubIhkalyI/AAAAAAAAHlE/25psl9x4oNkbJoLc2CKTXgV2pEj6tAvigCLcBGAs/s1600/pencil.png", size = 0.001)) %>%
  e_theme("westeros") %>% 
  e_labels(show = TRUE, postion = "top", fontSize = 20, color = "blue")%>%
  e_legend(show = FALSE) %>%
  e_x_axis(splitLine=list(show = FALSE,fontSize = 30)) %>%
  e_y_axis(show=FALSE, splitLine=list(show = FALSE,fontSize = 20))

## Warning in if (deparse(substitute(symbol)) %in% colnames(e$x$data[[1]])) symbol
## <- deparse(substitute(symbol)): the condition has length > 1 and only the first
## element will be used

B6. EMPLOYMENT

BRFSS$EMPL <- factor(BRFSS$EMPL,
                      levels = c("Employed",
                                 "Unemployed",
                                 "Occupations",
                                 "Retired"), ordered = TRUE)

tbl_empl <- table(BRFSS$EMPL)
pro_tbl_empl <- prop.table(tbl_empl)
pro_tbl_empl

## 
##    Employed  Unemployed Occupations     Retired 
##  0.55142861  0.09831099  0.06583993  0.28442047

df_empl <- BRFSS %>% 
  group_by(EMPL) %>% 
  dplyr::summarise(Count =  n()) %>% 
  mutate(EMPLgroup = factor(EMPL),
         Percentage = round(Count*100/sum(Count),2),
         label =  percent(Percentage/100))

df_empl %>% 
  pander()

EMPL	Count	EMPLgroup	Percentage	label
Employed	154164	Employed	55.14	55.1%
Unemployed	27485	Unemployed	9.83	9.8%
Occupations	18407	Occupations	6.58	6.6%
Retired	79516	Retired	28.44	28.4%

ggplot(df_empl, 
       aes(x = EMPL, 
           y = Percentage)) +
  geom_bar(stat = "identity",
           color = "blue", 
           fill = "royalblue4",
           width = 0.5) +
  geomtext_bar +
  labs(x = "Employment status",
       y = "Percent of respondents (%)",
       caption = "Data source: BRFSS2018") +
  theme_bar

B4. RELP (Relationship STATUS)

BRFSS %>% 
  group_by(RELP) %>% 
  dplyr::count()

## # A tibble: 4 x 2
## # Groups:   RELP [4]
##   RELP                 n
##   <fct>            <int>
## 1 In relationship 161013
## 2 Living apart     44731
## 3 Never married    45149
## 4 Widowed          28679

BRFSS$RELP <- factor(BRFSS$RELP,
                      levels = c("In relationship",
                                 "Living apart",
                                 "Never married",
                                 "Widowed"), ordered = TRUE)

The table is maybe better

df_RELP <- BRFSS %>% 
  group_by(RELP) %>% 
  dplyr::summarise(Count =  n()) %>% 
  mutate(RELPgroup = factor(RELP),
         Percentage = round(Count*100/sum(Count),2),
         label =  percent(Percentage/100))

df_RELP %>% 
  pander()

RELP	Count	RELPgroup	Percentage	label
In relationship	161013	In relationship	57.59	57.59%
Living apart	44731	Living apart	16	16.00%
Never married	45149	Never married	16.15	16.15%
Widowed	28679	Widowed	10.26	10.26%

ggplot(df_RELP, 
       aes(x = RELP, 
           y = Percentage)) +
  geom_bar(stat = "identity",
           color = "blue", 
           fill = "royalblue4",
           width = 0.5) +
  geomtext_bar +
  labs(x = "Relationship status",
       y = "Percent of respondents (%)",
       caption = "Data source: BRFSS2018") +
  theme_bar

B5. INCOME (LEVELS of INCOME)

BRFSS$INCOME <- factor(BRFSS$INCOME,
                      levels = c("<$20K",
                                 "$20K-<$35K",
                                 "$35K-<$50K",
                                 "$50K-<$75K",
                                 "$75K+"), ordered = TRUE)

df_income <- BRFSS %>% 
  group_by(INCOME) %>% 
  dplyr::summarise(Count =  n()) %>% 
  mutate(Income_range = factor(INCOME),
         Percentage = round(Count*100/sum(Count),2),
         label =  percent(Percentage/100))

df_income %>% 
  pander()

INCOME	Count	Income_range	Percentage	label
<$20K	40024	<$20K	14.32	14.32%
$20K-<$35K	51546	$20K-<$35K	18.44	18.44%
$35K-<$50K	38677	$35K-<$50K	13.83	13.83%
$50K-<$75K	47127	$50K-<$75K	16.86	16.86%
$75K+	102198	$75K+	36.56	36.56%

ggplot(df_income, 
       aes(x = INCOME, 
           y = Percentage)) +
  geom_bar(stat = "identity",
           color = "blue", 
           fill = "royalblue4",
           width = 0.5) +
  geomtext_bar +
  labs(x = "Income range",
       y = "Percent of respondents (%)",
       caption = "Data source: BRFSS2018") +
  theme_bar

For presentation

library(ggimage)
library(extrafont)

loadfonts(device = "win")

df1Income <- data.frame(grp = c("<$20K","$20K-<$35K","$35K-<$50K","$50K-<$75K","$75K+"),
                  y = 5:1, 
                  img = rep("./coin.jpg",5), 
                  freq=c(14.32,18.44,13.83,16.86,36.56))

df2Income<- df1Income[rep(seq_len(nrow(df1Income)), df1Income$freq), 1:3]

df2Income <- df2Income %>% group_by(grp) %>% mutate(x = row_number())

ggplot(df2Income) + 
  geom_image(aes(x = x, 
                 y = y, 
                 image = img), size = 0.045) + 
  theme_void() +
  geom_text(data = df1Income, aes(y = y + 0.1, label=paste(freq,"%", sep = "")),  x= -0.5,fontface = 2, size = 4, color="blue") +
  geom_text(data = df1Income, aes(y = y-.3, label = grp), x = 0.5, fontface = 2, size = 4, color="black") +
  xlim(-1,max(df1Income$freq)+2) + ylim(0,max(df1Income$y)+1)

Examining the association between income and education level The question we’d like to answer is: is edcuaiton level related to revenue?

ggplot(BRFSS, aes(x= EDU , group = INCOME)) + 
    geom_bar(aes(y = ..prop.., 
                 fill = factor(..x..)), 
             stat="count",
             width = 0.8) +
    geom_text(size = 4, 
              aes(label = scales::percent(..prop..),
                  y= ..prop.. ), 
              stat= "count",
              color = "black",
              vjust = -.25) +
  scale_fill_manual(values= col4) +
    labs(y = "Percent of respondents", fill="INCOME") +
    facet_grid(~ INCOME) +
    scale_y_continuous(labels = scales::percent)+
  labs(x = "Year of education",
       y = "Percent of respondents",
       caption = "Data source: BRFSS2018") +
  themebar_facet

df_EduIncome <- BRFSS %>% 
  group_by(EDU, INCOME) %>% 
  dplyr::summarise(Count =  n()) 

df_EduIncome

## # A tibble: 20 x 3
## # Groups:   EDU [4]
##    EDU   INCOME     Count
##    <ord> <ord>      <int>
##  1 <8    <$20K       2373
##  2 <8    $20K-<$35K  1322
##  3 <8    $35K-<$50K   439
##  4 <8    $50K-<$75K   216
##  5 <8    $75K+        244
##  6 9-12  <$20K      21070
##  7 9-12  $20K-<$35K 22708
##  8 9-12  $35K-<$50K 12608
##  9 9-12  $50K-<$75K 11564
## 10 9-12  $75K+      14283
## 11 12-15 <$20K      11238
## 12 12-15 $20K-<$35K 16782
## 13 12-15 $35K-<$50K 12896
## 14 12-15 $50K-<$75K 14518
## 15 12-15 $75K+      22825
## 16 16+   <$20K       5343
## 17 16+   $20K-<$35K 10734
## 18 16+   $35K-<$50K 12734
## 19 16+   $50K-<$75K 20829
## 20 16+   $75K+      64846

library(tidyverse)
library(plyr)

## ------------------------------------------------------------------------------

## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)

## ------------------------------------------------------------------------------

## 
## Attaching package: 'plyr'

## The following objects are masked from 'package:plotly':
## 
##     arrange, mutate, rename, summarise

## The following objects are masked from 'package:dplyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize

## The following object is masked from 'package:purrr':
## 
##     compact

df2_EduIncome <- ddply(df_EduIncome,.(INCOME),transform, Percentage = Count*100/sum(Count))
df2_EduIncome

##      EDU     INCOME Count Percentage
## 1     <8      <$20K  2373  5.9289426
## 2   9-12      <$20K 21070 52.6434140
## 3  12-15      <$20K 11238 28.0781531
## 4    16+      <$20K  5343 13.3494903
## 5     <8 $20K-<$35K  1322  2.5646995
## 6   9-12 $20K-<$35K 22708 44.0538548
## 7  12-15 $20K-<$35K 16782 32.5573274
## 8    16+ $20K-<$35K 10734 20.8241183
## 9     <8 $35K-<$50K   439  1.1350415
## 10  9-12 $35K-<$50K 12608 32.5981850
## 11 12-15 $35K-<$50K 12896 33.3428136
## 12   16+ $35K-<$50K 12734 32.9239600
## 13    <8 $50K-<$75K   216  0.4583360
## 14  9-12 $50K-<$75K 11564 24.5379506
## 15 12-15 $50K-<$75K 14518 30.8061196
## 16   16+ $50K-<$75K 20829 44.1975937
## 17    <8      $75K+   244  0.2387522
## 18  9-12      $75K+ 14283 13.9758117
## 19 12-15      $75K+ 22825 22.3340966
## 20   16+      $75K+ 64846 63.4513396

ggplot( )+
  geom_bar(data=df2_EduIncome, 
           aes(x = INCOME, 
               y = Percentage,
               fill = EDU),
               stat = "identity")+
   labs(y = "Percent of respondents (%)", 
        x = "Salary Bracket") +
  scale_fill_manual(values = col4) +
  scale_y_continuous(breaks = seq(0, 100, 20)) +
  themebar_stacked

ggplot(data=df2_EduIncome, aes(x = INCOME, 
                           y = Percentage, 
                           group = EDU)) +
  geom_line(linetype = "dashed", 
            aes(color = EDU), 
            size = 0.5) +
  geom_point(aes(color = EDU), 
             size = 6) +
  scale_color_manual(values = col4) +
   labs(y = "Percent of respondents (%)", x = "Salary Bracket") +
  theme_point +
  guides(color=guide_legend("Year of education"))

ggplot(BRFSS, aes(x= INCOME , group = EDU)) + 
    geom_bar(aes(y = ..prop.., 
                 fill = factor(..x..)), 
             stat="count",
             width = 0.8) +
    geom_text(size = 4, 
              aes(label = scales::percent(..prop..),
                  y= ..prop.. ), 
              stat= "count",
              color = "black",
              vjust = -.25) +
  scale_fill_manual(values= col5) +
    labs(y = "Percent of respondents", fill = "INCOME") +
    facet_grid(~ EDU) +
    scale_y_continuous(labels = scales::percent) +
  themebar_facet

df3_EduIncome <- ddply(df_EduIncome,.(EDU),transform, Percentage = Count*100/sum(Count))
df3_EduIncome

##      EDU     INCOME Count Percentage
## 1     <8      <$20K  2373  51.654332
## 2     <8 $20K-<$35K  1322  28.776665
## 3     <8 $35K-<$50K   439   9.555943
## 4     <8 $50K-<$75K   216   4.701785
## 5     <8      $75K+   244   5.311276
## 6   9-12      <$20K 21070  25.622317
## 7   9-12 $20K-<$35K 22708  27.614218
## 8   9-12 $35K-<$50K 12608  15.332044
## 9   9-12 $50K-<$75K 11564  14.062481
## 10  9-12      $75K+ 14283  17.368939
## 11 12-15      <$20K 11238  14.360010
## 12 12-15 $20K-<$35K 16782  21.444179
## 13 12-15 $35K-<$50K 12896  16.478616
## 14 12-15 $50K-<$75K 14518  18.551221
## 15 12-15      $75K+ 22825  29.165975
## 16   16+      <$20K  5343   4.666946
## 17   16+ $20K-<$35K 10734   9.375819
## 18   16+ $35K-<$50K 12734  11.122757
## 19   16+ $50K-<$75K 20829  18.193491
## 20   16+      $75K+ 64846  56.640987

ggplot( )+
  geom_bar(data=df3_EduIncome, 
           aes(x = EDU, 
               y = Percentage,
               fill = INCOME),
               stat = "identity")+
   labs(y = "Percent of respondents (%)", 
        x = "Years of education") +
  scale_fill_manual(values = col5) +
  themebar_stacked

ggplot(data=df3_EduIncome, aes(x = EDU, 
                           y = Percentage, 
                           group = INCOME)) +
  geom_line(linetype = "dashed", 
            aes(color = INCOME), 
            size = 0.5) +
  geom_point(aes(color = INCOME), 
             size = 6) +
  scale_color_manual(values = col5) +
  labs(y = "Percent of respondents (%)", x = "Year of education") +
  theme_point +
  guides(color=guide_legend("Salary Bracket"))

GENDER and INCOME

df_GenderIncome <- BRFSS %>% 
  group_by(GENDER, INCOME) %>% 
  dplyr::summarise(Count =  n()) 

df_GenderIncome %>% 
  pander()

GENDER	INCOME	Count
Female	<$20K	24445
Female	$20K-<$35K	29608
Female	$35K-<$50K	20421
Female	$50K-<$75K	23800
Female	$75K+	47630
Male	<$20K	15579
Male	$20K-<$35K	21938
Male	$35K-<$50K	18256
Male	$50K-<$75K	23327
Male	$75K+	54568

df2_GenderIncome <- ddply(df_GenderIncome,.(INCOME),transform, Percentage = Count*100/sum(Count))
                      
df2_GenderIncome <- df2_GenderIncome %>% 
  mutate(pos = cumsum(Percentage) - (0.5 * Percentage))

df2_GenderIncome %>% 
  pander()

GENDER	INCOME	Count	Percentage	pos
Female	<$20K	24445	61.08	30.54
Male	<$20K	15579	38.92	80.54
Female	$20K-<$35K	29608	57.44	128.7
Male	$20K-<$35K	21938	42.56	178.7
Female	$35K-<$50K	20421	52.8	226.4
Male	$35K-<$50K	18256	47.2	276.4
Female	$50K-<$75K	23800	50.5	325.3
Male	$50K-<$75K	23327	49.5	375.3
Female	$75K+	47630	46.61	423.3
Male	$75K+	54568	53.39	473.3

ggplot( )+
  geom_bar(data = df2_GenderIncome, 
           aes(x = INCOME, 
               y = Percentage,
               fill = GENDER),
               stat = "identity") +
   labs(y = "Percent of respondents (%)", 
        x = "Salary Bracket") +
  scale_fill_manual(values = col2) +
  scale_y_continuous(breaks = seq(0, 100, 20)) +
  themebar_stacked

It is hard to see the difference in this graph

ggplot(BRFSS, aes(x= GENDER , 
                  group = INCOME)) + 
    geom_bar(aes(y = ..prop.., 
                 fill = factor(..x..)), 
             stat="count",
             width = 0.75) +
    geom_text(size = 4, 
              aes(label = scales::percent(..prop..),
                  y= ..prop.. ), 
              stat= "count",
              color = "black",
              vjust = -.25) +
  scale_fill_manual(values= col2) +
    labs(y = "Percent of respondents", fill="GENDER") +
    facet_grid(~ INCOME) +
    scale_y_continuous(labels = scales::percent)+
  themebar_facet

ggplot(data = df2_GenderIncome, 
       aes(x = INCOME,
           y = Percentage,
           group = GENDER)) +
  geom_line(linetype = "dashed", 
            aes(color = GENDER), 
            size = 0.5) +
  geom_point(aes(color = GENDER), 
             size = 6) +
  scale_color_manual(values = c("red","blue") ) +
  labs(x = "Salary Bracket",
       y = "Percent of respondents (%)") +
  theme_point +
  guides(color=guide_legend("GENDER"))

ggplot(BRFSS, aes(x= INCOME , group = GENDER)) + 
    geom_bar(aes(y = ..prop.., 
                 fill = factor(..x..)), 
             stat="count",
             width = 0.6) +
    geom_text(size = 4, 
              aes(label = scales::percent(..prop..),
                  y= ..prop.. ), 
              stat= "count",
              color = "blue",
              vjust = -.25) +
  scale_fill_manual(values= col5) +
    labs(y = "Percent of respondents", fill = "INCOME") +
    facet_grid(~  GENDER) +
    scale_y_continuous(labels = scales::percent) +
  themebar_facet

library(DescTools)

Strong association : AGE, EMPL Moderate to strong : RELP,INCOME Weak to moderate : GENDER, EDU, RACE

Community

df_COMMUN <- BRFSS %>% 
  group_by(COMMUN) %>% 
  dplyr::summarise(Count = n()) %>% 
  ungroup()%>% 
  arrange(desc(COMMUN)) 

df_COMMUN

## # A tibble: 3 x 2
##   COMMUN    Count
##   <fct>     <int>
## 1 Urban    192173
## 2 Suburban  45058
## 3 Rural     42341

df2_COMMUN <- df_COMMUN %>% 
  mutate(Percentage = round(Count/sum(Count),4)*100,
         lab.pos = cumsum(Percentage)-0.5*Percentage)

df2_COMMUN

## # A tibble: 3 x 4
##   COMMUN    Count Percentage lab.pos
##   <fct>     <int>      <dbl>   <dbl>
## 1 Urban    192173       68.7    34.4
## 2 Suburban  45058       16.1    76.8
## 3 Rural     42341       15.1    92.4

geomtext_pie <- geom_text(aes(y = lab.pos,
                              label = paste(Percentage,"%", sep = "")),
                          col = "blue",
                          size = 5)

ggplot(data = df2_COMMUN, 
       aes(x = "", 
           y = Percentage, 
           fill = COMMUN)) +
  geom_bar(stat = "identity") +
  coord_polar("y") +
  geomtext_pie +
  theme_void() +
    theme(legend.title = element_text(color = "black", size = 14),
        legend.text = element_text(color = "black", size = 14)) +
  scale_fill_manual(values = col3)

Xcommun <- c(68.74, 16.12, 15.14)
labels2 <- c("Urban", "Suburban","Rural")
pct2 <- Xcommun/sum(Xcommun)*100
lbls2 <- paste(labels2, pct2) # add percents to labels
lbls2 <- paste(lbls2,"%",sep="") # ad % to labels
pie(Xcommun,labels = lbls2, col= col3)

RENT/OWN

library(dplyr)
library(plyr)
library(pander)

Xownerchip <- c(72.68, 23.55, 3.78)
labels3 <- c("Own", "Rent","Other situation")
pct3 <-Xownerchip /sum(Xownerchip )*100
lbls3 <- paste(labels3, round(pct3,2) )# add percents to labels
lbls3 <- paste(lbls3,"%" ,sep="") # ad % to labels
pie(Xownerchip ,labels = lbls3, col= col3)

Lifestyle (C) Health behaviors

BRFSS %>% 
  group_by(SMOKESTAT) %>% 
  dplyr::count() %>% 
  pander()

SMOKESTAT	n
Daily smoker	29247
Former smoker	80542
Never smoker	158075
Someday smoker	11708

BRFSS$SMOKESTAT <- factor(BRFSS$SMOKESTAT,
                      levels = c("Daily smoker",
                                 "Someday smoker",
                                 "Former smoker",
                                 "Never smoker"), ordered = TRUE)

df_SMOKESTAT <- BRFSS %>% 
  group_by(SMOKESTAT) %>% 
  dplyr::summarise(Count =  n()) %>% 
  mutate(SMOKESTATgroup = factor(SMOKESTAT),
         Percentage = round(Count*100/sum(Count),2),
         label =  percent(Percentage/100))

ggplot(df_SMOKESTAT, 
       aes(x = SMOKESTAT, 
           y = Percentage)) +
  geom_bar(stat = "identity",
           color = "olivedrab4", 
           fill = "olivedrab4",
           width = 0.5) +
  scale_y_continuous(breaks = seq(0,60,10)) +
  geomtext_bar +
  labs(x = "Smoker status",
       y = "Percent of respondents (%)",
       caption = "Data source: BRFSS2018") +
  theme_bar

C4.EXERCISE

df_EXERpa30 <- BRFSS %>% 
  group_by(EXERpa30) %>% 
  dplyr::summarise(Count = n()) %>% 
  ungroup()%>% 
  arrange(desc(EXERpa30)) 

df2_EXERpa30 <- df_EXERpa30 %>% 
  mutate(Percentage = round(Count/sum(Count),4)*100,
         lab.pos = cumsum(Percentage)-0.5*Percentage)

df2_EXERpa30 %>% 
  pander()

EXERpa30	Count	Percentage	lab.pos
Yes	217106	77.66	38.83
No	62466	22.34	88.83

ggplot(data = df2_EXERpa30, 
       aes(x = "", 
           y = Percentage, 
           fill = EXERpa30))+
  geom_bar(stat = "identity")+
  coord_polar("y") +
  geomtext_pie +
  theme_void() +
  theme(legend.title = element_text(color = "black", size = 14),
        legend.text = element_text(color = "black", size = 14)) +
  scale_fill_manual(values = c("steelblue","olivedrab4"))

C2.ALCpa30

df_ALCpa30 <- BRFSS %>% 
  group_by(ALCpa30) %>% 
  dplyr::summarise(Count = n())

df_ALCpa30 %>% 
  pander()

ALCpa30	Count
0	123079
1-9	98975
10-30	57518

ggplot(df_ALCpa30, 
       aes(x = ALCpa30, 
           y = Count)) +
  geom_bar(stat = "identity",
           color = "olivedrab4", 
           fill = "olivedrab4",
           width = 0.5) +
  geom_text(aes(label=paste0(Count)), 
             vjust = -0.3,
             size = 4,
             color= "black",
             fontface = "bold") +
  labs(x = "# of days where people have at least one drink",
       y = "Frequency",
       caption = "Data source: BRFSS2018") +
  theme_bar

df_ALCpa30 <- BRFSS %>% 
  group_by(ALCpa30) %>% 
  dplyr::summarise(Count =  n()) %>% 
  mutate(ALCpa30group = factor(ALCpa30),
         Percentage = round(Count*100/sum(Count),2),
         label =  percent(Percentage/100))

ggplot(df_ALCpa30, 
       aes(x = ALCpa30, 
           y = Percentage)) +
  geom_bar(stat = "identity",
           color = "olivedrab4", 
           fill = "olivedrab4",
           width = 0.45) +
  geomtext_bar +
  labs(x = "Number of days where people have at least one drink",
       y = "Percent of respondents (%)",
       caption = "Data source: BRFSS2018") +
  theme_bar

SMOKESTAT : strong assocation with the outcome Maybe we should remove all the data that were collected during the past 30 days

C4. SLEPTIM1

library(ggforce)

Numerical Data Descriptive Statistics

ggplot(data = BRFSS, aes(SLEPTIM1)) + 
  geom_bar(stat= "count", 
                 fill= "olivedrab4") +  
  coord_cartesian(clip = "off") +
  labs( x = "Sleep duration (hours)",
        y = "Frequency",
        subtitle = "On average, how many hours of sleep do you get in a 24-hour period?",
        caption = "Data source: BRFSS2018") + 
  scale_y_continuous(limits = c(0, 100000), expand = c(0, 0)) +
  scale_x_continuous(limits = c(0, 15), expand = c(0, 0)) +
  theme_bar

summary(BRFSS$SLEPTIM1)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   3.000   6.000   7.000   7.018   8.000  12.000

ggplot(BRFSS, 
       aes(x = PULMOND, 
           y = SLEPTIM1)) +
  geom_violin(fill = "cornflowerblue") +
  geom_boxplot(width = .2, 
               fill = "orange",
               outlier.color = "orange",
               outlier.size = 2) +
  labs(title = "Sleep duration distribution by PULMOND status")

ggplot(BRFSS, 
       aes(x = MICHD, 
           y = SLEPTIM1, fill = MICHD)) +
  geom_violin(trim = FALSE,fill = "cornflowerblue") +
  geom_boxplot(width = 0.1, 
               fill = "grey",
               outlier.color = "grey",
               outlier.size = 2) +
  labs(title = "Sleep duration distribution by MICHD status")

D1.GENHLTH

BRFSS$GENHLTH <- factor(BRFSS$GENHLTH,
                                levels =  c("Poor",
                                            "Fair",
                                            "Good",
                                            "Very Good",
                                            "Excellent"), ordered = TRUE)

df_GENHLTH <- BRFSS %>% 
  group_by(GENHLTH) %>% 
  dplyr::summarise(Count =  n()) %>% 
  mutate(SMOKESTATgroup = factor(GENHLTH),
         Percentage = round(Count*100/sum(Count),2),
         label =  percent(Percentage/100))

ggplot(df_GENHLTH, 
       aes(x = GENHLTH, 
           y = Percentage)) +
  geom_bar(stat = "identity",
           color = "purple4", 
           fill = "purple4",
           width = 0.6) +
  scale_y_continuous(breaks = seq(0,60,10)) +
  geomtext_bar +
  labs(x = "In general, your health is:",
       y = "Percent of respondents (%)",
       caption = "Data source: BRFSS2018") +
  theme_bar

ggplot(BRFSS, aes(x = GENHLTH, group = INCOME)) + 
    geom_bar(aes(y = ..prop.., 
                 fill = factor(..x..)), 
             stat="count",
             width = 0.8) +
    geom_text(size = 4, 
              aes(label = scales::percent(..prop..),
                  y= ..prop.. ), 
              stat= "count",
              color = "black",
              vjust = -.25) +
  scale_fill_manual(values= col5) +
    labs(y = "Percent of respondents", fill = "INCOME") +
    facet_grid(~ INCOME) +
    scale_y_continuous(labels = scales::percent) +
  themebar_facet

ggplot(BRFSS, aes(x = INCOME , group = GENHLTH)) + 
    geom_bar(aes(y = ..prop.., 
                 fill = factor(..x..)), 
             stat="count",
             width = 0.8) +
    geom_text(size = 4, 
              aes(label = scales::percent(..prop..),
                  y= ..prop.. ), 
              stat= "count",
              color = "black",
              vjust = -.25) +
  scale_fill_manual(values= col5) +
    labs(y = "Percent of respondents", fill = "GENHLTH") +
    facet_grid(~  GENHLTH) +
    scale_y_continuous(labels = scales::percent) +
  themebar_facet

Healthy Days (E)— Health Related Quality of Life E1. PHYSHLTH (Number of Days Physical Health Not Good) 3 level not good physical health status: 0 days, 1-9 days, 10-30 days E2. MENTHLTH (Number of Days Mental Health Not Good) 3 level not good mental health status: 0 days, 1-9 days, 10-30 days

library(gridExtra)

## 
## Attaching package: 'gridExtra'

## The following object is masked from 'package:dplyr':
## 
##     combine

library(grid)

p_PHYSHLTH <- ggplot(BRFSS, aes(x = PHYSHLTHpa30)) +
  geom_bar(aes(y = (..count..)/sum(..count..)),
           fill = "purple4",
           width = 0.6) +
  scale_y_continuous(labels = percent,
                     limits = c(0, 0.7))+
  geom_text(size = 5,
            color= "black",
            aes(y = ((..count..)/sum(..count..)),
                label = scales::percent((..count..)/sum(..count..))),
            stat = "count", 
            vjust = -0.5) +
  labs(x = "Physical Health Not Good",
       y = "Percent of Respondents") +
  theme_bar


p_MENTHLTH <- ggplot(BRFSS, aes(x = MENTHLTHpa30)) +
  geom_bar(aes(y = (..count..)/sum(..count..)),
           fill = "purple4",
           width = 0.6) +
  scale_y_continuous(labels = percent,
                     limits = c(0, 0.7))+
  geom_text(size = 5,
            color= "black",
            aes(y = ((..count..)/sum(..count..)),
                label = scales::percent((..count..)/sum(..count..))),
            stat = "count", 
            vjust = -0.5) +
  labs(x = "Mental Health Not Good",
       y = "Percent of Respondents") +
  theme_bar


grid.arrange(p_PHYSHLTH, p_MENTHLTH, ncol = 2)

HRQOL <- data.frame(
  Health = c("Physical","Physical","Physical",
          "Mental","Mental","Mental"),
  NumberofDays = rep(c("0", "1-9", "10-30"),2),
  Percentage = c(63.5, 21.4, 15.1,
                 66.3, 19.7, 14.0 ))
HRQOL %>% 
  pander()

Health	NumberofDays	Percentage
Physical	0	63.5
Physical	1-9	21.4
Physical	10-30	15.1
Mental	0	66.3
Mental	1-9	19.7
Mental	10-30	14

ggplot(data = HRQOL,
       aes(x = Health,
       y = Percentage,
       fill = NumberofDays)) + 
  geom_bar(stat = "identity",
           position = "dodge",
           size = 0.25)+
  scale_y_continuous(limits = c(0, 70))+
  scale_fill_manual(values= col3)+
  geom_text(aes(label = paste0(Percentage,"%")), 
            position = position_dodge(width = 0.9), 
            vjust= -0.25,
            size = 5,
            color = "black") +
  labs(x = "Health related quality of life",
       y = "Percent of respondents (%)",
       caption = "Data source: BRFSS2018") +
 theme_bar

ggplot(data = HRQOL,
       aes(x = NumberofDays,
       y = Percentage,
       fill = Health)) + 
  geom_bar(stat = "identity",
           position = "dodge")+
  scale_y_continuous(limits = c(0, 70))+
  scale_fill_manual(values= col2)+
  geom_text(aes(label = paste0(Percentage,"%")), 
            position = position_dodge(width = 0.9), 
            vjust = -0.25,
            size = 4.5,
            color = "black") +
  labs(x = "Number of days where health is not good",
       y = "Percent of respondents (%)",
       caption = "Data source: BRFSS2018") +
  theme_bar

ggplot(BRFSS, aes(x = GENHLTH, group = EMPL)) + 
    geom_bar(aes(y = ..prop.., 
                 fill = factor(..x..)), 
             stat="count",
             width = 0.8) +
    geom_text(size = 5, 
              aes(label = scales::percent(..prop..),
                  y= ..prop.. ), 
              stat= "count",
              color = "black",
              vjust = -.25) +
  scale_fill_manual(values= col5) +
    labs(y = "Percent of respondents", fill = "EMPL") +
    facet_grid(~  EMPL) +
    scale_y_continuous(labels = scales::percent) +
  themebar_facet

ggplot(BRFSS, aes(x = MENTHLTHpa30, group = EMPL )) + 
    geom_bar(aes(y = ..prop.., 
                 fill = factor(..x..)), 
             stat="count",
             width = 0.75) +
    geom_text(size = 4, 
              aes(label = scales::percent(..prop..),
                  y= ..prop.. ), 
              stat= "count",
              color = "black",
              vjust = -.25) +
  scale_fill_manual(values= col3) +
    labs(y = "Percent of respondents", fill = "EMPL") +
    facet_grid(~  EMPL) +
    scale_y_continuous(labels = scales::percent) +
  themebar_facet

ggplot(BRFSS, aes(x = PHYSHLTHpa30, group = EMPL )) + 
    geom_bar(aes(y = ..prop.., 
                 fill = factor(..x..)), 
             stat="count",
             width = 0.75) +
    geom_text(size = 4, 
              aes(label = scales::percent(..prop..),
                  y= ..prop.. ), 
              stat= "count",
              color = "black",
              vjust = -.25) +
  scale_fill_manual(values= col3) +
    labs(y = "Percent of respondents", fill = "EMPL") +
    facet_grid(~  EMPL) +
    scale_y_continuous(labels = scales::percent) +
  themebar_facet

ggplot(BRFSS, aes(x = GENHLTH, group = EXERpa30)) + 
    geom_bar(aes(y = ..prop.., 
                 fill = factor(..x..)), 
             stat="count",
             width = 0.75) +
    geom_text(size = 4, 
              aes(label = scales::percent(..prop..),
                  y= ..prop.. ), 
              stat= "count",
              color = "black",
              vjust = -.25) +
  scale_fill_manual(values= col5) +
    labs(y = "Percent of respondents", fill = "EMPL") +
    facet_grid(~ EXERpa30) +
    scale_y_continuous(labels = scales::percent) +
  themebar_facet

ggplot(BRFSS, aes(x = MENTHLTHpa30, group = EXERpa30)) + 
    geom_bar(aes(y = ..prop.., 
                 fill = factor(..x..)), 
             stat="count",
             width = 0.6) +
    geom_text(size = 4, 
              aes(label = scales::percent(..prop..),
                  y= ..prop.. ), 
              stat= "count",
              color = "black",
              vjust = -.25) +
  scale_fill_manual(values= col3) +
    labs(y = "Percent of respondents", fill = "EMPL") +
    facet_grid(~ EXERpa30) +
    scale_y_continuous(labels = scales::percent) +
  themebar_facet

ggplot(BRFSS, aes(x = PHYSHLTHpa30, group = EXERpa30)) + 
    geom_bar(aes(y = ..prop.., 
                 fill = factor(..x..)), 
             stat="count",
             width = 0.6) +
    geom_text(size = 4, 
              aes(label = scales::percent(..prop..),
                  y= ..prop.. ), 
              stat= "count",
              color = "black",
              vjust = -.25) +
  scale_fill_manual(values= col3) +
    labs(y = "Percent of respondents", fill = "EMPL") +
    facet_grid(~ EXERpa30) +
    scale_y_continuous(labels = scales::percent) +
  themebar_facet

Health Care Access (F) F1. HPLN1 (Have any health care coverage) ==> HPLN F2. MEDCOST (Could Not See Doctor Because of Cost) F3. CHECKUP1

df_HLTHPLN <- BRFSS %>% 
  group_by(HLTHPLN) %>% 
  dplyr::summarise(Count = n()) %>% 
  ungroup()%>% 
  arrange(desc(HLTHPLN)) 

df2_HLTHPLN <- df_HLTHPLN %>% 
  mutate(Percentage = round(Count/sum(Count),4)*100,
         lab.pos = cumsum(Percentage)-0.5*Percentage)

df2_HLTHPLN %>% 
  pander()

HLTHPLN	Count	Percentage	lab.pos
Yes	259509	92.82	46.41
No	20063	7.18	96.41

ggplot(data = df2_HLTHPLN, 
       aes(x = "", 
           y = Percentage, 
           fill = HLTHPLN))+
  geom_bar(stat = "identity")+
  coord_polar("y") +
  geom_text(aes(y = lab.pos, 
                label = paste(Percentage,"%", sep = "")), col = "white", size = 4) +
  theme_void() +
  scale_fill_manual(values = c("steelblue","darkgreen")) +
  theme(legend.title = element_text(color = "black", size = 14),
        legend.text = element_text(color = "black", size = 14))

df_MEDICOST <- BRFSS %>% 
  group_by(MEDICOST) %>% 
  dplyr::summarise(Count = n()) %>% 
  ungroup()%>% 
  arrange(desc(MEDICOST)) 

df2_MEDICOST <- df_MEDICOST %>% 
  mutate(percentage = round(Count/sum(Count),4)*100,
         lab.pos = cumsum(percentage)-0.5*percentage)

df2_MEDICOST

## # A tibble: 2 x 4
##   MEDICOST  Count percentage lab.pos
##   <fct>     <int>      <dbl>   <dbl>
## 1 Yes       27236       9.74    4.87
## 2 No       252336      90.3    54.9

ggplot(data = df2_MEDICOST, 
       aes(x = "", 
           y = percentage, 
           fill = MEDICOST))+
  geom_bar(stat = "identity")+
  coord_polar("y") +
  geom_text(aes(y = lab.pos, 
                label = paste(percentage,"%", sep = "")), col = "white", size = 4) +
  theme_void() +
  scale_fill_manual(values = c("steelblue","darkgreen"))+
   theme(legend.title = element_text(color = "black", size = 14),
        legend.text = element_text(color = "black", size = 14))

df_CHECKUP <- BRFSS %>% 
  group_by(CHECKUP) %>% 
  dplyr::summarise(Count =  n()) %>% 
  mutate(CHECKUPgroup = factor(CHECKUP),
         Percentage = round(Count*100/sum(Count),2),
         label =  percent(Percentage/100))

ggplot(df_CHECKUP, 
       aes(x = CHECKUP, 
           y = Percentage)) +
  geom_bar(stat = "identity",
           color = "darkgreen", 
           fill = "darkgreen",
           width = 0.6) +
  scale_y_continuous(breaks = seq(0,100,20)) +
  geomtext_bar +
  labs(x = "Last visited a doctor for a routine checkup (Year)",
       y = "Percent of respondents (%)",
       caption = "Data source: BRFSS2018") +
  theme_bar

BMICAT

BRFSS$BMICAT <- factor(BRFSS$BMICAT,
                            levels = c("Underweight",
                                       "Normal",
                                       "Overweight",
                                       "Obese"), ordered = TRUE)

df_BMICAT <- BRFSS %>% 
  group_by(BMICAT) %>% 
  dplyr::summarise(Count =  n()) %>% 
  mutate(BMICATgroup = factor(BMICAT),
         Percentage = round(Count*100/sum(Count),2),
         label =  percent(Percentage/100))

ggplot(df_BMICAT, 
       aes(x = BMICAT, 
           y = Percentage)) +
  geom_bar(stat = "identity",
           color = "firebrick4", 
           fill = "firebrick4",
           width = 0.6) +
  scale_y_continuous(breaks = seq(0,50,10)) +
  geomtext_bar +
  labs(x = "BMI categories",
       y = "Percent of respondents (%)",
       caption = "Data source: BRFSS2018") +
  theme_bar

BMICAT and EDUCATION

ggplot(BRFSS, aes(x= BMICAT , group = EDU)) + 
    geom_bar(aes(y = ..prop.., 
                 fill = factor(..x..)), 
             stat="count",
             width = 0.8) +
    geom_text(size = 4, 
              aes(label = scales::percent(..prop..),
                  y= ..prop.. ), 
              stat= "count",
              color = "blue",
              vjust = -.25) +
  scale_fill_manual(values= col4) +
    labs(y = "Percent of respondents", fill = "BMICATE") +
    facet_grid(~ EDU) +
    scale_y_continuous(labels = scales::percent) +
  theme_bar

df_EduBMI <- BRFSS %>% 
  group_by(EDU, BMICAT) %>% 
  dplyr::summarise(Count =  n()) 

df2_EduBMI <- ddply(df_EduBMI,.(EDU),transform, Percentage = Count*100/sum(Count))

df2_EduBMI

##      EDU      BMICAT Count Percentage
## 1     <8 Underweight    75   1.632564
## 2     <8      Normal  1065  23.182412
## 3     <8  Overweight  1711  37.244232
## 4     <8       Obese  1743  37.940792
## 5   9-12 Underweight  1449   1.762066
## 6   9-12      Normal 21689  26.375056
## 7   9-12  Overweight 29002  35.268080
## 8   9-12       Obese 30093  36.594798
## 9  12-15 Underweight  1086   1.387700
## 10 12-15      Normal 21402  27.347653
## 11 12-15  Overweight 27732  35.436180
## 12 12-15       Obese 28039  35.828467
## 13   16+ Underweight  1400   1.222857
## 14   16+      Normal 38916  33.991929
## 15   16+  Overweight 42820  37.401953
## 16   16+       Obese 31350  27.383261

ggplot(data = df2_EduBMI, 
       aes(x = EDU, 
           y = Percentage,
           group = BMICAT)) +
  geom_line(linetype = "dashed", 
            aes(color = BMICAT), 
            size = 0.5) +
  geom_point(aes(color = BMICAT), 
             size = 6) +
  scale_color_manual(values = col4) +
  labs(x = "Education year",
       y = "Percent of respondents (%)") +
  theme_point +
  guides(color=guide_legend("Education year"))

ggplot(BRFSS, aes(x =  AGE, group =  BMICAT)) + 
    geom_bar(aes(y = ..prop.., 
                 fill = factor(..x..)), 
             stat="count",
             width = 0.8) +
    geom_text(size = 4, 
              aes(label = scales::percent(..prop..),
                  y= ..prop.. ), 
              stat= "count",
              color = "blue",
              vjust = -.25) +
  scale_fill_manual(values= col7) +
    labs(y = "Percent of respondents", fill = "EDU") +
    facet_grid(~ BMICAT ) +
    scale_y_continuous(labels = scales::percent)+
  theme_bar

df_BMIage <- BRFSS %>% 
  group_by(AGE, BMICAT) %>% 
  dplyr::summarise(Count =  n()) 

df_BMIage

## # A tibble: 28 x 3
## # Groups:   AGE [7]
##    AGE   BMICAT      Count
##    <fct> <ord>       <int>
##  1 18-29 Underweight   892
##  2 18-29 Normal      12996
##  3 18-29 Overweight   8765
##  4 18-29 Obese        6762
##  5 30-39 Underweight   431
##  6 30-39 Normal      10903
##  7 30-39 Overweight  11740
##  8 30-39 Obese       11376
##  9 40-49 Underweight   350
## 10 40-49 Normal      10107
## # ... with 18 more rows

ggplot(data=df_BMIage, aes(x=AGE, 
                           y = Count, 
                           group=BMICAT)) +
  geom_line(linetype="dashed", 
            aes(color = BMICAT), 
            size=1) +
  geom_point(aes(color = BMICAT), 
             size= 6) +
  scale_color_manual(values= col4) +
  scale_y_continuous(breaks = seq(0,50000,10000)) +
  labs(x = "Age group",
       y = "Frequency",
       caption = "Data source: BRFSS2018") +
  theme_point

How to calculate proportion by age group?

df2_BMIage <- ddply(df_BMIage,.(AGE),transform, Percentage = Count*100/sum(Count))
df2_BMIage

##      AGE      BMICAT Count Percentage
## 1  18-29 Underweight   892  3.0324664
## 2  18-29      Normal 12996 44.1815400
## 3  18-29  Overweight  8765 29.7977223
## 4  18-29       Obese  6762 22.9882713
## 5  30-39 Underweight   431  1.2510885
## 6  30-39      Normal 10903 31.6487663
## 7  30-39  Overweight 11740 34.0783745
## 8  30-39       Obese 11376 33.0217707
## 9  40-49 Underweight   350  0.9128847
## 10 40-49      Normal 10107 26.3615023
## 11 40-49  Overweight 13354 34.8304643
## 12 40-49       Obese 14529 37.8951487
## 13 50-59 Underweight   533  1.0008826
## 14 50-59      Normal 13463 25.2812048
## 15 50-59  Overweight 19398 36.4261168
## 16 50-59       Obese 19859 37.2917958
## 17 60-69 Underweight   734  1.1610985
## 18 60-69      Normal 16629 26.3050494
## 19 60-69  Overweight 23827 37.6914072
## 20 60-69       Obese 22026 34.8424450
## 21 70-79 Underweight   603  1.4092736
## 22 70-79      Normal 11924 27.8676264
## 23 70-79  Overweight 17015 39.7658222
## 24 70-79       Obese 13246 30.9572777
## 25   80+ Underweight   467  2.5786858
## 26   80+      Normal  7050 38.9287686
## 27   80+  Overweight  7166 39.5692987
## 28   80+       Obese  3427 18.9232468

ggplot(data=df2_BMIage, aes(x=AGE, 
                           y = Percentage, 
                           group=BMICAT)) +
  geom_line(linetype  = "dashed", 
            aes(color = BMICAT), 
            size = 0.5) +
  geom_point(aes(color = BMICAT), 
             size= 6) +
  scale_color_manual(values= col4) +
   labs(y = "Percent of respondents (%)", x = "Age range") +
        theme_point

**INCOME and BMI

df_BMIincome <- BRFSS %>% 
  group_by(INCOME, BMICAT) %>% 
  dplyr::summarise(Count =  n())

df2_BMIincome <- ddply(df_BMIincome,.(INCOME),transform, Percentage = Count*100/sum(Count))
df2_BMIincome

##        INCOME      BMICAT Count Percentage
## 1       <$20K Underweight  1048  2.6184289
## 2       <$20K      Normal 11416 28.5228863
## 3       <$20K  Overweight 12299 30.7290626
## 4       <$20K       Obese 15261 38.1296222
## 5  $20K-<$35K Underweight   955  1.8527141
## 6  $20K-<$35K      Normal 14914 28.9333799
## 7  $20K-<$35K  Overweight 17685 34.3091607
## 8  $20K-<$35K       Obese 17992 34.9047453
## 9  $35K-<$50K Underweight   524  1.3548104
## 10 $35K-<$50K      Normal 10930 28.2596892
## 11 $35K-<$50K  Overweight 14057 36.3445976
## 12 $35K-<$50K       Obese 13166 34.0409029
## 13 $50K-<$75K Underweight   498  1.0567191
## 14 $50K-<$75K      Normal 13355 28.3383199
## 15 $50K-<$75K  Overweight 17550 37.2397988
## 16 $50K-<$75K       Obese 15724 33.3651622
## 17      $75K+ Underweight   985  0.9638153
## 18      $75K+      Normal 32457 31.7589385
## 19      $75K+  Overweight 39674 38.8207206
## 20      $75K+       Obese 29082 28.4565256

ggplot(data=df2_BMIincome, aes(x= INCOME, 
                           y = Percentage, 
                           group=BMICAT)) +
  geom_line(linetype="dashed", 
            aes(color = BMICAT), 
            size = 0.5) +
  geom_point(aes(color =BMICAT), 
             size= 6) +
  scale_color_manual(values=  col4) +
   labs(y = "Percent of respondents (%)", x = "Income bracket") +
  theme_point

Overweight increase with income, from 30% for 20K to 38% for 75K+ Obsese decreses with income, from 36.9 % for 20 K to 28.9 for 75K+ The normal weight is similar for all The underweight decreases from 2.5% to 1 %.

ggplot(data=df2_BMIincome, aes(x= BMICAT, 
                           y = Percentage, 
                           group = INCOME)) +
  geom_line(linetype="dashed", 
            aes(color = INCOME), 
            size = 0.5) +
  geom_point(aes(color = INCOME), 
             size= 6) +
  scale_color_manual(values=  col5) +
   labs(y = "Percent of respondents (%)", x = "BMI categories") +
  theme_point

df3_BMIincome <- ddply(df_BMIincome,.(BMICAT),transform, Percentage = Count*100/sum(Count))
df3_BMIincome

##        INCOME      BMICAT Count Percentage
## 1       <$20K Underweight  1048   26.13466
## 2  $20K-<$35K Underweight   955   23.81546
## 3  $35K-<$50K Underweight   524   13.06733
## 4  $50K-<$75K Underweight   498   12.41895
## 5       $75K+ Underweight   985   24.56359
## 6       <$20K      Normal 11416   13.74230
## 7  $20K-<$35K      Normal 14914   17.95310
## 8  $35K-<$50K      Normal 10930   13.15726
## 9  $50K-<$75K      Normal 13355   16.07642
## 10      $75K+      Normal 32457   39.07093
## 11      <$20K  Overweight 12299   12.14536
## 12 $20K-<$35K  Overweight 17685   17.46408
## 13 $35K-<$50K  Overweight 14057   13.88140
## 14 $50K-<$75K  Overweight 17550   17.33077
## 15      $75K+  Overweight 39674   39.17839
## 16      <$20K       Obese 15261   16.72897
## 17 $20K-<$35K       Obese 17992   19.72266
## 18 $35K-<$50K       Obese 13166   14.43245
## 19 $50K-<$75K       Obese 15724   17.23650
## 20      $75K+       Obese 29082   31.87942

ggplot(data=df3_BMIincome, 
       aes(x= BMICAT, 
           y = Percentage,
           group = INCOME)) +
  geom_line(linetype = "dashed", 
            aes(color = INCOME), 
            size = 0.5) +
  geom_point(aes(color = INCOME), 
             size= 6) +
  scale_color_manual(values= col5) +
   labs(x = "BMI categories",
        y = "Percent of respondents (%)") +
     theme_point

ggplot(data = df3_BMIincome, 
       aes(x = INCOME, 
           y = Percentage,
           group = BMICAT)) +
  geom_line(linetype = "dashed", 
            aes(color = BMICAT), 
            size = 0.5) +
  geom_point(aes(color = BMICAT), 
             size= 6) +
  scale_color_manual(values= col5) +
   labs(x = "Income range",
        y = "Percent of respondents (%)") +
     theme_point

df_INCOMEBMI <- BRFSS %>% 
  group_by(INCOME, BMICAT) %>% 
  dplyr::summarise(Count =  n()) 

df_INCOMEBMI

## # A tibble: 20 x 3
## # Groups:   INCOME [5]
##    INCOME     BMICAT      Count
##    <ord>      <ord>       <int>
##  1 <$20K      Underweight  1048
##  2 <$20K      Normal      11416
##  3 <$20K      Overweight  12299
##  4 <$20K      Obese       15261
##  5 $20K-<$35K Underweight   955
##  6 $20K-<$35K Normal      14914
##  7 $20K-<$35K Overweight  17685
##  8 $20K-<$35K Obese       17992
##  9 $35K-<$50K Underweight   524
## 10 $35K-<$50K Normal      10930
## 11 $35K-<$50K Overweight  14057
## 12 $35K-<$50K Obese       13166
## 13 $50K-<$75K Underweight   498
## 14 $50K-<$75K Normal      13355
## 15 $50K-<$75K Overweight  17550
## 16 $50K-<$75K Obese       15724
## 17 $75K+      Underweight   985
## 18 $75K+      Normal      32457
## 19 $75K+      Overweight  39674
## 20 $75K+      Obese       29082

ggplot(data = df_INCOMEBMI , aes(x = INCOME, 
                           y = Count, 
                           group = BMICAT)) +
  geom_line(linetype = "dashed", 
            aes(color = BMICAT), 
            size = 0.5) +
  geom_point(aes(color = BMICAT), 
             size= 6) +
  scale_color_manual(values = col4) +
   scale_y_continuous(breaks = seq(0,20000,5000)) +
  labs(x = "Income braket",
       y = "Frequency",
       caption = "Data source: BRFSS2018") +
  theme(axis.text = element_text(size = 12, color = "black"),
        axis.title = element_text(size = 12, face="bold"),
        legend.position = "none",
        plot.caption = element_text(color = "grey", size = 12, face = "italic")) +
  theme(legend.position="top")

by using face_grid (margin = true), we will obtain an grid of total

ggplot(BRFSS, aes(x= BMICAT, group = RACE)) + 
    geom_bar(aes(y = ..prop.., 
                 fill = factor(..x..)), 
             stat = "count",
             width = 0.6) +
    geom_text(size = 4.5, 
              aes(label = scales::percent(..prop..),
                  y= ..prop.. ), 
              stat = "count",
              color = "black",
              vjust = -0.5) +
  scale_fill_manual(values= col4) +
    labs(y = "Percent of respondents", 
         x = "BMI categories") +
    facet_grid(~ RACE) +
    scale_y_continuous(labels = scales::percent)+
  themebar_facet

Table is maybe the best option here

df_BMIrace <- BRFSS %>% 
  group_by(RACE, BMICAT) %>% 
  dplyr::summarise(Count =  n())

df2_BMIrace <- ddply(df_BMIrace,.(RACE),transform, Percentage = Count*100/sum(Count))

df2_BMIrace %>% 
  pander()

RACE	BMICAT	Count	Percentage
AIAN	Underweight	76	1.634
AIAN	Normal	1045	22.46
AIAN	Overweight	1567	33.68
AIAN	Obese	1964	42.22
Asian	Underweight	187	3.298
Asian	Normal	2737	48.27
Asian	Overweight	1962	34.6
Asian	Obese	784	13.83
Black	Underweight	253	1.171
Black	Normal	4577	21.19
Black	Overweight	7269	33.65
Black	Obese	9503	43.99
Hispanic	Underweight	248	1.325
Hispanic	Normal	4871	26.02
Hispanic	Overweight	7073	37.79
Hispanic	Obese	6525	34.86
White	Underweight	3121	1.414
White	Normal	67495	30.57
White	Overweight	80612	36.52
White	Obese	69533	31.5
Others	Underweight	125	1.53
Others	Normal	2347	28.73
Others	Overweight	2782	34.05
Others	Obese	2916	35.69

Obese : AIAN, Black, Multiracial Obese + Overweight : AsiaN IS THE LEASt UNDERWEIGHT : ASIAN

ggplot(BRFSS, aes(x= GENDER , group = BMICAT)) + 
    geom_bar(aes(y = ..prop.., 
                 fill = factor(..x..)), 
             stat="count",
             width = 0.75) +
    geom_text(size = 4.5, 
              aes(label = scales::percent(..prop..),
                  y= ..prop.. ), 
              stat= "count",
              color = "black",
              vjust = -.25) +
  scale_fill_manual(values= col2) +
    labs(y = "Percent of respondents", fill="GENDER") +
    facet_grid(~ BMICAT) +
    scale_y_continuous(labels = scales::percent)+
  themebar_facet

BRFSS %>% 
  filter(BMICAT == "Obese") %>% 
  group_by(GENDER) %>% 
  dplyr::count()

## # A tibble: 2 x 2
## # Groups:   GENDER [2]
##   GENDER     n
##   <fct>  <int>
## 1 Female 47853
## 2 Male   43372

BRFSS %>% 
  filter(BMICAT == "Underweight") %>% 
  group_by(GENDER) %>% 
  dplyr::count()

## # A tibble: 2 x 2
## # Groups:   GENDER [2]
##   GENDER     n
##   <fct>  <int>
## 1 Female  2780
## 2 Male    1230

df_BMIgender<- BRFSS %>% 
  group_by(GENDER, BMICAT) %>% 
  dplyr::summarise(Count =  n())

df2_BMIgender <- ddply(df_BMIgender,.(GENDER),transform, Percentage = Count*100/sum(Count))

df2_BMIgender  %>% 
  pander()

GENDER	BMICAT	Count	Percentage
Female	Underweight	2780	1.905
Female	Normal	50141	34.37
Female	Overweight	45130	30.93
Female	Obese	47853	32.8
Male	Underweight	1230	0.9202
Male	Normal	32931	24.64
Male	Overweight	56135	42
Male	Obese	43372	32.45

#This is the best graph EVER
ggplot(df2_BMIgender, aes(BMICAT, Percentage, fill = GENDER)) + 
  geom_bar(position="dodge",
           stat="identity")+
   scale_fill_manual(values= col2) +
      labs(x = "BMI categories", 
           y = "Percent of Respondents (%)",
        caption = "Data source: BRFSS2018") +
   geom_text(size = 4.5,
            color= "blue",
            aes(y = Percentage,
                label = paste0(round(Percentage,1),"%")),
            stat = "identity", 
            position = position_dodge2(width = 0.9, preserve = "single"), 
            vjust= -0.5, 
            hjust= 0.5) +
  themebar_facet

ggplot(BRFSS, aes(x = BMICAT, group = GENDER)) + 
    geom_bar(aes(y = ..prop.., 
                 fill = factor(..x..)), 
             stat="count",
             width = 0.75) +
    geom_text(size = 5, 
              aes(label = scales::percent(..prop..),
                  y = ..prop.. ), 
              stat= "count",
              color = "blue",
              vjust = 0.5,
              hjust = -0.1) +
  scale_fill_manual(values= col4) +
    labs(x= "Categories of BMI",
      y = "Percent of respondents") +
    facet_grid(~ GENDER) +
  coord_flip() +
    scale_y_continuous(labels = scales::percent,limits = c(0,0.5))+
  themebar_facet

Amomg male group, 41% of them are overweight while those of female are only 31% 2 % of female are underweight, this number decrease to 1 % for male.

ggplot(BRFSS, aes(x = BMICAT, group = GENHLTH)) + 
    geom_bar(aes(y = ..prop.., 
                 fill = factor(..x..)), 
             stat="count",
             width = 0.75) +
    geom_text(size = 4.5, 
              aes(label = scales::percent(..prop..),
                  y= ..prop.. ), 
              stat= "count",
              color = "black",
              vjust = -.25) +
  scale_fill_manual(values= col4) +
    labs(y = "Percent of respondents", fill="GENDER") +
    facet_grid(~ GENHLTH) +
    scale_y_continuous(labels = scales::percent)+
  theme(axis.text = element_text(size = 12, color = "black", angle = 90),
        axis.title = element_text(size = 12, face= "bold"),
        plot.title = element_text(color = "black", size = 12, face = "bold.italic"),
        plot.caption = element_text(color = "grey", size = 12, face = "italic"),
        strip.text = element_text(size = 12, color = "black", face = "bold.italic"),
        strip.background = element_rect(fill = "slategray1"),
        legend.text = element_text(colour="black", size = 20),
        legend.position = "none")

ggplot(BRFSS, aes(x =  BMICAT, group = EXERpa30  )) + 
    geom_bar(aes(y = ..prop.., 
                 fill = factor(..x..)), 
             stat="count",
             width = 0.75) +
    geom_text(size = 4.5, 
              aes(label = scales::percent(..prop..),
                  y= ..prop.. ), 
              stat= "count",
              color = "black",
              vjust = -.25) +
  scale_fill_manual(values= col4) +
    labs(y = "Percent of respondents", fill="GENDER") +
    facet_grid(~ EXERpa30) +
    scale_y_continuous(labels = scales::percent)+
  themebar_facet

ORAL HEALTH H1. RMVTETH

BRFSS$RMVTETH <- factor(BRFSS$RMVTETH,
  levels= c("0",
            "1-5",
            ">= 6",
            "All"), ordered = TRUE)

df_RMVTETH <- BRFSS %>% 
  group_by(RMVTETH) %>% 
  dplyr::summarise(Count =  n()) %>% 
  mutate(CHECKUPgroup = factor(RMVTETH),
         Percentage = round(Count*100/sum(Count),2),
         label =  percent(Percentage/100))

ggplot(df_RMVTETH, 
       aes(x = RMVTETH, 
           y = Percentage)) +
  geom_bar(stat = "identity",
           color = "darkgreen", 
           fill = "darkgreen",
           width = 0.6) +
  scale_y_continuous(breaks = seq(0,50,10)) +
  geomtext_bar +
  labs(x = "Number of teeth that have been removed",
       y = "Percent of respondents (%)",
       caption = "Data source: BRFSS2018") +
  theme_bar

ggplot(BRFSS, aes(x= RMVTETH , group = AGE)) + # teeth removed by age
    geom_bar(aes(y = ..prop.., 
                 fill = factor(..x..)), 
             stat="count",
             width = 0.8) +
    geom_text(size = 4, 
              aes(label = scales::percent(..prop..),
                  y= ..prop.. ), 
              stat= "count",
              color = "blue",
              vjust = -.25) +
  scale_fill_manual(values= col4) +
    labs(y = "Percent of respondents", fill="IRMVTETH") +
    facet_grid(~ AGE) +
    scale_y_continuous(labels = scales::percent)+
  themebar_facet

df_RMVTETH <- BRFSS %>%
  group_by(RMVTETH) %>% 
  dplyr::summarise(Count =  n()) %>% 
  mutate(Rmvteth = factor(RMVTETH),
         Ratio = Count/sum(Count),
         label = percent(Ratio %>% round(4)))

df_RMVTETH %>% 
  pander()

RMVTETH	Count	Rmvteth	Ratio	label
0	146320	0	0.5234	52.3%
1-5	83126	1-5	0.2973	29.7%
>= 6	32768	>= 6	0.1172	11.7%
All	17358	All	0.06209	6.2%

Age in function of of teeth removed

df_RmvtethAge <- BRFSS %>% 
  group_by(AGE, RMVTETH) %>% 
  dplyr::summarise(Count =  n()) 

df2_RmvtethAge <- ddply(df_RmvtethAge, .(RMVTETH), transform, Percentage = Count*100/sum(Count))

df2_RmvtethAge

##      AGE RMVTETH Count Percentage
## 1  18-29       0 25312 17.2990705
## 2  30-39       0 24595 16.8090487
## 3  40-49       0 24450 16.7099508
## 4  50-59       0 27550 18.8285949
## 5  60-69       0 26068 17.8157463
## 6  70-79       0 13598  9.2933297
## 7    80+       0  4747  3.2442592
## 8  18-29     1-5  3667  4.4113755
## 9  30-39     1-5  8150  9.8043933
## 10 40-49     1-5 10597 12.7481173
## 11 50-59     1-5 16824 20.2391550
## 12 60-69     1-5 22288 26.8123090
## 13 70-79     1-5 15391 18.5152660
## 14   80+     1-5  6209  7.4693838
## 15 18-29    >= 6   358  1.0925293
## 16 30-39    >= 6  1322  4.0344238
## 17 40-49    >= 6  2492  7.6049805
## 18 50-59    >= 6  6100 18.6157227
## 19 60-69    >= 6  9718 29.6569824
## 20 70-79    >= 6  8527 26.0223389
## 21   80+    >= 6  4251 12.9730225
## 22 18-29     All    78  0.4493605
## 23 30-39     All   383  2.2064754
## 24 40-49     All   801  4.6145869
## 25 50-59     All  2779 16.0099090
## 26 60-69     All  5142 29.6232285
## 27 70-79     All  5272 30.3721627
## 28   80+     All  2903 16.7242770

col7 <-  c("#0072BB", "#FF4C3B","#FFD034","#51A39D","#06425C", "#C6C8CA","#814374")

ggplot( )+
  geom_bar(data = df2_RmvtethAge, 
           aes(x = RMVTETH, 
               y = Percentage,
               fill = AGE),
               stat = "identity")+
   labs(y = "Percent of respondents (%)", 
        x = "Number of teeth removed") +
  scale_fill_manual(values = col7) +
  scale_y_continuous(breaks = seq(0, 100, 20)) +
  themebar_stacked

df2_AgeRmvteth <- ddply(df_RmvtethAge ,.(AGE),transform, Percentage = Count*100/sum(Count))
df2_AgeRmvteth

##      AGE RMVTETH Count Percentage
## 1  18-29       0 25312 86.0513344
## 2  18-29     1-5  3667 12.4664287
## 3  18-29    >= 6   358  1.2170661
## 4  18-29     All    78  0.2651708
## 5  30-39       0 24595 71.3933237
## 6  30-39     1-5  8150 23.6574746
## 7  30-39    >= 6  1322  3.8374456
## 8  30-39     All   383  1.1117562
## 9  40-49       0 24450 63.7715180
## 10 40-49     1-5 10597 27.6395409
## 11 40-49    >= 6  2492  6.4997392
## 12 40-49     All   801  2.0892019
## 13 50-59       0 27550 51.7341746
## 14 50-59     1-5 16824 31.5925863
## 15 50-59    >= 6  6100 11.4547537
## 16 50-59     All  2779  5.2184853
## 17 60-69       0 26068 41.2363958
## 18 60-69     1-5 22288 35.2568970
## 19 60-69    >= 6  9718 15.3726905
## 20 60-69     All  5142  8.1340167
## 21 70-79       0 13598 31.7799383
## 22 70-79     1-5 15391 35.9703655
## 23 70-79    >= 6  8527 19.9284846
## 24 70-79     All  5272 12.3212116
## 25   80+       0  4747 26.2120375
## 26   80+     1-5  6209 34.2849255
## 27   80+    >= 6  4251 23.4732192
## 28   80+     All  2903 16.0298178

RMVTETH and AGE

ggplot( )+
  geom_bar(data = df2_AgeRmvteth, 
           aes(x = AGE, 
               y = Percentage,
               fill =  RMVTETH),
               stat = "identity")+
   labs(y = "Percent of respondents (%)", 
        x = "Age group") +
  scale_fill_manual(values = col4) +
  scale_y_continuous(breaks = seq(0, 100, 20)) +
  themebar_stacked

RMVTETH and EDU

ggplot(BRFSS, aes(x= RMVTETH , group = EDU)) + # teeth removed by age
    geom_bar(aes(y = ..prop.., 
                 fill = factor(..x..)), 
             stat="count",
             width = 0.8) +
    geom_text(size = 4, 
              aes(label = scales::percent(..prop..),
                  y= ..prop.. ), 
              stat= "count",
              color = "blue",
              vjust = -.25) +
  scale_fill_manual(values= col4) +
    labs(y = "Percent of respondents", fill="IRMVTETH") +
    facet_grid(~ EDU) +
    scale_y_continuous(labels = scales::percent)+
  themebar_facet

ggplot(BRFSS, aes(x= RMVTETH , group = EMPL)) + # teeth removed by age
    geom_bar(aes(y = ..prop.., 
                 fill = factor(..x..)), 
             stat="count",
             width = 0.8) +
    geom_text(size = 4, 
              aes(label = scales::percent(..prop..),
                  y= ..prop.. ), 
              stat= "count",
              color = "blue",
              vjust = -.25) +
  scale_fill_manual(values= col4) +
    labs(y = "Percent of respondents", fill="IRMVTETH") +
    facet_grid(~ EMPL) +
    scale_y_continuous(labels = scales::percent)+
  themebar_facet

ggplot(BRFSS, aes(x= RMVTETH , group = INCOME)) + # teeth removed by age
    geom_bar(aes(y = ..prop.., 
                 fill = factor(..x..)), 
             stat="count",
             width = 0.8) +
    geom_text(size = 4, 
              aes(label = scales::percent(..prop..),
                  y= ..prop.. ), 
              stat= "count",
              color = "blue",
              vjust = -.25) +
  scale_fill_manual(values= col4) +
    labs(y = "Percent of respondents", fill="IRMVTETH") +
    facet_grid(~ INCOME) +
    scale_y_continuous(labels = scales::percent)+
  themebar_facet

ggplot(BRFSS, aes(x= RMVTETH , group = EXERpa30)) + # teeth removed by age
    geom_bar(aes(y = ..prop.., 
                 fill = factor(..x..)), 
             stat="count",
             width = 0.8) +
    geom_text(size = 4, 
              aes(label = scales::percent(..prop..),
                  y= ..prop.. ), 
              stat= "count",
              color = "blue",
              vjust = -.25) +
  scale_fill_manual(values= col4) +
    labs(y = "Percent of respondents", fill="RMVTETH") +
    facet_grid(~ EXERpa30) +
    scale_y_continuous(labels = scales::percent)+
  themebar_facet

ggplot(BRFSS, aes(x= RMVTETH , group = SMOKESTAT)) + # teeth removed by age
    geom_bar(aes(y = ..prop.., 
                 fill = factor(..x..)), 
             stat="count",
             width = 0.8) +
    geom_text(size = 4, 
              aes(label = scales::percent(..prop..),
                  y= ..prop.. ), 
              stat= "count",
              color = "blue",
              vjust = -.25) +
  scale_fill_manual(values= col4) +
    labs(y = "Percent of respondents", fill="IRMVTETH") +
    facet_grid(~ SMOKESTAT) +
    scale_y_continuous(labels = scales::percent)+
  themebar_facet

ggplot(data = df2_AgeRmvteth, aes(x = AGE, 
                           y = Percentage, 
                           group = RMVTETH)) +
  geom_line(linetype = "dashed", 
            aes(color = RMVTETH), 
            size = 0.5) +
  geom_point(aes(color = RMVTETH), 
             size = 6) +
  scale_color_manual(values = col4) +
  labs(y = "Percent of respondents (%)", 
       x = "Age group") +
  theme_point +
  guides(color=guide_legend("Nb of removed teeth"))

H1.LASTDENTV

df_LASTDENTV <- BRFSS %>% 
  group_by(LASTDENTV) %>% 
  dplyr::summarise(Count =  n()) %>% 
  mutate(LASTDENTVgroup = factor(LASTDENTV),
         Percentage = round(Count*100/sum(Count),2),
         label =  percent(Percentage/100))

ggplot(df_LASTDENTV, 
       aes(x = LASTDENTV, 
           y = Percentage)) +
  geom_bar(stat = "identity",
           color = "darkgreen", 
           fill = "darkgreen",
           width = 0.6) +
  scale_y_continuous(breaks = seq(0,70,10)) +
  geomtext_bar +
  labs(x = "Last time visited dentist",
       y = "Percent of respondents (%)",
       caption = "Data source: BRFSS2018") +
  theme_bar

df_RmvtethLastdentv <- BRFSS %>% 
  group_by(LASTDENTV, RMVTETH) %>% 
  dplyr::summarise(Count =  n()) 

df_RmvtethLastdentv

## # A tibble: 16 x 3
## # Groups:   LASTDENTV [4]
##    LASTDENTV RMVTETH  Count
##    <fct>     <ord>    <int>
##  1 <1        0       110367
##  2 <1        1-5      60837
##  3 <1        >= 6     19637
##  4 <1        All       4618
##  5 1-<2      0        13766
##  6 1-<2      1-5       9053
##  7 1-<2      >= 6      4292
##  8 1-<2      All       2104
##  9 2-<5      0        10687
## 10 2-<5      1-5       7089
## 11 2-<5      >= 6      3942
## 12 2-<5      All       2771
## 13 5+        0        11500
## 14 5+        1-5       6147
## 15 5+        >= 6      4897
## 16 5+        All       7865

df2_RmvtethLastdentv <- ddply(df_RmvtethLastdentv,.(LASTDENTV),transform, Percentage = Count*100/sum(Count))
df2_RmvtethLastdentv

##    LASTDENTV RMVTETH  Count Percentage
## 1         <1       0 110367  56.465550
## 2         <1     1-5  60837  31.125198
## 3         <1    >= 6  19637  10.046608
## 4         <1     All   4618   2.362644
## 5       1-<2       0  13766  47.119630
## 6       1-<2     1-5   9053  30.987506
## 7       1-<2    >= 6   4292  14.691083
## 8       1-<2     All   2104   7.201780
## 9       2-<5       0  10687  43.640002
## 10      2-<5     1-5   7089  28.947691
## 11      2-<5    >= 6   3942  16.097023
## 12      2-<5     All   2771  11.315284
## 13        5+       0  11500  37.817751
## 14        5+     1-5   6147  20.214410
## 15        5+    >= 6   4897  16.103785
## 16        5+     All   7865  25.864053

ggplot( )+
  geom_bar(data = df2_RmvtethLastdentv, 
           aes(x = LASTDENTV, 
               y = Percentage,
               fill =  RMVTETH),
               stat = "identity")+
   labs(y = "Percent of respondents (%)", 
        x = "Last time visited dentist") +
  scale_fill_manual(values = col4) +
  scale_y_continuous(breaks = seq(0, 100, 10)) +
  themebar_stacked

ggplot(data = df2_RmvtethLastdentv, aes(x = LASTDENTV, 
                           y = Percentage, 
                           group = RMVTETH)) +
  geom_line(linetype = "dashed", 
            aes(color = RMVTETH), 
            size = 0.5) +
  geom_point(aes(color = RMVTETH), 
             size = 6) +
  scale_color_manual(values = col4) +
  labs(y = "Percent of respondents (%)", x = "Last time visited dental clinic") +
  theme_point +
  guides(color=guide_legend("Nb of removed teeth"))

Visiting dentist and income

ggplot(BRFSS, aes(x= LASTDENTV , group = INCOME)) + # teeth removed by age
    geom_bar(aes(y = ..prop.., 
                 fill = factor(..x..)), 
             stat="count",
             width = 0.8) +
    geom_text(size = 4, 
              aes(label = scales::percent(..prop..),
                  y= ..prop.. ), 
              stat= "count",
              color = "blue",
              vjust = -.25) +
  scale_fill_manual(values= col4) +
    labs(y = "Percent of respondents", fill="IRMVTETH") +
    facet_grid(~ INCOME) +
    scale_y_continuous(labels = scales::percent)+
  themebar_facet

ggplot(BRFSS, aes(x= LASTDENTV, group = EXERpa30)) + # teeth removed by age
    geom_bar(aes(y = ..prop.., 
                 fill = factor(..x..)), 
             stat="count",
             width = 0.8) +
    geom_text(size = 4, 
              aes(label = scales::percent(..prop..),
                  y= ..prop.. ), 
              stat= "count",
              color = "blue",
              vjust = -.25) +
  scale_fill_manual(values= col4) +
    labs(y = "Percent of respondents", fill="IRMVTETH") +
    facet_grid(~ EXERpa30) +
    scale_y_continuous(labels = scales::percent)+
  themebar_facet

ggplot(BRFSS, aes(x= LASTDENTV, group = HLTHPLN)) + # teeth removed by age
    geom_bar(aes(y = ..prop.., 
                 fill = factor(..x..)), 
             stat="count",
             width = 0.8) +
    geom_text(size = 4, 
              aes(label = scales::percent(..prop..),
                  y= ..prop.. ), 
              stat= "count",
              color = "blue",
              vjust = -.25) +
  scale_fill_manual(values= col4) +
    labs(y = "Percent of respondents", fill="RMVTETH") +
    facet_grid(~ HLTHPLN) +
    scale_y_continuous(labels = scales::percent)+
  themebar_facet

ggplot(BRFSS, aes(x= LASTDENTV, group = MEDICOST)) + # teeth removed by age
    geom_bar(aes(y = ..prop.., 
                 fill = factor(..x..)), 
             stat="count",
             width = 0.8) +
    geom_text(size = 4, 
              aes(label = scales::percent(..prop..),
                  y= ..prop.. ), 
              stat= "count",
              color = "blue",
              vjust = -.25) +
  scale_fill_manual(values= col4) +
    labs(y = "Percent of respondents", fill="RMVTETH") +
    facet_grid(~ MEDICOST) +
    scale_y_continuous(labels = scales::percent)+
  themebar_facet

df_IncomeLastdentv <- BRFSS %>% 
  group_by(LASTDENTV, INCOME) %>% 
  dplyr::summarise(Count =  n()) 

df2_IncomeLastdentv <- ddply(df_IncomeLastdentv,.(LASTDENTV),transform, Percentage = Count*100/sum(Count))

df2_IncomeLastdentv

##    LASTDENTV     INCOME Count Percentage
## 1         <1      <$20K 18372   9.399414
## 2         <1 $20K-<$35K 29558  15.122353
## 3         <1 $35K-<$50K 26273  13.441694
## 4         <1 $50K-<$75K 35474  18.149075
## 5         <1      $75K+ 85782  43.887465
## 6       1-<2      <$20K  5811  19.890467
## 7       1-<2 $20K-<$35K  6872  23.522163
## 8       1-<2 $35K-<$50K  4488  15.361972
## 9       1-<2 $50K-<$75K  4558  15.601575
## 10      1-<2      $75K+  7486  25.623823
## 11      2-<5      <$20K  5836  23.831108
## 12      2-<5 $20K-<$35K  6277  25.631916
## 13      2-<5 $35K-<$50K  3705  15.129242
## 14      2-<5 $50K-<$75K  3621  14.786231
## 15      2-<5      $75K+  5050  20.621504
## 16        5+      <$20K 10005  32.901444
## 17        5+ $20K-<$35K  8839  29.067053
## 18        5+ $35K-<$50K  4211  13.847874
## 19        5+ $50K-<$75K  3474  11.424249
## 20        5+      $75K+  3880  12.759380

ggplot( )+
  geom_bar(data = df2_IncomeLastdentv, 
           aes(x = LASTDENTV, 
               y = Percentage,
               fill =  INCOME),
               stat = "identity")+
   labs(y = "Percent of respondents (%)", 
        x = "Last time visited dentist") +
  scale_fill_manual(values = col5) +
  scale_y_continuous(breaks = seq(0, 100, 10)) +
  themebar_stacked

df3_IncomeLastdentv <- ddply(df_IncomeLastdentv,.(INCOME),transform, Percentage = Count*100/sum(Count))
df3_IncomeLastdentv

##    LASTDENTV     INCOME Count Percentage
## 1         <1      <$20K 18372  45.902459
## 2       1-<2      <$20K  5811  14.518789
## 3       2-<5      <$20K  5836  14.581251
## 4         5+      <$20K 10005  24.997501
## 5         <1 $20K-<$35K 29558  57.342956
## 6       1-<2 $20K-<$35K  6872  13.331781
## 7       2-<5 $20K-<$35K  6277  12.177473
## 8         5+ $20K-<$35K  8839  17.147790
## 9         <1 $35K-<$50K 26273  67.929260
## 10      1-<2 $35K-<$50K  4488  11.603796
## 11      2-<5 $35K-<$50K  3705   9.579337
## 12        5+ $35K-<$50K  4211  10.887608
## 13        <1 $50K-<$75K 35474  75.273198
## 14      1-<2 $50K-<$75K  4558   9.671738
## 15      2-<5 $50K-<$75K  3621   7.683494
## 16        5+ $50K-<$75K  3474   7.371570
## 17        <1      $75K+ 85782  83.937063
## 18      1-<2      $75K+  7486   7.324997
## 19      2-<5      $75K+  5050   4.941388
## 20        5+      $75K+  3880   3.796552

ggplot( )+
  geom_bar(data = df3_IncomeLastdentv, 
           aes(x = INCOME , 
               y = Percentage,
               fill =  LASTDENTV),
               stat = "identity")+
   labs(y = "Percent of respondents (%)", 
        x = "Income bracket") +
  scale_fill_manual(values = col4) +
  scale_y_continuous(breaks = seq(0, 100, 10)) +
  themebar_stacked

ggplot(data = df3_IncomeLastdentv, 
       aes(x = INCOME, 
           y = Percentage,
           group = LASTDENTV)) +
  geom_line(linetype = "dashed", 
            aes(color = LASTDENTV), 
            size = 0.5) +
  geom_point(aes(color = LASTDENTV), 
             size = 6) +
  scale_color_manual(values = col5) +
  labs(x = "Income range",
       y = "Percent of respondents (%)") +
  theme_point +
  guides(color=guide_legend("Last time visited dentist"))

Last than 1 year — more frequent than from 1 to 2 year higher income - visit dentist more

df_LASTDENTVRACE <- BRFSS %>% 
  select(LASTDENTV, RACE) %>% 
  filter(RACE == "White" | RACE == "Black"| RACE == "Hispanic" | RACE == "Asian") %>% 
  group_by(LASTDENTV, RACE) %>% 
  dplyr::summarise(Count =  n()) 


df2_LASTDENTVRACE <- ddply(df_LASTDENTVRACE,.(RACE),transform, Percentage = round(Count*100/sum(Count),2))

df2_LASTDENTVRACE

##    LASTDENTV     RACE  Count Percentage
## 1         <1    Asian   4114      72.56
## 2       1-<2    Asian    661      11.66
## 3       2-<5    Asian    468       8.25
## 4         5+    Asian    427       7.53
## 5         <1    Black  13113      60.70
## 6       1-<2    Black   3242      15.01
## 7       2-<5    Black   2509      11.61
## 8         5+    Black   2738      12.67
## 9         <1 Hispanic  11553      61.72
## 10      1-<2 Hispanic   2741      14.64
## 11      2-<5 Hispanic   2176      11.63
## 12        5+ Hispanic   2247      12.01
## 13        <1    White 158845      71.95
## 14      1-<2    White  20871       9.45
## 15      2-<5    White  17920       8.12
## 16        5+    White  23125      10.48

ggplot(df2_LASTDENTVRACE, 
       aes(x = LASTDENTV, 
           y = Percentage,
           fill = LASTDENTV)) +
  geom_bar(stat = "identity",
           width = 0.75) +
  scale_fill_manual(values= c("blue", "cyan4", "cyan3","cyan2", "red")) +
  scale_y_continuous(breaks = seq(0,100,20)) +
  geomtext_bar +
  facet_wrap(~ RACE, nrow = 2) +
  labs(x = "Last time visited dentist (Year)",
       y = "Percent of respondents (%)",
       caption = "Data source: BRFSS2018") +
  theme_bar

G. CHRONIC HEALTH CONDITIONS

geombar_CHC <- geom_bar(aes(y = (..count..)/sum(..count..)),
           fill = "firebrick4",
           alpha = 0.9,
           width = 0.5) 
  
geomtext_CHC <- geom_text(size = 4,
            color= "blue",
            aes(y = ((..count..)/sum(..count..)),
                label = scales::percent((..count..)/sum(..count..))),
            stat = "count", 
            vjust = -0.5)

G1.ARTHRITIS

ggplot(BRFSS, aes(x = ARTHRITIS)) +
  geombar_CHC +
  geomtext_CHC +
  scale_y_continuous(labels = percent) +
  labs(x = "Arthritis?",
       y = "Percent of Respondents",
      caption = "Data source: BRFSS2018") + 
  theme_bar

G2. PULMOND chronic obstructive pulmonary disease, C.O.P.D., emphysema or chronic bronchitis?

ggplot(BRFSS, aes(x = PULMOND)) +
  geombar_CHC +
  geomtext_CHC +
  scale_y_continuous(labels = percent)+
  labs(x = "PULMOND?",
       y = "Percent of Respondents",
       caption = "Data source: BRFSS2018") +
  theme_bar

G3.ASTHMA

ggplot(BRFSS, aes(x = ASTHMA)) +
  geombar_CHC +
  geomtext_CHC +
  scale_y_continuous(labels = percent)+
  labs(x = "ASTHMA?",
       y = "Percent of Respondents",
        caption = "Data source: BRFSS2018") +
  theme_bar

G4.DEPRESSION

ggplot(BRFSS, aes(x = DEPRESSION)) +
  geombar_CHC +
  geomtext_CHC +
  scale_y_continuous(labels = percent)+
  labs(x = "DEPRESSION?",
       y = "Percent of Respondents",
        caption = "Data source: BRFSS2018") +
  theme_bar

G5.KIDNEY

ggplot(BRFSS, aes(x = KIDNEY)) +
  geombar_CHC +
  geomtext_CHC +
  scale_y_continuous(labels = percent)+
  labs(x = "DEPRESSION?",
       y = "Percent of Respondents",
        caption = "Data source: BRFSS2018") +
  theme_bar

G6. DIABETE

ggplot(BRFSS, aes(x = DIABETE)) +
  geombar_CHC +
  geomtext_CHC +
  scale_y_continuous(labels = percent)+
  labs(x = "DEPRESSION?",
       y = "Percent of Respondents",
        caption = "Data source: BRFSS2018") +
  theme_bar

DIABETE and BMI and EXERCISE

ggplot(BRFSS, aes(x = DIABETE,  group = BMICAT)) + 
    geom_bar(aes(y = ..prop.., 
                 fill = factor(..x..)), 
             stat="count",
             width = 0.6) +
    geom_text(size = 4, 
              aes(label = scales::percent(..prop..),
                  y= ..prop.. ), 
              stat= "count",
              color = "black",
              vjust = -.25) +
  scale_fill_manual(values= col2) +
    labs(y = "Percent of respondents", fill = "EDIABETE") +
    facet_grid(~ BMICAT) +
    scale_y_continuous(labels = scales::percent) +
  themebar_facet

ggplot(BRFSS, aes(x = BMICAT,  group = DIABETE)) + 
    geom_bar(aes(y = ..prop.., 
                 fill = factor(..x..)), 
             stat = "count",
             width = 0.6) +
    geom_text(size = 4, 
              aes(label = scales::percent(..prop..),
                  y= ..prop.. ), 
              stat = "count",
              color = "black",
              vjust = -.25) +
  scale_fill_manual(values= col4) +
    labs(y = "Percent of respondents", fill = "BMICAT") +
    facet_grid(~ DIABETE) +
    scale_y_continuous(labels = scales::percent) +
  themebar_facet

library(ggpubr)

## Loading required package: magrittr

## 
## Attaching package: 'magrittr'

## The following object is masked from 'package:purrr':
## 
##     set_names

## The following object is masked from 'package:tidyr':
## 
##     extract

## 
## Attaching package: 'ggpubr'

## The following object is masked from 'package:plyr':
## 
##     mutate

## The following object is masked from 'package:ggimage':
## 
##     theme_transparent

theme_boxplot <-   theme(text = element_text(size = 12),
        axis.text = element_text(size = 12),
        axis.title = element_text(size = 12,face="bold"),
        plot.caption = element_text(color = "grey",size = 12, face = "italic"),
        strip.text = element_text(size = 12, color = "blue", face = "bold.italic"),
        strip.background = element_rect(fill = "slategray1"),
        legend.position = "none")

ggboxplot(BRFSS, 
          x = "DIABETE", 
          y = "SLEPTIM1",
          fill = "DIABETE",
          ylab = "Sleep duration (hours)", 
          xlab = "DIABETE status") +
  theme_boxplot

# Compute the analysis of variance
diabete.aov <- aov(SLEPTIM1 ~  DIABETE, data = BRFSS)
# Summary of the analysis
summary(diabete.aov)

##                 Df Sum Sq Mean Sq F value Pr(>F)
## DIABETE          1      1  0.5602   0.333  0.564
## Residuals   279570 469878  1.6807

As the p-value is higher than the significance level 0.05, we can conclude that there are NO significant differences between the groups

TukeyHSD(diabete.aov)

##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = SLEPTIM1 ~ DIABETE, data = BRFSS)
## 
## $DIABETE
##               diff          lwr        upr     p adj
## Yes-No 0.004077576 -0.009764728 0.01791988 0.5637007

# 1. Homogeneity of variances
plot(diabete.aov, 1)

In the plot above, there is no evident relationships between residuals and fitted values (the mean of each groups), which is good. So, we can assume the homogeneity of variances.

plot(diabete.aov, 2)

G7. CANCER

ggplot(BRFSS, aes(x = CANCER)) +
  geombar_CHC +
  geomtext_CHC +
  scale_y_continuous(labels = percent)+
  labs(x = "DEPRESSION?",
       y = "Percent of Respondents",
        caption = "Data source: BRFSS2018") +
  theme_bar

G7. STROKE

ggplot(BRFSS, aes(x = STROKE)) +
  geombar_CHC +
  geomtext_CHC +
  scale_y_continuous(labels = percent)+
  labs(x = "STROKE?",
       y = "Percent of Respondents",
        caption = "Data source: BRFSS2018") +
  theme_bar

ChronicHC <- data.frame(
  CHC = c("ARTHRITIS",
         'ASTHMA',
          "CANCER",
          "DEPRESSION",
          "DIABETE",
          "KIDNEY",
         "PLUMOND", 
          "STROKE",
         "MICHD"),
  Percentage = c(33.40, 13.82, 16.97, 18.81, 14.02, 3.59, 7.95, 3.94, 8.80 ))

ChronicHC

##          CHC Percentage
## 1  ARTHRITIS      33.40
## 2     ASTHMA      13.82
## 3     CANCER      16.97
## 4 DEPRESSION      18.81
## 5    DIABETE      14.02
## 6     KIDNEY       3.59
## 7    PLUMOND       7.95
## 8     STROKE       3.94
## 9      MICHD       8.80

col8 <-  c("#0072BB", "#FF4C3B","#FFD034","#51A39D","#06425C", "#C6C8CA","#814374", "red", "salmon")

ggplot(data = ChronicHC, aes(x= reorder(CHC, -Percentage),
                             y = Percentage,
                             fill =  as.factor(Percentage))) + 
  geom_bar(stat = "identity",
           width = 0.60) +
  geom_text(aes(label = paste0(Percentage,"%")), 
              vjust = -0.25,
              size = 4.5,
              color = "black") +
   scale_fill_manual(values = col8) +
  theme(legend.position="none") +
    labs(x = "Chronic health condition",
       y = "Percent of Respondents (%)",
        caption = "Data source: BRFSS2018")+
  theme_bar

MICHD*

ggplot(BRFSS, aes(x = MICHD)) +
  geombar_CHC +
  geomtext_CHC +
  scale_y_continuous(labels = percent)+
  labs(x = "DEPRESSION?",
       y = "Percent of Respondents",
        caption = "Data source: BRFSS2018") +
  theme_bar

df_CHC <- data.frame(
  ChronicHLTHcond = c("CANCER","CANCER",
                      "DIABETE","DIABETE",
                      "KIDNEY","KIDNEY",
                      "DEPRESSION","DEPRESSION",
                      "ASTHMA","ASTHMA",
                      "PULMOND","PULMOND",
                      "ARTHRITIS","ARTHRITIS",
                      "STROKE","STROKE",
                      "MICHD","MICHD"),
  Status = rep(c("NO", "YES"),9),
  Percentage = c(83, 17, 
                 86, 14, 
                 96, 4,
                 81, 19,
                 86, 14,
                 92, 8,
                 67, 33,
                 96, 4,
                 91, 9))
df_CHC %>% 
  pander()

ChronicHLTHcond	Status	Percentage
CANCER	NO	83
CANCER	YES	17
DIABETE	NO	86
DIABETE	YES	14
KIDNEY	NO	96
KIDNEY	YES	4
DEPRESSION	NO	81
DEPRESSION	YES	19
ASTHMA	NO	86
ASTHMA	YES	14
PULMOND	NO	92
PULMOND	YES	8
ARTHRITIS	NO	67
ARTHRITIS	YES	33
STROKE	NO	96
STROKE	YES	4
MICHD	NO	91
MICHD	YES	9

df2_CHC <- df_CHC %>% 
  mutate(lab_ypos = cumsum(Percentage/10) - 0.5* Percentage/10) 

df2_CHC

##    ChronicHLTHcond Status Percentage lab_ypos
## 1           CANCER     NO         83     4.15
## 2           CANCER    YES         17     9.15
## 3          DIABETE     NO         86    14.30
## 4          DIABETE    YES         14    19.30
## 5           KIDNEY     NO         96    24.80
## 6           KIDNEY    YES          4    29.80
## 7       DEPRESSION     NO         81    34.05
## 8       DEPRESSION    YES         19    39.05
## 9           ASTHMA     NO         86    44.30
## 10          ASTHMA    YES         14    49.30
## 11         PULMOND     NO         92    54.60
## 12         PULMOND    YES          8    59.60
## 13       ARTHRITIS     NO         67    63.35
## 14       ARTHRITIS    YES         33    68.35
## 15          STROKE     NO         96    74.80
## 16          STROKE    YES          4    79.80
## 17           MICHD     NO         91    84.55
## 18           MICHD    YES          9    89.55

ggplot(df2_CHC, 
       aes(x = ChronicHLTHcond,
           y = Percentage,
           fill = Status)) + 
    geom_bar(position = "dodge", 
             stat = "identity")+
    geom_text(aes(label = paste0(Percentage,"%")), 
              position = position_dodge(width=0.9), 
              vjust = -0.25,
              size = 5,
              color = "black") +
  scale_fill_manual(values=c("#5F9EA0", "#E1B378")) +
  labs(x = "Chronic Healthe Condition",
       y = "Percent of Respondents (%)",
        caption = "Data source: BRFSS2018") +
theme_bar

BRFSS %>% 
  group_by(CANCER, KIDNEY, ASTHMA, ARTHRITIS, PULMOND, DEPRESSION, DIABETE, STROKE, MICHD) %>% 
  dplyr::count() %>% 
  head(20)

## # A tibble: 20 x 10
## # Groups:   CANCER, KIDNEY, ASTHMA, ARTHRITIS, PULMOND, DEPRESSION, DIABETE,
## #   STROKE, MICHD [20]
##    CANCER KIDNEY ASTHMA ARTHRITIS PULMOND DEPRESSION DIABETE STROKE MICHD      n
##    <fct>  <fct>  <fct>  <fct>     <fct>   <fct>      <fct>   <fct>  <fct>  <int>
##  1 No     No     No     No        No      No         No      No     No    105809
##  2 No     No     No     No        No      No         No      No     Yes     2988
##  3 No     No     No     No        No      No         No      Yes    No      1080
##  4 No     No     No     No        No      No         No      Yes    Yes      340
##  5 No     No     No     No        No      No         Yes     No     No      8738
##  6 No     No     No     No        No      No         Yes     No     Yes     1000
##  7 No     No     No     No        No      No         Yes     Yes    No       350
##  8 No     No     No     No        No      No         Yes     Yes    Yes      161
##  9 No     No     No     No        No      Yes        No      No     No     15038
## 10 No     No     No     No        No      Yes        No      No     Yes      445
## 11 No     No     No     No        No      Yes        No      Yes    No       247
## 12 No     No     No     No        No      Yes        No      Yes    Yes       64
## 13 No     No     No     No        No      Yes        Yes     No     No      1490
## 14 No     No     No     No        No      Yes        Yes     No     Yes      170
## 15 No     No     No     No        No      Yes        Yes     Yes    No        84
## 16 No     No     No     No        No      Yes        Yes     Yes    Yes       51
## 17 No     No     No     No        Yes     No         No      No     No      1887
## 18 No     No     No     No        Yes     No         No      No     Yes      383
## 19 No     No     No     No        Yes     No         No      Yes    No        71
## 20 No     No     No     No        Yes     No         No      Yes    Yes       55

106927 people are CHC free 279966 - 106927 = 173039 (61.8%) people have at least one chronic health condition 43 people have all the CHCs (8)

df3_CHC <- BRFSS %>% 
  mutate(Cancer =  case_when(CANCER == "Yes" ~ 1,
                             CANCER == "No" ~ 0),
         Kidney =  case_when(KIDNEY == "Yes" ~ 1,
                             KIDNEY  == "No" ~ 0),
         Asthma =  case_when(ASTHMA == "Yes" ~ 1,
                             ASTHMA == "No" ~ 0),
         Arthritis = case_when(ARTHRITIS == "Yes" ~ 1,
                                ARTHRITIS == "No" ~ 0),
         Pulmond =  case_when(PULMOND == "Yes" ~ 1,
                              PULMOND == "No" ~ 0),
         Depression =  case_when(DEPRESSION == "Yes" ~ 1,
                                 DEPRESSION == "No" ~ 0),
         Diabete =  case_when(DIABETE == "Yes" ~ 1,
                              DIABETE == "No" ~ 0),
         Stroke = case_when(STROKE == "Yes" ~ 1,
                            STROKE == "No" ~ 0),
         Michd =  case_when(MICHD == "Yes" ~ 1,
                            MICHD == "No" ~ 0),
         )

Take only column with numeric values

df4_CHC <- df3_CHC %>% 
  select(Cancer, Kidney, Asthma, Arthritis, Pulmond, Depression, Diabete, Stroke,Michd)

df5_CHC <-  df4_CHC %>% 
  group_by(Cancer, Kidney, Asthma, Arthritis, Pulmond, Depression, Diabete, Stroke, Michd) %>% 
  dplyr::count(name = "subtotal_case")

head(df5_CHC,30)

## # A tibble: 30 x 10
## # Groups:   Cancer, Kidney, Asthma, Arthritis, Pulmond, Depression, Diabete,
## #   Stroke, Michd [30]
##    Cancer Kidney Asthma Arthritis Pulmond Depression Diabete Stroke Michd
##     <dbl>  <dbl>  <dbl>     <dbl>   <dbl>      <dbl>   <dbl>  <dbl> <dbl>
##  1      0      0      0         0       0          0       0      0     0
##  2      0      0      0         0       0          0       0      0     1
##  3      0      0      0         0       0          0       0      1     0
##  4      0      0      0         0       0          0       0      1     1
##  5      0      0      0         0       0          0       1      0     0
##  6      0      0      0         0       0          0       1      0     1
##  7      0      0      0         0       0          0       1      1     0
##  8      0      0      0         0       0          0       1      1     1
##  9      0      0      0         0       0          1       0      0     0
## 10      0      0      0         0       0          1       0      0     1
## # ... with 20 more rows, and 1 more variable: subtotal_case <int>

tail(df5_CHC)

## # A tibble: 6 x 10
## # Groups:   Cancer, Kidney, Asthma, Arthritis, Pulmond, Depression, Diabete,
## #   Stroke, Michd [6]
##   Cancer Kidney Asthma Arthritis Pulmond Depression Diabete Stroke Michd
##    <dbl>  <dbl>  <dbl>     <dbl>   <dbl>      <dbl>   <dbl>  <dbl> <dbl>
## 1      1      1      1         1       1          1       0      1     0
## 2      1      1      1         1       1          1       0      1     1
## 3      1      1      1         1       1          1       1      0     0
## 4      1      1      1         1       1          1       1      0     1
## 5      1      1      1         1       1          1       1      1     0
## 6      1      1      1         1       1          1       1      1     1
## # ... with 1 more variable: subtotal_case <int>

df5_CHC$NbofCHC <- rowSums(df5_CHC[,c(1:9)])
df6_CHC <- df5_CHC

df6_CHC

## # A tibble: 507 x 11
## # Groups:   Cancer, Kidney, Asthma, Arthritis, Pulmond, Depression, Diabete,
## #   Stroke, Michd [507]
##    Cancer Kidney Asthma Arthritis Pulmond Depression Diabete Stroke Michd
##     <dbl>  <dbl>  <dbl>     <dbl>   <dbl>      <dbl>   <dbl>  <dbl> <dbl>
##  1      0      0      0         0       0          0       0      0     0
##  2      0      0      0         0       0          0       0      0     1
##  3      0      0      0         0       0          0       0      1     0
##  4      0      0      0         0       0          0       0      1     1
##  5      0      0      0         0       0          0       1      0     0
##  6      0      0      0         0       0          0       1      0     1
##  7      0      0      0         0       0          0       1      1     0
##  8      0      0      0         0       0          0       1      1     1
##  9      0      0      0         0       0          1       0      0     0
## 10      0      0      0         0       0          1       0      0     1
## # ... with 497 more rows, and 2 more variables: subtotal_case <int>,
## #   NbofCHC <dbl>

All the people living with at one chronic health condition

df6_CHC %>% 
  filter(NbofCHC == 1)

## # A tibble: 9 x 11
## # Groups:   Cancer, Kidney, Asthma, Arthritis, Pulmond, Depression, Diabete,
## #   Stroke, Michd [9]
##   Cancer Kidney Asthma Arthritis Pulmond Depression Diabete Stroke Michd
##    <dbl>  <dbl>  <dbl>     <dbl>   <dbl>      <dbl>   <dbl>  <dbl> <dbl>
## 1      0      0      0         0       0          0       0      0     1
## 2      0      0      0         0       0          0       0      1     0
## 3      0      0      0         0       0          0       1      0     0
## 4      0      0      0         0       0          1       0      0     0
## 5      0      0      0         0       1          0       0      0     0
## 6      0      0      0         1       0          0       0      0     0
## 7      0      0      1         0       0          0       0      0     0
## 8      0      1      0         0       0          0       0      0     0
## 9      1      0      0         0       0          0       0      0     0
## # ... with 2 more variables: subtotal_case <int>, NbofCHC <dbl>

df6_CHC %>% 
  group_by(NbofCHC) %>% 
  dplyr::summarise(totalcase = sum(subtotal_case))

## # A tibble: 10 x 2
##    NbofCHC totalcase
##      <dbl>     <int>
##  1       0    105809
##  2       1     81744
##  3       2     48701
##  4       3     24441
##  5       4     11351
##  6       5      4861
##  7       6      1886
##  8       7       617
##  9       8       135
## 10       9        27

df7_CHC <- df6_CHC %>% 
  group_by(NbofCHC) %>% 
  dplyr::summarise(Total_case = sum(subtotal_case))

df7_CHC

## # A tibble: 10 x 2
##    NbofCHC Total_case
##      <dbl>      <int>
##  1       0     105809
##  2       1      81744
##  3       2      48701
##  4       3      24441
##  5       4      11351
##  6       5       4861
##  7       6       1886
##  8       7        617
##  9       8        135
## 10       9         27

ggplot(data = df7_CHC,
           aes(x = NbofCHC, 
           y  = Total_case,
           fill =  Total_case)) +
  geom_bar(stat = "identity") +
  scale_fill_gradient(low = "darkred", high = "yellow")+
  geom_text(aes(label = Total_case), 
            vjust = -0.5, 
            color = "darkblue", 
            size = 4) +
  theme_minimal() +
  scale_y_continuous(limits = c(0,120000),
                     breaks = seq(0,120000, by = 20000))+
  scale_x_continuous(breaks = seq(0,9,by = 1))+
  labs(x = "Number of chronic health conditions",
       y = "Frequency",
       caption = "Data source: BRFSS2018") +
  theme_bar

df8_CHC <- df7_CHC %>% 
  mutate(Percentage = round(Total_case*100/sum(Total_case),2))

df8_CHC

## # A tibble: 10 x 3
##    NbofCHC Total_case Percentage
##      <dbl>      <int>      <dbl>
##  1       0     105809      37.8 
##  2       1      81744      29.2 
##  3       2      48701      17.4 
##  4       3      24441       8.74
##  5       4      11351       4.06
##  6       5       4861       1.74
##  7       6       1886       0.67
##  8       7        617       0.22
##  9       8        135       0.05
## 10       9         27       0.01

ggplot(data = df8_CHC,
           aes(x = NbofCHC, 
               y  = Percentage,
               fill = Percentage)) +
  scale_fill_gradient(low = "darkred", high = "yellow")+
  geom_bar(stat = "identity")+
  geom_text(aes(label = paste0(Percentage,"%", "\n", Total_case)), 
            vjust = -0.1, 
            color = "darkblue", 
            size = 4.5) +
  scale_x_continuous(breaks = seq(0,9,by = 1))+
  scale_y_continuous(limits = c(0,40)) +
  theme_minimal() +
  labs(x = "Number of chronic health conditions",
       y = "Percentage (%)",
       caption = "Data source: BRFSS2018") +
  theme(axis.text = element_text(size=12),
        axis.title = element_text(size=12,face="bold"),
        plot.caption = element_text(color = "grey44",size = 12, face = "italic"),
        legend.title = element_text(color = "blue", size = 12),
        legend.text = element_text(size=12),
        legend.position = "non")

Among American aged 18 and more, almost 1 out of every 3 (~ 33%) adults were overweight or/and obese while 1.6% were underweight, based on their measured BMI.

SLEEPTIM (numeric attribute and chronic health condtion) A. Sleep duration and ASTHMA

ggplot(BRFSS, aes(x= SLEPTIM1 , group = ASTHMA)) + 
    geom_bar(aes(y = ..prop.., fill = factor(..x..)), 
         stat="count",width = 1) +
    labs(y = "Percent of respondents", fill="MICHD") +
    facet_grid(~CANCER) +
    scale_y_continuous(labels = scales::percent)+
  themebar_facet

sleptime_asthma <- ggplot(BRFSS, 
       aes(x = ASTHMA, 
           y = SLEPTIM1,
           fill = ASTHMA)) +
  geom_boxplot() +
  scale_fill_hue(l = 40, c = 35)+
  labs( x = "Cancer status",
        y = "Sleep duration (hours)",
        caption = "Data source: BRFSS2018")

sleptime_asthma + theme_boxplot

Assumption 3. Do the two populations have the same variances? We’ll use F-test to test for homogeneity in variances. This can be performed with the function var.test() as follow:

res.ftest_ASTHMA <- var.test(SLEPTIM1 ~ ASTHMA, data = BRFSS)
res.ftest_ASTHMA

## 
##  F test to compare two variances
## 
## data:  SLEPTIM1 by ASTHMA
## F = 0.73867, num df = 240911, denom df = 38659, p-value < 2.2e-16
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
##  0.7275073 0.7499429
## sample estimates:
## ratio of variances 
##          0.7386712

res.ftest_ARTHRITIS <- var.test(SLEPTIM1 ~ ARTHRITIS, data = BRFSS, alternative = "two.sided")
res.ftest_ARTHRITIS

## 
##  F test to compare two variances
## 
## data:  SLEPTIM1 by ARTHRITIS
## F = 0.6884, num df = 186212, denom df = 93358, p-value < 2.2e-16
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
##  0.6807799 0.6960839
## sample estimates:
## ratio of variances 
##          0.6883965

res.ftest_CANCER <- var.test(SLEPTIM1 ~ CANCER, data = BRFSS, alternative = "two.sided")
res.ftest_CANCER

## 
##  F test to compare two variances
## 
## data:  SLEPTIM1 by CANCER
## F = 0.90813, num df = 232122, denom df = 47448, p-value < 2.2e-16
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
##  0.8955097 0.9208742
## sample estimates:
## ratio of variances 
##           0.908133

res.ftest_DEPRESSION <- var.test(SLEPTIM1 ~ DEPRESSION, data = BRFSS, alternative = "two.sided")
res.ftest_DEPRESSION

## 
##  F test to compare two variances
## 
## data:  SLEPTIM1 by DEPRESSION
## F = 0.57769, num df = 226976, denom df = 52594, p-value < 2.2e-16
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
##  0.5699777 0.5854759
## sample estimates:
## ratio of variances 
##          0.5776913

res.ftest_DIABETE <- var.test(SLEPTIM1 ~ DIABETE, data = BRFSS, alternative = "two.sided")
res.ftest_DIABETE

## 
##  F test to compare two variances
## 
## data:  SLEPTIM1 by DIABETE
## F = 0.6913, num df = 240382, denom df = 39188, p-value < 2.2e-16
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
##  0.6809148 0.7017925
## sample estimates:
## ratio of variances 
##          0.6913035

res.ftest_KIDNEY <- var.test(SLEPTIM1 ~ KIDNEY, data = BRFSS, alternative = "two.sided")
res.ftest_KIDNEY

## 
##  F test to compare two variances
## 
## data:  SLEPTIM1 by KIDNEY
## F = 0.62827, num df = 269521, denom df = 10049, p-value < 2.2e-16
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
##  0.6107080 0.6460933
## sample estimates:
## ratio of variances 
##          0.6282687

res.ftest_PULMOND <- var.test(SLEPTIM1 ~ PULMOND, data = BRFSS, alternative = "two.sided")
res.ftest_PULMOND

## 
##  F test to compare two variances
## 
## data:  SLEPTIM1 by PULMOND
## F = 0.54672, num df = 257356, denom df = 22214, p-value < 2.2e-16
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
##  0.5361798 0.5573743
## sample estimates:
## ratio of variances 
##          0.5467181

res.ftest_STROKE <- var.test(SLEPTIM1 ~ STROKE, data = BRFSS,  alternative = "two.sided")
res.ftest_STROKE

## 
##  F test to compare two variances
## 
## data:  SLEPTIM1 by STROKE
## F = 0.55674, num df = 268546, denom df = 11024, p-value < 2.2e-16
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
##  0.5418525 0.5718449
## sample estimates:
## ratio of variances 
##           0.556741

res.ftest_MICHD <- var.test(SLEPTIM1 ~ MICHD, data = BRFSS)
res.ftest_MICHD

## 
##  F test to compare two variances
## 
## data:  SLEPTIM1 by MICHD
## F = 0.65222, num df = 254977, denom df = 24593, p-value < 2.2e-16
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
##  0.6402132 0.6643555
## sample estimates:
## ratio of variances 
##          0.6522193

t.test(SLEPTIM1 ~ ASTHMA, data = BRFSS)

## 
##  Welch Two Sample t-test
## 
## data:  SLEPTIM1 by ASTHMA
## t = 25.728, df = 48258, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  0.1880149 0.2190244
## sample estimates:
##  mean in group No mean in group Yes 
##          7.046199          6.842680

All the F-test have p < 0.01 ==> reject the Null Hypotheses == accept the alternative hypothesis == There are difference between two variances of two groups. var.equal will set to be FALSE

t.test(SLEPTIM1 ~ ARTHRITIS, data = BRFSS, var.equal = FALSE)

## 
##  Welch Two Sample t-test
## 
## data:  SLEPTIM1 by ARTHRITIS
## t = 12.489, df = 159401, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  0.05818593 0.07984851
## sample estimates:
##  mean in group No mean in group Yes 
##          7.041103          6.972086

t.test(SLEPTIM1 ~ CANCER, data = BRFSS, var.equal = FALSE)

## 
##  Welch Two Sample t-test
## 
## data:  SLEPTIM1 by CANCER
## t = -25.076, df = 66232, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.1820817 -0.1556816
## sample estimates:
##  mean in group No mean in group Yes 
##          6.989394          7.158275

t.test(SLEPTIM1 ~ DEPRESSION, data = BRFSS, var.equal = FALSE)

## 
##  Welch Two Sample t-test
## 
## data:  SLEPTIM1 by DEPRESSION
## t = 25.771, df = 67338, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  0.1765140 0.2055738
## sample estimates:
##  mean in group No mean in group Yes 
##          7.053997          6.862953

t.test(SLEPTIM1 ~ DIABETE, data = BRFSS, var.equal = FALSE)

## 
##  Welch Two Sample t-test
## 
## data:  SLEPTIM1 by DIABETE
## t = -0.5059, df = 48419, p-value = 0.6129
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.01987534  0.01172019
## sample estimates:
##  mean in group No mean in group Yes 
##          7.017485          7.021562

t.test(SLEPTIM1 ~ KIDNEY, alternative = "two.sided", var.equal = FALSE, data = BRFSS)

## 
##  Welch Two Sample t-test
## 
## data:  SLEPTIM1 by KIDNEY
## t = 0.46426, df = 10525, p-value = 0.6425
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.02443215  0.03959710
## sample estimates:
##  mean in group No mean in group Yes 
##          7.018329          7.010746

t.test(SLEPTIM1 ~ PULMOND, alternative = "two.sided", var.equal = FALSE, data = BRFSS)

## 
##  Welch Two Sample t-test
## 
## data:  SLEPTIM1 by PULMOND
## t = 14.65, df = 24355, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  0.1478766 0.1935567
## sample estimates:
##  mean in group No mean in group Yes 
##          7.031621          6.860905

t.test(SLEPTIM1 ~ STROKE, alternative = "two.sided", var.equal = FALSE, data = BRFSS)

## 
##  Welch Two Sample t-test
## 
## data:  SLEPTIM1 by STROKE
## t = -3.1169, df = 11533, p-value = 0.001832
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.08366195 -0.01906121
## sample estimates:
##  mean in group No mean in group Yes 
##          7.016031          7.067392

t.test(SLEPTIM1 ~ MICHD, alternative = "two.sided", var.equal = FALSE, data = BRFSS)

## 
##  Welch Two Sample t-test
## 
## data:  SLEPTIM1 by MICHD
## t = -2.5889, df = 27774, p-value = 0.009634
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.046917246 -0.006485714
## sample estimates:
##  mean in group No mean in group Yes 
##          7.015707          7.042409

res.ftest <- var.test(SLEPTIM1 ~ CANCER, data = BRFSS)
res.ftest

## 
##  F test to compare two variances
## 
## data:  SLEPTIM1 by CANCER
## F = 0.90813, num df = 232122, denom df = 47448, p-value < 2.2e-16
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
##  0.8955097 0.9208742
## sample estimates:
## ratio of variances 
##           0.908133

There are significant difference between the variance of two sets of dsata. Therefore, we will use t-test with unequal variance.

ggplot(BRFSS, 
       aes(x = PULMOND, 
           y = SLEPTIM1)) +
  geom_violin(fill = "cornflowerblue") +
  geom_boxplot(width = .2, 
               fill = "orange",
               outlier.color = "orange",
               outlier.size = 2) +
  labs(title = "Sleep duration distribution by PULMOND status")

ggplot(BRFSS, 
       aes(x = MICHD, 
           y = SLEPTIM1, fill = MICHD)) +
  geom_violin(trim = FALSE,fill = "cornflowerblue") +
  geom_boxplot(width = 0.1, 
               fill = "grey",
               outlier.color = "grey",
               outlier.size = 2) +
  labs(title = "Sleep duration distribution by MICHD status")

library(ggforce)

Numerical Data Descriptive Statistics

ggplot(data = BRFSS, aes(SLEPTIM1)) + 
  geom_bar(stat= "count", 
                 fill= "olivedrab4") +  
  coord_cartesian(clip = "off") +
  labs( x = "Sleep duration (hours)",
        y = "Frequency",
        subtitle = "On average, how many hours of sleep do you get in a 24-hour period?",
        caption = "Data source: BRFSS2018") + 
  scale_y_continuous(limits = c(0, 100000), expand = c(0, 0)) +
  scale_x_continuous(limits = c(0, 15), expand = c(0, 0)) +
  theme_bar

****BIVARIATE analysis 1****

ggplot(BRFSS, aes(x= SLEPTIM1 , group = CANCER)) + 
    geom_bar(aes(y = ..prop.., fill = factor(..x..)), 
         stat="count",width = 1) +
    labs(y = "Percent of respondents", fill="MICHD") +
    facet_grid(~CANCER) +
    scale_y_continuous(labels = scales::percent)+
  themebar_facet

Boxplot

library(cowplot)

## 
## ********************************************************

## Note: As of version 1.0.0, cowplot does not change the

##   default ggplot2 theme anymore. To recover the previous

##   behavior, execute:
##   theme_set(theme_cowplot())

## ********************************************************

## 
## Attaching package: 'cowplot'

## The following object is masked from 'package:ggpubr':
## 
##     get_legend

## The following object is masked from 'package:ggimage':
## 
##     theme_nothing

library(ggplot2)

theme_boxplot <-   theme(text = element_text(size = 12),
        axis.text = element_text(size = 12),
        axis.title = element_text(size = 12,face="bold"),
        plot.caption = element_text(color = "grey",size = 12, face = "italic"),
        strip.text = element_text(size = 12, color = "blue", face = "bold.italic"),
        strip.background = element_rect(fill = "slategray1"),
        legend.position = "none")

sleptime_cancer <- ggplot(BRFSS, 
       aes(x = CANCER, 
           y = SLEPTIM1,
           fill = CANCER)) +
  geom_boxplot() +
  scale_fill_hue(l = 40, c = 35)+
  labs( x = "Cancer status",
        y = "Sleep duration (hours)",
        caption = "Data source: BRFSS2018")

sleptime_cancer + theme_boxplot

BRFSS %>% 
  summarise(mean = mean(SLEPTIM1),
            std = sd(SLEPTIM1),
            median = median(SLEPTIM1))

##       mean      std median
## 1 7.018056 1.296423      7

Compare two means sample size

BRFSS %>% 
  group_by(CANCER) %>% 
  dplyr::count()

## # A tibble: 2 x 2
## # Groups:   CANCER [2]
##   CANCER      n
##   <fct>   <int>
## 1 No     232123
## 2 Yes     47449

library(purrr)

BRFSS %>% 
  split(.$CANCER) %>% 
  map(summary) %>% 
  pander()

No:

Table continues below
IDAY	MONTH	Region	AGE	GENDER
Min. : 1.00	Oct : 22689	Midwest :63979	18-29:28992	Female:119985
1st Qu.: 8.00	Mar : 21969	Northeast:47722	30-39:33224	Male :112138
Median :15.00	Nov : 20695	South :68624	40-49:35697	NA
Mean :14.97	Fev : 20539	West :51798	50-59:46174	NA
3rd Qu.:22.00	July : 20391	NA	60-69:49234	NA
Max. :31.00	Sept : 19978	NA	70-79:28095	NA
NA	(Other):105862	NA	80+ :10707	NA

Table continues below
RACE	RELP	EDU	EMPL
AIAN : 4106	In relationship:133394	<8 : 3961	Employed :138489
Asian : 5407	Living apart : 36733	9-12 :69263	Unemployed : 22780
Black : 19836	Never married : 41958	12-15:65214	Occupations: 16620
Hispanic: 17608	Widowed : 20038	16+ :93685	Retired : 54234
White :177976	NA	NA	NA
Others : 7190	NA	NA	NA
NA	NA	NA	NA

Table continues below
INCOME	RENTHOME	COMMUN	ALCpa30
<$20K :33493	Other: 9500	Rural : 34605	0 :100320
$20K-<$35K:42371	Own :163460	Suburban: 37185	1-9 : 84939
$35K-<$50K:31515	Rent : 59163	Urban :160333	10-30: 46864
$50K-<$75K:38889	NA	NA	NA
$75K+ :85855	NA	NA	NA
NA	NA	NA	NA
NA	NA	NA	NA

Table continues below
SMOKESTAT	EXERpa30	SLEPTIM1	GENHLTH
Daily smoker : 25319	No : 50220	Min. : 3.000	Poor : 8402
Someday smoker: 10315	Yes:181903	1st Qu.: 6.000	Fair :26484
Former smoker : 62258	NA	Median : 7.000	Good :70948
Never smoker :134231	NA	Mean : 6.989	Very Good:83001
NA	NA	3rd Qu.: 8.000	Excellent:43288
NA	NA	Max. :12.000	NA
NA	NA	NA	NA

Table continues below
MENTHLTHpa30	PHYSHLTHpa30	HLTHPLN	CHECKUP	MEDICOST
0 :152199	0 :150607	No : 18816	<1 :180709	No :208350
1-9 : 47119	1-9 : 49961	Yes:213307	1-<2: 24684	Yes: 23773
10-30: 32805	10-30: 31555	NA	2-<5: 13149	NA
NA	NA	NA	5+ : 13581	NA
NA	NA	NA	NA	NA
NA	NA	NA	NA	NA
NA	NA	NA	NA	NA

Table continues below
ARTHRITIS	ASTHMA	CANCER	DEPRESSION	DIABETE	KIDNEY
No :162708	No :200840	No :232123	No :189133	No :201941	No :225676
Yes: 69415	Yes: 31283	Yes: 0	Yes: 42990	Yes: 30182	Yes: 6447
NA	NA	NA	NA	NA	NA
NA	NA	NA	NA	NA	NA
NA	NA	NA	NA	NA	NA
NA	NA	NA	NA	NA	NA
NA	NA	NA	NA	NA	NA

Table continues below
PULMOND	STROKE	LASTDENTV	RMVTETH	BMICAT
No :216198	No :224444	<1 :160426	0 :127220	Underweight: 3294
Yes: 15925	Yes: 7679	1-<2: 25272	1-5 : 67111	Normal :69003
NA	NA	2-<5: 21039	>= 6: 24800	Overweight :83349
NA	NA	5+ : 25386	All : 12992	Obese :76477
NA	NA	NA	NA	NA
NA	NA	NA	NA	NA
NA	NA	NA	NA	NA

MICHD
No :215121
Yes: 17002
NA
NA
NA
NA
NA

Yes:

Table continues below
IDAY	MONTH	Region	AGE	GENDER
Min. : 1.0	Oct : 4588	Midwest :12289	18-29: 423	Female:25919
1st Qu.: 7.0	July : 4370	Northeast: 9566	30-39: 1226	Male :21530
Median :14.0	Mar : 4245	South :15232	40-49: 2643	NA
Mean :14.5	Nov : 4238	West :10362	50-59: 7079	NA
3rd Qu.:21.0	Aug : 4226	NA	60-69:13982	NA
Max. :31.0	Sept : 4144	NA	70-79:14693	NA
NA	(Other):21638	NA	80+ : 7403	NA

Table continues below
RACE	RELP	EDU	EMPL
AIAN : 546	In relationship:27619	<8 : 633	Employed :15675
Asian : 263	Living apart : 7998	9-12 :12970	Unemployed : 4705
Black : 1766	Never married : 3191	12-15:13045	Occupations: 1787
Hispanic: 1109	Widowed : 8641	16+ :20801	Retired :25282
White :42785	NA	NA	NA
Others : 980	NA	NA	NA
NA	NA	NA	NA

Table continues below
INCOME	RENTHOME	COMMUN	ALCpa30
<$20K : 6531	Other: 1055	Rural : 7736	0 :22759
$20K-<$35K: 9175	Own :39722	Suburban: 7873	1-9 :14036
$35K-<$50K: 7162	Rent : 6672	Urban :31840	10-30:10654
$50K-<$75K: 8238	NA	NA	NA
$75K+ :16343	NA	NA	NA
NA	NA	NA	NA
NA	NA	NA	NA

Table continues below
SMOKESTAT	EXERpa30	SLEPTIM1	GENHLTH
Daily smoker : 3928	No :12246	Min. : 3.000	Poor : 3787
Someday smoker: 1393	Yes:35203	1st Qu.: 6.000	Fair : 7994
Former smoker :18284	NA	Median : 7.000	Good :15305
Never smoker :23844	NA	Mean : 7.158	Very Good:14825
NA	NA	3rd Qu.: 8.000	Excellent: 5538
NA	NA	Max. :12.000	NA
NA	NA	NA	NA

Table continues below
MENTHLTHpa30	PHYSHLTHpa30	HLTHPLN	CHECKUP	MEDICOST	ARTHRITIS
0 :33279	0 :26807	No : 1247	<1 :42609	No :43986	No :23505
1-9 : 7920	1-9 : 9998	Yes:46202	1-<2: 2891	Yes: 3463	Yes:23944
10-30: 6250	10-30:10644	NA	2-<5: 1026	NA	NA
NA	NA	NA	5+ : 923	NA	NA
NA	NA	NA	NA	NA	NA
NA	NA	NA	NA	NA	NA
NA	NA	NA	NA	NA	NA

Table continues below
ASTHMA	CANCER	DEPRESSION	DIABETE	KIDNEY	PULMOND
No :40072	No : 0	No :37844	No :38442	No :43846	No :41159
Yes: 7377	Yes:47449	Yes: 9605	Yes: 9007	Yes: 3603	Yes: 6290
NA	NA	NA	NA	NA	NA
NA	NA	NA	NA	NA	NA
NA	NA	NA	NA	NA	NA
NA	NA	NA	NA	NA	NA
NA	NA	NA	NA	NA	NA

STROKE	LASTDENTV	RMVTETH	BMICAT	MICHD
No :44103	<1 :35033	0 :19100	Underweight: 716	No :39857
Yes: 3346	1-<2: 3943	1-5 :16015	Normal :14069	Yes: 7592
NA	2-<5: 3450	>= 6: 7968	Overweight :17916	NA
NA	5+ : 5023	All : 4366	Obese :14748	NA
NA	NA	NA	NA	NA
NA	NA	NA	NA	NA
NA	NA	NA	NA	NA

The confident interval does not contain zero == > reject H0 ==> The difference statistically significant (Estimate for difference = 7.14- 6.98 = 0.16 hours= 9.6 minutes)

sleptime_Arthritis <- ggplot(BRFSS, 
                          aes(x = ARTHRITIS, 
                              y = SLEPTIM1,
                              fill = ARTHRITIS)) +
  geom_boxplot() +
  scale_fill_hue(l = 40, c = 35)+
  labs(x = "Arthritis status",
      y = "Sleep duration (hours)",
      caption = "Data source: BRFSS2018") 
 
sleptime_Arthritis +  theme_boxplot

sleptime_pulmond <- ggplot(BRFSS, 
                          aes(x = PULMOND, 
                              y = SLEPTIM1,
                              fill = PULMOND)) +
  geom_boxplot() +
  scale_fill_hue(l = 40, c = 35)+
  labs(x = "Pulmonary disease status",
      y = "Sleep duration (hours)",
      caption = "Data source: BRFSS2018") 
 

sleptime_pulmond +  theme_boxplot

sleptime_ASTHMA <- ggplot(BRFSS, 
                          aes(x = ASTHMA, 
                              y = SLEPTIM1,
                              fill = ASTHMA)) +
  geom_boxplot() +
  scale_fill_hue(l = 40, c = 35)+
  labs(x = "Asthma status",
      y = "Sleep duration (hours)",
      caption = "Data source: BRFSS2018") 
 
sleptime_ASTHMA +  theme_boxplot

sleptime_DEPRESSION <- ggplot(BRFSS, 
                          aes(x = DEPRESSION, 
                              y = SLEPTIM1,
                              fill = DEPRESSION)) +
  geom_boxplot() +
  scale_fill_hue(l = 40, c = 35)+
  labs(x = "Depressive disorder status",
      y = "Sleep duration (hours)",
      caption = "Data source: BRFSS2018") 
 
sleptime_DEPRESSION + theme_boxplot

SLEPTIME and other risk factors (multiple levels)

BRFSS %>% 
  group_by(AGE) %>% 
  dplyr::summarise(count = n(),
              meanslpt =  mean(SLEPTIM1),
            stdslpt = sd(SLEPTIM1))

## # A tibble: 7 x 4
##   AGE   count meanslpt stdslpt
##   <fct> <int>    <dbl>   <dbl>
## 1 18-29 29415     6.92    1.27
## 2 30-39 34450     6.79    1.24
## 3 40-49 38340     6.82    1.25
## 4 50-59 53253     6.87    1.29
## 5 60-69 63216     7.09    1.28
## 6 70-79 42788     7.32    1.28
## 7 80+   18110     7.50    1.40

ggboxplot(BRFSS, 
          x = "AGE", 
          y = "SLEPTIM1",
          fill = "AGE",
          ylab = "Sleep duration (hours)", 
          xlab = "Age group") +
  theme_boxplot

# Compute the analysis of variance
age.aov <- aov(SLEPTIM1 ~ AGE, data = BRFSS)
# Summary of the analysis
summary(age.aov)

##                 Df Sum Sq Mean Sq F value Pr(>F)    
## AGE              6  13090  2181.7    1335 <2e-16 ***
## Residuals   279565 456789     1.6                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

As the p-value is less than the significance level 0.05, we can conclude that there are significant differences between the groups

TukeyHSD(age.aov)

##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = SLEPTIM1 ~ AGE, data = BRFSS)
## 
## $AGE
##                    diff          lwr         upr     p adj
## 30-39-18-29 -0.13357706 -0.163495798 -0.10365832 0.0000000
## 40-49-18-29 -0.09810146 -0.127312774 -0.06889015 0.0000000
## 50-59-18-29 -0.05324172 -0.080619810 -0.02586364 0.0000002
## 60-69-18-29  0.16682431  0.140224952  0.19342366 0.0000000
## 70-79-18-29  0.39362814  0.365083602  0.42217268 0.0000000
## 80+-18-29    0.58132152  0.545724945  0.61691810 0.0000000
## 40-49-30-39  0.03547560  0.007498282  0.06345291 0.0035015
## 50-59-30-39  0.08033534  0.054277917  0.10639276 0.0000000
## 60-69-30-39  0.30040137  0.275163396  0.32563934 0.0000000
## 70-79-30-39  0.52720520  0.499924801  0.55448560 0.0000000
## 80+-30-39    0.71489858  0.680307459  0.74948971 0.0000000
## 50-59-40-49  0.04485974  0.019617732  0.07010175 0.0000033
## 60-69-40-49  0.26492577  0.240530586  0.28932095 0.0000000
## 70-79-40-49  0.49172961  0.465226962  0.51823225 0.0000000
## 80+-40-49    0.67942299  0.645441878  0.71340410 0.0000000
## 60-69-50-59  0.22006603  0.197898848  0.24223321 0.0000000
## 70-79-50-59  0.44686987  0.422402559  0.47133717 0.0000000
## 80+-50-59    0.63456325  0.602144507  0.66698199 0.0000000
## 70-79-60-69  0.22680384  0.203211145  0.25039653 0.0000000
## 80+-60-69    0.41449722  0.382733390  0.44626104 0.0000000
## 80+-70-79    0.18769338  0.154283715  0.22110305 0.0000000

# 1. Homogeneity of variances
plot(age.aov, 1)

In the plot above, there is no evident relationships between residuals and fitted values (the mean of each groups), which is good. So, we can assume the homogeneity of variances.

plot(age.aov, 2)

SLEPTIM and GENDER

ggboxplot(BRFSS, 
          x = "GENDER", 
          y = "SLEPTIM1",
          fill = "GENDER",
          ylab = "Sleep duration (hours)", 
          xlab = "Gender") +
 theme_boxplot

t.test(BRFSS$SLEPTIM1~BRFSS$GENDER)

## 
##  Welch Two Sample t-test
## 
## data:  BRFSS$SLEPTIM1 by BRFSS$GENDER
## t = 7.4379, df = 278554, p-value = 1.026e-13
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  0.02685357 0.04606973
## sample estimates:
## mean in group Female   mean in group Male 
##             7.035489             6.999027

SLEPTIME and INCOME

ggboxplot(BRFSS, 
          x = "INCOME", 
          y = "SLEPTIM1",
          fill = "INCOME",
          ylab = "Sleep duration (hours)", 
          xlab = "Income range") +
  theme_boxplot

BRFSS %>% 
  group_by(INCOME) %>% 
  dplyr::summarise(count = n(),
              meanslpt =  mean(SLEPTIM1),
            stdslpt = sd(SLEPTIM1)) %>% 
  pander()

INCOME	count	meanslpt	stdslpt
<$20K	40024	6.934	1.678
$20K-<$35K	51546	7.023	1.435
$35K-<$50K	38677	7.042	1.289
$50K-<$75K	47127	7.04	1.193
$75K+	102198	7.03	1.081

# Compute the analysis of variance
income.aov <- aov(SLEPTIM1 ~ INCOME, data = BRFSS)
# Summary of the analysis
summary(income.aov)

##                 Df Sum Sq Mean Sq F value Pr(>F)    
## INCOME           4    342   85.50   50.91 <2e-16 ***
## Residuals   279567 469537    1.68                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

H0 : There is no difference between group Ha : A least one of them is different p < 0.01 ==> reject the null hypothesis = accept the Ha

TukeyHSD(income.aov)

##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = SLEPTIM1 ~ INCOME, data = BRFSS)
## 
## $INCOME
##                               diff          lwr         upr     p adj
## $20K-<$35K-<$20K       0.088680936  0.065129401 0.112232470 0.0000000
## $35K-<$50K-<$20K       0.107635460  0.082429467 0.132841454 0.0000000
## $50K-<$75K-<$20K       0.105910131  0.081880817 0.129939446 0.0000000
## $75K+-<$20K            0.095806543  0.074961525 0.116651561 0.0000000
## $35K-<$50K-$20K-<$35K  0.018954524 -0.004826748 0.042735797 0.1894985
## $50K-<$75K-$20K-<$35K  0.017229196 -0.005301112 0.039759503 0.2260634
## $75K+-$20K-<$35K       0.007125607 -0.011972078 0.026223293 0.8473075
## $50K-<$75K-$35K-<$50K -0.001725329 -0.025979856 0.022529198 0.9996844
## $75K+-$35K-<$50K      -0.011828917 -0.032933157 0.009275322 0.5435353
## $75K+-$50K-<$75K      -0.010103589 -0.029787459 0.009580282 0.6275639

***RACE and SLEPTIME

BRFSS %>% 
  group_by(RACE) %>% 
  dplyr::summarise(count = n(),
              meanslpt =  mean(SLEPTIM1),
            stdslpt = sd(SLEPTIM1)) %>% 
  pander()

RACE	count	meanslpt	stdslpt
AIAN	4652	6.922	1.545
Asian	5670	6.768	1.175
Black	21602	6.836	1.548
Hispanic	18717	6.92	1.33
White	220761	7.062	1.252
Others	8170	6.779	1.47

ggboxplot(BRFSS, 
          x = "RACE", 
          y = "SLEPTIM1",
          fill = "RACE",
          ylab = "Sleep duration (hours)", 
          xlab = "Race") +
  theme_boxplot

# Compute the analysis of variance
race.aov <- aov(SLEPTIM1 ~ RACE, data = BRFSS)
# Summary of the analysis
summary(race.aov)

##                 Df Sum Sq Mean Sq F value Pr(>F)    
## RACE             5   2179   435.8   260.5 <2e-16 ***
## Residuals   279566 467700     1.7                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

TukeyHSD(race.aov)

##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = SLEPTIM1 ~ RACE, data = BRFSS)
## 
## $RACE
##                         diff         lwr          upr     p adj
## Asian-AIAN      -0.153500116 -0.22641429 -0.080585946 0.0000000
## Black-AIAN      -0.086137012 -0.14571314 -0.026560881 0.0005409
## Hispanic-AIAN   -0.001895133 -0.06227936  0.058489090 0.9999992
## White-AIAN       0.139760404  0.08515321  0.194367596 0.0000000
## Others-AIAN     -0.142439519 -0.21013948 -0.074739561 0.0000000
## Black-Asian      0.067363104  0.01236327  0.122362938 0.0064160
## Hispanic-Asian   0.151604984  0.09573083  0.207479134 0.0000000
## White-Asian      0.293260520  0.24368622  0.342834824 0.0000000
## Others-Asian     0.011060597 -0.05264930  0.074770491 0.9963733
## Hispanic-Black   0.084241879  0.04743482  0.121048942 0.0000000
## White-Black      0.225897416  0.19962098  0.252173848 0.0000000
## Others-Black    -0.056302507 -0.10417515 -0.008429866 0.0104282
## White-Hispanic   0.141655537  0.11359507  0.169716004 0.0000000
## Others-Hispanic -0.140544386 -0.18941901 -0.091669764 0.0000000
## Others-White    -0.282199923 -0.32372606 -0.240673790 0.0000000

****Attribute and the outcome (MICHD)****

geombar_facet <- geom_bar(aes(y = ..prop.., 
                              fill = factor(..x..)), 
                          stat="count",width = 1)
geomtext_facet <- geom_text(size = 4, 
                            aes(label = scales::percent(..prop..),
                                y= ..prop.. ), 
                            stat= "count", vjust = -.25)

A1 REGION

ggplot(BRFSS, aes(x = MICHD , group = Region)) + 
    geombar_facet +
    geomtext_facet +
  scale_fill_manual(values = c("gainsboro", "peachpuff4")) +
    labs(y = "Percent of respondents", fill = "MICHD") +
    facet_grid(~Region) +
    scale_y_continuous(labels = scales::percent)+
  themebar_facet

Demographic B1.AGE and MICHD

ggplot(BRFSS, aes(x = MICHD , group = AGE)) + 
    geombar_facet +
    geomtext_facet +
  scale_fill_manual(values = c("gainsboro", "royalblue4")) +
    labs(y = "Percent of respondents", fill = "MICHD") +
    facet_grid(~ AGE) +
    scale_y_continuous(labels = scales::percent)+
  themebar_facet

MICHDage <- data.frame(
  age_group = c("18-29","30-39","40-49", "50-59", "60-69","70-79","80+"),
  Percentage = c(0.64, 1.44, 3.10, 6.78, 11.54, 17.64, 23.59))
MICHDage

##   age_group Percentage
## 1     18-29       0.64
## 2     30-39       1.44
## 3     40-49       3.10
## 4     50-59       6.78
## 5     60-69      11.54
## 6     70-79      17.64
## 7       80+      23.59

ggplot(data = MICHDage,
           aes(x = age_group, 
               y  = Percentage)) +
  geom_bar(stat = "identity",
           fill = "royalblue4",
           width =  0.75)+
  geom_text(aes(label = paste0(Percentage,"%")), 
            vjust = -0.5, 
            color = "black", 
            size = 4.5) +
  scale_y_continuous(limits = c(0,25)) +
  theme_minimal() +
  labs(x = "Age group",
       y = "Percentage (%)",
       caption = "Data source: BRFSS2018") +
  theme(axis.text = element_text(size = 12),
        axis.title = element_text(size = 12, face = "bold"),
        plot.caption = element_text(color = "grey44", size = 12, face = "italic"),
        legend.title = element_text(color = "black", size = 12),
        legend.text = element_text(size = 12),
        legend.position = "non")

ggplot(data = MICHDage,
           aes(x = age_group, 
               y  = Percentage)) +
  geom_point(stat = "identity",
             position = "identity",
             shape = 21, 
             colour = "blue", 
             fill = "blue", size = 3) + 
  geom_text(aes(label = paste0(Percentage,"%")), 
            vjust = -0.5, 
            color = "black", 
            size = 4.5) +
  scale_y_continuous(limits = c(0,25)) +
  theme_minimal() +
  labs(x = "Age group",
       y = "Percentage (%)",
       caption = "Data source: BRFSS2018") +
  theme(axis.text = element_text(size = 12),
        axis.title = element_text(size = 12, face = "bold"),
        plot.caption = element_text(color = "grey44", size = 12, face = "italic"),
        legend.title = element_text(color = "black", size = 12),
        legend.text = element_text(size = 12),
        legend.position = "non")

B2.GENDER and MICHD

ggplot(BRFSS, aes(x= MICHD , group = GENDER)) + 
    geombar_facet +
    geomtext_facet +
  scale_fill_manual(values= c("gainsboro", "royalblue4")) +
    labs(y = "Percent of respondents", fill="MICHD") +
    facet_grid(~GENDER) +
    scale_y_continuous(labels = scales::percent)+
  themebar_facet

table(BRFSS$MICHD,BRFSS$GENDER)

##      
##       Female   Male
##   No  136133 118845
##   Yes   9771  14823

prop.test(table(BRFSS$MICHD, BRFSS$GENDER), correct= FALSE)

## 
##  2-sample test for equality of proportions without continuity
##  correction
## 
## data:  table(BRFSS$MICHD, BRFSS$GENDER)
## X-squared = 1677.6, df = 1, p-value < 2.2e-16
## alternative hypothesis: two.sided
## 95 percent confidence interval:
##  0.1301941 0.1430238
## sample estimates:
##   prop 1   prop 2 
## 0.533901 0.397292

RACE and MICHD

ggplot(BRFSS, aes(x= MICHD , group = RACE)) + 
    geombar_facet +
    geomtext_facet +
  scale_fill_manual(values= c("gainsboro", "royalblue4")) +
    labs(y = "Percent of respondents", fill="MICHD") +
    facet_grid(~RACE) +
    scale_y_continuous(labels = scales::percent) + 
  themebar_facet

ggplot(BRFSS, aes(x= MICHD , group = EDU)) + 
    geombar_facet +
    geomtext_facet +
  scale_fill_manual(values= c("gainsboro", "royalblue4")) +
    labs(y = "Percent of respondents", fill="MICHD") +
    facet_grid(~ EDU) +
    scale_y_continuous(labels = scales::percent) +
 themebar_facet

MICHDedu <- data.frame(
  edu_group = c("<8","9-12","12-15", "16+"),
  Percentage = c(16.2, 11.2, 9.2, 6.5))
MICHDedu

##   edu_group Percentage
## 1        <8       16.2
## 2      9-12       11.2
## 3     12-15        9.2
## 4       16+        6.5

positions <- c("<8","9-12","12-15", "16+")

ggplot(data = MICHDedu ,
           aes(x = positions, 
               y  = Percentage)) +
  geom_bar(stat = "identity",
           fill = "royalblue4",
           width = 0.6)+
  geom_text(aes(label = paste0(Percentage,"%")), 
            vjust = -0.5, 
            color = "black", 
            size = 4.5) +
  scale_x_discrete(limits = positions) +
  scale_y_continuous(limits = c(0,20)) +
  theme_minimal() +
  labs(x = "Years of education categories",
       y = "Percentage (%)",
       caption = "Data source: BRFSS2018") +
  theme(axis.text = element_text(size = 12),
        axis.title = element_text(size = 12, face = "bold"),
        plot.caption = element_text(color = "grey44", size = 12, face = "italic"),
        legend.title = element_text(color = "black", size = 12),
        legend.text = element_text(size = 12),
        legend.position = "non")

ggplot(BRFSS, aes(x= MICHD , group = RELP)) + 
    geombar_facet +
    geomtext_facet +
  scale_fill_manual(values= c("gainsboro", "royalblue4")) +
    labs(y = "Percent of respondents", fill="MICHD") +
    facet_grid(~ RELP) +
    scale_y_continuous(labels = scales::percent) +
 themebar_facet

AGE and widowed

BRFSS %>% 
  group_by(AGE,RELP) %>% 
  dplyr::count()

## # A tibble: 28 x 3
## # Groups:   AGE, RELP [28]
##    AGE   RELP                n
##    <fct> <ord>           <int>
##  1 18-29 In relationship  9521
##  2 18-29 Living apart      991
##  3 18-29 Never married   18861
##  4 18-29 Widowed            42
##  5 30-39 In relationship 21906
##  6 30-39 Living apart     4101
##  7 30-39 Never married    8267
##  8 30-39 Widowed           176
##  9 40-49 In relationship 26044
## 10 40-49 Living apart     6995
## # ... with 18 more rows

ggplot(BRFSS, aes(x= AGE, group = RELP)) + 
    geom_bar(aes(y = ..prop.., 
                 fill = factor(..x..)), 
             stat="count",
             width = 0.8) +
    geom_text(size = 4, 
              aes(label = scales::percent(..prop..),
                  y= ..prop.. ), 
              stat= "count",
              color = "blue",
              vjust = -.25) +
  scale_fill_manual(values= col7) +
    labs(y = "Percent of respondents", fill = "Age group") +
    facet_grid(~ RELP) +
    scale_y_continuous(labels = scales::percent) +
  theme_bar

B6. EMPL

ggplot(BRFSS, aes(x = MICHD , group = EMPL)) + 
    geombar_facet +
    geomtext_facet +
  scale_fill_manual(values= c("gainsboro", "royalblue4")) +
    labs(y = "Percent of respondents", fill = "MICHD") +
    facet_grid(~ EMPL) +
  themebar_facet

ggplot(BRFSS, aes(x= AGE, group = EMPL)) + 
    geom_bar(aes(y = ..prop.., 
                 fill = factor(..x..)), 
             stat = "count",
             width = 0.8) +
    geom_text(size = 4, 
              aes(label = scales::percent(..prop..),
                  y= ..prop.. ), 
              stat= "count",
              color = "blue",
              vjust = -.25) +
  scale_fill_manual(values= col7) +
    labs(y = "Percent of respondents", fill = "Age group") +
    facet_grid(~EMPL) +
    scale_y_continuous(labels = scales::percent) +
  theme_bar

B7. Income levels (INCOME)

ggplot(BRFSS, aes(x= MICHD , group = INCOME)) + 
    geombar_facet +
    geomtext_facet +
  scale_fill_manual(values= c("gainsboro", "royalblue4")) +
    labs(y = "Percent of respondents", fill="MICHD") +
    facet_grid(~ INCOME) +
    scale_y_continuous(labels = scales::percent) + 
  themebar_facet

MICHDincome <- data.frame(
  edu_group = c("<$20K","$20K-<$35K","$35K-<$50K", "$50K-<$75K","$75K+"),
  Percentage = c(14.9, 11.5,9.5, 7.8, 5.3))
MICHDincome

##    edu_group Percentage
## 1      <$20K       14.9
## 2 $20K-<$35K       11.5
## 3 $35K-<$50K        9.5
## 4 $50K-<$75K        7.8
## 5      $75K+        5.3

position_edu <- c("<$20K","$20K-<$35K","$35K-<$50K", "$50K-<$75K","$75K+")

ggplot(data = MICHDincome,
           aes(x = position_edu, 
               y = Percentage)) +
  geom_bar(stat = "identity",
           fill = "royalblue4",
           width = 0.6)+
  geom_text(aes(label = paste0(Percentage,"%")), 
            vjust = -0.5, 
            color = "black", 
            size = 4.5) +
  scale_x_discrete(limits = position_edu) +
  scale_y_continuous(limits = c(0,18)) +
  theme_minimal() +
  labs(x = "Income levels",
       y = "Percentage (%)",
       caption = "Data source: BRFSS2018") +
  theme(axis.text = element_text(size = 12),
        axis.title = element_text(size = 12, face = "bold"),
        plot.caption = element_text(color = "grey44", size = 12, face = "italic"),
        legend.title = element_text(color = "black", size = 12),
        legend.text = element_text(size = 12),
        legend.position = "non")

B8.Geographic locale

ggplot(BRFSS, aes(x= MICHD , group = COMMUN)) + 
    geombar_facet +
    geomtext_facet +
  scale_fill_manual(values= c("gainsboro", "royalblue4")) +
    labs(y = "Percent of respondents", fill="MICHD") +
    facet_grid(~ COMMUN) +
    scale_y_continuous(labels = scales::percent) + 
  themebar_facet

Lifestyle (C) C1. X_SMOKER3 Four-level smoker status —> SMOKESTAT C2. ALCDAY5 Days in past 30 had alcoholic beverage —> ALCpa30 C3. SLEPTIM1 How Much Time Do You Sleep C4. EXERANY2 Exercise in Past 30 Days

C1. X_SMOKER3 Four-level smoker status —> SMOKESTAT

ggplot(BRFSS, aes(x = MICHD , group = SMOKESTAT)) + 
    geombar_facet +
    geomtext_facet  +
  scale_fill_manual(values = c("gainsboro", "olivedrab4")) +
    labs(y = "Percent of respondents", fill = "MICHD") +
    facet_grid(~ SMOKESTAT) +
    scale_y_continuous(labels = scales::percent) +
  themebar_facet

prop.test(table(BRFSS$SMOKESTAT, BRFSS$MICHD), correct=FALSE)

## 
##  4-sample test for equality of proportions without continuity
##  correction
## 
## data:  table(BRFSS$SMOKESTAT, BRFSS$MICHD)
## X-squared = 3786.7, df = 3, p-value < 2.2e-16
## alternative hypothesis: two.sided
## sample estimates:
##    prop 1    prop 2    prop 3    prop 4 
## 0.8933224 0.9036556 0.8658091 0.9396616

C2. ALCDAY5 Days in past 30 had alcoholic beverage —> ALCpa30

ggplot(BRFSS, aes(x = MICHD , group = ALCpa30)) + 
    geombar_facet +
    geomtext_facet +
  scale_fill_manual(values = c("gainsboro", "olivedrab4")) +
    labs(y = "Percent of respondents", fill = "MICHD") +
    facet_grid(~ ALCpa30) +
    scale_y_continuous(labels = scales::percent) +
  themebar_facet

prop.test(table(BRFSS$ALCpa30, BRFSS$MICHD), correct = FALSE)

## 
##  3-sample test for equality of proportions without continuity
##  correction
## 
## data:  table(BRFSS$ALCpa30, BRFSS$MICHD)
## X-squared = 2257.3, df = 2, p-value < 2.2e-16
## alternative hypothesis: two.sided
## sample estimates:
##    prop 1    prop 2    prop 3 
## 0.8833757 0.9360242 0.9320561

C3. SLEPTIM1 How Much Time Do You Sleep

ggplot(BRFSS, aes(x = MICHD , group = SLEPTIM1)) + 
   geombar_facet +
  geomtext_facet +
  scale_fill_manual(values = c("gainsboro", "olivedrab4")) +
    labs(y = "Percent of respondents", fill = "MICHD") +
    facet_grid(~ SLEPTIM1) +
    scale_y_continuous(labels = scales::percent)+
  themebar_facet

prop.test(table(BRFSS$SLEPTIM1, BRFSS$MICHD), correct = FALSE)

## 
##  10-sample test for equality of proportions without continuity
##  correction
## 
## data:  table(BRFSS$SLEPTIM1, BRFSS$MICHD)
## X-squared = 2095.6, df = 9, p-value < 2.2e-16
## alternative hypothesis: two.sided
## sample estimates:
##    prop 1    prop 2    prop 3    prop 4    prop 5    prop 6    prop 7    prop 8 
## 0.8150538 0.8381890 0.8964931 0.9178640 0.9332331 0.9088374 0.8906505 0.8343991 
##    prop 9   prop 10 
## 0.8532609 0.8106769

Sleep <- data.frame(
  Duration =  c(3:12),
  Percentage = c(18.49, 16.18, 10.35, 8.21, 6.68, 9.12, 10.93, 16.56, 14.67, 18.93)
)

Sleep

##    Duration Percentage
## 1         3      18.49
## 2         4      16.18
## 3         5      10.35
## 4         6       8.21
## 5         7       6.68
## 6         8       9.12
## 7         9      10.93
## 8        10      16.56
## 9        11      14.67
## 10       12      18.93

ggplot(data = Sleep, aes(x = Duration,
                        y = Percentage)) +
  geom_line(linetype = "dashed", 
            size = 0.5) +
  geom_point(size = 6, color = "blue") +
  
  labs(y = "Percent of respondents (%)", 
       x = "Duration (h)") +
  theme_point

ggplot(data = Sleep,
           aes(x = Duration, 
               y  = Percentage,
               fill = Percentage)) +
  scale_fill_gradient(low = "olivedrab4", high = "darkred")+
  geom_bar(stat = "identity")+
  geom_text(aes(label = paste0(Percentage,"%")), 
            vjust = -0.5, 
            color = "black", 
            size = 4.5) +
  scale_x_continuous(breaks = seq(0,14,by = 1))+
  scale_y_continuous(limits = c(0,20)) +
  theme_minimal() +
  labs(x = "Sleep duration (h)",
       y = "Percentage (%)",
       caption = "Data source: BRFSS2018") +
  theme(axis.text = element_text(size = 12),
        axis.title = element_text(size = 12, face = "bold"),
        plot.caption = element_text(color = "grey44", size = 12, face = "italic"),
        legend.title = element_text(color = "black", size = 12),
        legend.text = element_text(size = 12),
        legend.position = "non")

C4. EXERANY2 Exercise in Past 30 Days

ggplot(BRFSS, aes(x= MICHD , group = EXERpa30)) + 
  geombar_facet +
  geomtext_facet +
  scale_fill_manual(values= c("gainsboro", "olivedrab4")) +
    labs(y = "Percent of respondents", fill="MICHD") +
    facet_grid(~ EXERpa30) +
    scale_y_continuous(labels = scales::percent)+
  themebar_facet

prop.test(table(BRFSS$EXERpa30, BRFSS$MICHD), correct=FALSE)

## 
##  2-sample test for equality of proportions without continuity
##  correction
## 
## data:  table(BRFSS$EXERpa30, BRFSS$MICHD)
## X-squared = 2775.8, df = 1, p-value < 2.2e-16
## alternative hypothesis: two.sided
## 95 percent confidence interval:
##  -0.07069454 -0.06482085
## sample estimates:
##    prop 1    prop 2 
## 0.8594115 0.9271692

Health Status (D) D1. GENHLTH

ggplot(BRFSS, aes(x= MICHD , group = GENHLTH)) + 
  geombar_facet +
  geomtext_facet +
  scale_fill_manual(values= c("gainsboro", "orangered4")) +
    labs(y = "Percent of respondents", fill = "MICHD") +
    facet_grid(~ GENHLTH) +
    scale_y_continuous(labels = scales::percent)+ 
  themebar_facet

prop.test(table(BRFSS$GENHLTH, BRFSS$MICHD), correct = FALSE)

## 
##  5-sample test for equality of proportions without continuity
##  correction
## 
## data:  table(BRFSS$GENHLTH, BRFSS$MICHD)
## X-squared = 18845, df = 4, p-value < 2.2e-16
## alternative hypothesis: two.sided
## sample estimates:
##    prop 1    prop 2    prop 3    prop 4    prop 5 
## 0.6677332 0.8065433 0.9026237 0.9550222 0.9779830

Sleep <- data.frame(
  Duration =  c(3:12),
  Percentage = c(18.49, 16.18, 10.35, 8.21, 6.68, 9.12, 10.93, 16.56, 14.67, 18.93)
)

Sleep

##    Duration Percentage
## 1         3      18.49
## 2         4      16.18
## 3         5      10.35
## 4         6       8.21
## 5         7       6.68
## 6         8       9.12
## 7         9      10.93
## 8        10      16.56
## 9        11      14.67
## 10       12      18.93

MICHD_GENHLTH <- data.frame(
  GENHLTH_group = c("Poor","Fair","Good", "Very good","Excellent"),
  Percentage = c(33.2,19.3,9.7, 4.5, 2.2))


position_GENHLTH <- c("Poor","Fair","Good", "Very good","Excellent")

ggplot(data = MICHD_GENHLTH,
           aes(x = position_GENHLTH, 
               y = Percentage)) +
  geom_bar(stat = "identity",
           fill = "purple4",
           width = 0.6)+
  geom_text(aes(label = paste0(Percentage,"%")), 
            vjust = -0.5, 
            color = "black", 
            size = 4.5) +
  scale_x_discrete(limits = position_GENHLTH) +
  scale_y_continuous(limits = c(0,35)) +
  theme_minimal() +
  labs(x = "Would you say that in general your health is:...",
       y = "Percentage (%)",
       caption = "Data source: BRFSS2018") +
  theme(axis.text = element_text(size = 12),
        axis.title = element_text(size = 12, face = "bold"),
        plot.caption = element_text(color = "grey44", size = 12, face = "italic"),
        legend.title = element_text(color = "black", size = 12),
        legend.text = element_text(size = 12),
        legend.position = "non")

Healthy Days (E)— Health Related Quality of Life E1.PHYSHLTH (Number of Days Physical Health Not Good) E2.MENTHLTH (Number of Days Mental Health Not Good)

ggplot(BRFSS, aes(x= MICHD , group = PHYSHLTHpa30)) + 
  geombar_facet +
  geomtext_facet +
  scale_fill_manual(values= c("gainsboro", "purple4")) +
    labs(y = "Percent of respondents", fill="MICHD") +
    facet_grid(~ PHYSHLTHpa30) +
    scale_y_continuous(labels = scales::percent)+
 themebar_facet

prop.test(table(BRFSS$PHYSHLTHpa30, BRFSS$MICHD), correct = FALSE)

## 
##  3-sample test for equality of proportions without continuity
##  correction
## 
## data:  table(BRFSS$PHYSHLTHpa30, BRFSS$MICHD)
## X-squared = 8082.2, df = 2, p-value < 2.2e-16
## alternative hypothesis: two.sided
## sample estimates:
##    prop 1    prop 2    prop 3 
## 0.9381334 0.9134742 0.8002322

ggplot(BRFSS, aes(x= MICHD , group = MENTHLTHpa30)) + 
  geombar_facet +
  geomtext_facet +
  scale_fill_manual(values = c("gainsboro", "purple4")) +
    labs(y = "Percent of respondents", fill = "MICHD") +
    facet_grid(~ MENTHLTHpa30) +
    scale_y_continuous(labels = scales::percent)+
 themebar_facet

prop.test(table(BRFSS$MENTHLTHpa30, BRFSS$MICHD), correct = FALSE)

## 
##  3-sample test for equality of proportions without continuity
##  correction
## 
## data:  table(BRFSS$MENTHLTHpa30, BRFSS$MICHD)
## X-squared = 661.24, df = 2, p-value < 2.2e-16
## alternative hypothesis: two.sided
## sample estimates:
##    prop 1    prop 2    prop 3 
## 0.9128522 0.9303403 0.8823198

Health Care Access (F) F1. HPLN1 (Have any health care coverage) ==> HLTHPLN F2. MEDCOST (Could Not See Doctor Because of Cost) ==> MEDICOST F3. CHECKUP1 ==> CHECKUP

ggplot(BRFSS, aes(x= MICHD , group = HLTHPLN)) + 
  geombar_facet +
  geomtext_facet +
  scale_fill_manual(values= c("gainsboro", "goldenrod4")) +
    labs(y = "Percent of respondents", fill="MICHD") +
    facet_grid(~ HLTHPLN) +
    scale_y_continuous(labels = scales::percent)+
  themebar_facet

ggplot(BRFSS, aes(x= MICHD , group = MEDICOST)) + 
  geombar_facet +
  geomtext_facet +
  scale_fill_manual(values= c("gainsboro", "goldenrod4")) +
    labs(y = "Percent of respondents", fill="MICHD") +
    facet_grid(~ MEDICOST) +
    scale_y_continuous(labels = scales::percent)+
themebar_facet

ggplot(BRFSS, aes(x= MICHD , group = CHECKUP)) + 
  geombar_facet +
  geomtext_facet +
  scale_fill_manual(values= c("gainsboro", "goldenrod4")) +
    labs(y = "Percent of respondents", fill="MICHD") +
    facet_grid(~ CHECKUP) +
    scale_y_continuous(labels = scales::percent)+
themebar_facet

BRFSS %>% 
  group_by(CHECKUP, MICHD) %>% 
  dplyr::count()

## # A tibble: 8 x 3
## # Groups:   CHECKUP, MICHD [8]
##   CHECKUP MICHD      n
##   <fct>   <fct>  <int>
## 1 <1      No    200580
## 2 <1      Yes    22738
## 3 1-<2    No     26573
## 4 1-<2    Yes     1002
## 5 2-<5    No     13765
## 6 2-<5    Yes      410
## 7 5+      No     14060
## 8 5+      Yes      444

Chronic Health Conditions (G) G1. ASTHMA3 ==> ASTHMA G2. CHCCOPD1 ==> PULMOND G3. HAVARTH3 ==> ARTHRITIS G4. ADDEPEV2 ==> DEPRESSION G5. CHCKDNY1 ==> KIDNEY G6. DIABETE3 ==> DIABETE G7. CHCSCNCR,CHCOCNCR ==> CANCER

BRFSS %>% 
  select(ASTHMA, PULMOND,ARTHRITIS,DEPRESSION, KIDNEY,DIABETE,CANCER, MICHD) %>% 
  group_by (MICHD) %>% 
  count()

##     ASTHMA PULMOND ARTHRITIS DEPRESSION KIDNEY DIABETE CANCER MICHD   freq
## 1       No      No        No         No     No      No     No    No 106889
## 2       No      No        No         No     No      No     No   Yes   3328
## 3       No      No        No         No     No      No    Yes    No  12899
## 4       No      No        No         No     No      No    Yes   Yes   1226
## 5       No      No        No         No     No     Yes     No    No   9088
## 6       No      No        No         No     No     Yes     No   Yes   1161
## 7       No      No        No         No     No     Yes    Yes    No   1689
## 8       No      No        No         No     No     Yes    Yes   Yes    378
## 9       No      No        No         No    Yes      No     No    No    904
## 10      No      No        No         No    Yes      No     No   Yes    128
## 11      No      No        No         No    Yes      No    Yes    No    442
## 12      No      No        No         No    Yes      No    Yes   Yes    100
## 13      No      No        No         No    Yes     Yes     No    No    401
## 14      No      No        No         No    Yes     Yes     No   Yes    165
## 15      No      No        No         No    Yes     Yes    Yes    No    155
## 16      No      No        No         No    Yes     Yes    Yes   Yes     73
## 17      No      No        No        Yes     No      No     No    No  15285
## 18      No      No        No        Yes     No      No     No   Yes    509
## 19      No      No        No        Yes     No      No    Yes    No   1812
## 20      No      No        No        Yes     No      No    Yes   Yes    166
## 21      No      No        No        Yes     No     Yes     No    No   1574
## 22      No      No        No        Yes     No     Yes     No   Yes    221
## 23      No      No        No        Yes     No     Yes    Yes    No    297
## 24      No      No        No        Yes     No     Yes    Yes   Yes     55
## 25      No      No        No        Yes    Yes      No     No    No    217
## 26      No      No        No        Yes    Yes      No     No   Yes     36
## 27      No      No        No        Yes    Yes      No    Yes    No     85
## 28      No      No        No        Yes    Yes      No    Yes   Yes     15
## 29      No      No        No        Yes    Yes     Yes     No    No     97
## 30      No      No        No        Yes    Yes     Yes     No   Yes     40
## 31      No      No        No        Yes    Yes     Yes    Yes    No     34
## 32      No      No        No        Yes    Yes     Yes    Yes   Yes     13
## 33      No      No       Yes         No     No      No     No    No  28413
## 34      No      No       Yes         No     No      No     No   Yes   2730
## 35      No      No       Yes         No     No      No    Yes    No   8670
## 36      No      No       Yes         No     No      No    Yes   Yes   1318
## 37      No      No       Yes         No     No     Yes     No    No   5295
## 38      No      No       Yes         No     No     Yes     No   Yes   1207
## 39      No      No       Yes         No     No     Yes    Yes    No   1748
## 40      No      No       Yes         No     No     Yes    Yes   Yes    503
## 41      No      No       Yes         No    Yes      No     No    No    770
## 42      No      No       Yes         No    Yes      No     No   Yes    201
## 43      No      No       Yes         No    Yes      No    Yes    No    475
## 44      No      No       Yes         No    Yes      No    Yes   Yes    182
## 45      No      No       Yes         No    Yes     Yes     No    No    488
## 46      No      No       Yes         No    Yes     Yes     No   Yes    282
## 47      No      No       Yes         No    Yes     Yes    Yes    No    266
## 48      No      No       Yes         No    Yes     Yes    Yes   Yes    167
## 49      No      No       Yes        Yes     No      No     No    No   8089
## 50      No      No       Yes        Yes     No      No     No   Yes    762
## 51      No      No       Yes        Yes     No      No    Yes    No   2162
## 52      No      No       Yes        Yes     No      No    Yes   Yes    333
## 53      No      No       Yes        Yes     No     Yes     No    No   1826
## 54      No      No       Yes        Yes     No     Yes     No   Yes    458
## 55      No      No       Yes        Yes     No     Yes    Yes    No    523
## 56      No      No       Yes        Yes     No     Yes    Yes   Yes    170
## 57      No      No       Yes        Yes    Yes      No     No    No    320
## 58      No      No       Yes        Yes    Yes      No     No   Yes     81
## 59      No      No       Yes        Yes    Yes      No    Yes    No    165
## 60      No      No       Yes        Yes    Yes      No    Yes   Yes     60
## 61      No      No       Yes        Yes    Yes     Yes     No    No    210
## 62      No      No       Yes        Yes    Yes     Yes     No   Yes    100
## 63      No      No       Yes        Yes    Yes     Yes    Yes    No     85
## 64      No      No       Yes        Yes    Yes     Yes    Yes   Yes     52
## 65      No     Yes        No         No     No      No     No    No   1958
## 66      No     Yes        No         No     No      No     No   Yes    438
## 67      No     Yes        No         No     No      No    Yes    No    612
## 68      No     Yes        No         No     No      No    Yes   Yes    165
## 69      No     Yes        No         No     No     Yes     No    No    346
## 70      No     Yes        No         No     No     Yes     No   Yes    157
## 71      No     Yes        No         No     No     Yes    Yes    No    127
## 72      No     Yes        No         No     No     Yes    Yes   Yes     59
## 73      No     Yes        No         No    Yes      No     No    No     33
## 74      No     Yes        No         No    Yes      No     No   Yes     23
## 75      No     Yes        No         No    Yes      No    Yes    No     41
## 76      No     Yes        No         No    Yes      No    Yes   Yes     29
## 77      No     Yes        No         No    Yes     Yes     No    No     24
## 78      No     Yes        No         No    Yes     Yes     No   Yes     40
## 79      No     Yes        No         No    Yes     Yes    Yes    No     13
## 80      No     Yes        No         No    Yes     Yes    Yes   Yes     13
## 81      No     Yes        No        Yes     No      No     No    No    615
## 82      No     Yes        No        Yes     No      No     No   Yes    117
## 83      No     Yes        No        Yes     No      No    Yes    No    157
## 84      No     Yes        No        Yes     No      No    Yes   Yes     28
## 85      No     Yes        No        Yes     No     Yes     No    No    116
## 86      No     Yes        No        Yes     No     Yes     No   Yes     57
## 87      No     Yes        No        Yes     No     Yes    Yes    No     35
## 88      No     Yes        No        Yes     No     Yes    Yes   Yes     26
## 89      No     Yes        No        Yes    Yes      No     No    No     25
## 90      No     Yes        No        Yes    Yes      No     No   Yes     10
## 91      No     Yes        No        Yes    Yes      No    Yes    No     11
## 92      No     Yes        No        Yes    Yes      No    Yes   Yes      8
## 93      No     Yes        No        Yes    Yes     Yes     No    No     10
## 94      No     Yes        No        Yes    Yes     Yes     No   Yes     14
## 95      No     Yes        No        Yes    Yes     Yes    Yes    No      1
## 96      No     Yes        No        Yes    Yes     Yes    Yes   Yes      5
## 97      No     Yes       Yes         No     No      No     No    No   1926
## 98      No     Yes       Yes         No     No      No     No   Yes    529
## 99      No     Yes       Yes         No     No      No    Yes    No    742
## 100     No     Yes       Yes         No     No      No    Yes   Yes    321
## 101     No     Yes       Yes         No     No     Yes     No    No    487
## 102     No     Yes       Yes         No     No     Yes     No   Yes    278
## 103     No     Yes       Yes         No     No     Yes    Yes    No    186
## 104     No     Yes       Yes         No     No     Yes    Yes   Yes    140
## 105     No     Yes       Yes         No    Yes      No     No    No     76
## 106     No     Yes       Yes         No    Yes      No     No   Yes     55
## 107     No     Yes       Yes         No    Yes      No    Yes    No     67
## 108     No     Yes       Yes         No    Yes      No    Yes   Yes     52
## 109     No     Yes       Yes         No    Yes     Yes     No    No     53
## 110     No     Yes       Yes         No    Yes     Yes     No   Yes     89
## 111     No     Yes       Yes         No    Yes     Yes    Yes    No     33
## 112     No     Yes       Yes         No    Yes     Yes    Yes   Yes     56
## 113     No     Yes       Yes        Yes     No      No     No    No   1044
## 114     No     Yes       Yes        Yes     No      No     No   Yes    326
## 115     No     Yes       Yes        Yes     No      No    Yes    No    374
## 116     No     Yes       Yes        Yes     No      No    Yes   Yes    142
## 117     No     Yes       Yes        Yes     No     Yes     No    No    310
## 118     No     Yes       Yes        Yes     No     Yes     No   Yes    190
## 119     No     Yes       Yes        Yes     No     Yes    Yes    No    100
## 120     No     Yes       Yes        Yes     No     Yes    Yes   Yes     82
## 121     No     Yes       Yes        Yes    Yes      No     No    No     72
## 122     No     Yes       Yes        Yes    Yes      No     No   Yes     40
## 123     No     Yes       Yes        Yes    Yes      No    Yes    No     37
## 124     No     Yes       Yes        Yes    Yes      No    Yes   Yes     37
## 125     No     Yes       Yes        Yes    Yes     Yes     No    No     44
## 126     No     Yes       Yes        Yes    Yes     Yes     No   Yes     63
## 127     No     Yes       Yes        Yes    Yes     Yes    Yes    No     19
## 128     No     Yes       Yes        Yes    Yes     Yes    Yes   Yes     36
## 129    Yes      No        No         No     No      No     No    No  11164
## 130    Yes      No        No         No     No      No     No   Yes    287
## 131    Yes      No        No         No     No      No    Yes    No   1200
## 132    Yes      No        No         No     No      No    Yes   Yes    104
## 133    Yes      No        No         No     No     Yes     No    No    879
## 134    Yes      No        No         No     No     Yes     No   Yes    113
## 135    Yes      No        No         No     No     Yes    Yes    No    185
## 136    Yes      No        No         No     No     Yes    Yes   Yes     46
## 137    Yes      No        No         No    Yes      No     No    No     94
## 138    Yes      No        No         No    Yes      No     No   Yes     15
## 139    Yes      No        No         No    Yes      No    Yes    No     48
## 140    Yes      No        No         No    Yes      No    Yes   Yes      8
## 141    Yes      No        No         No    Yes     Yes     No    No     35
## 142    Yes      No        No         No    Yes     Yes     No   Yes     15
## 143    Yes      No        No         No    Yes     Yes    Yes    No     13
## 144    Yes      No        No         No    Yes     Yes    Yes   Yes      7
## 145    Yes      No        No        Yes     No      No     No    No   3372
## 146    Yes      No        No        Yes     No      No     No   Yes    111
## 147    Yes      No        No        Yes     No      No    Yes    No    345
## 148    Yes      No        No        Yes     No      No    Yes   Yes     28
## 149    Yes      No        No        Yes     No     Yes     No    No    358
## 150    Yes      No        No        Yes     No     Yes     No   Yes     43
## 151    Yes      No        No        Yes     No     Yes    Yes    No     58
## 152    Yes      No        No        Yes     No     Yes    Yes   Yes     15
## 153    Yes      No        No        Yes    Yes      No     No    No     52
## 154    Yes      No        No        Yes    Yes      No     No   Yes      4
## 155    Yes      No        No        Yes    Yes      No    Yes    No     16
## 156    Yes      No        No        Yes    Yes      No    Yes   Yes      7
## 157    Yes      No        No        Yes    Yes     Yes     No    No     19
## 158    Yes      No        No        Yes    Yes     Yes     No   Yes      8
## 159    Yes      No        No        Yes    Yes     Yes    Yes    No     11
## 160    Yes      No        No        Yes    Yes     Yes    Yes   Yes      4
## 161    Yes      No       Yes         No     No      No     No    No   3562
## 162    Yes      No       Yes         No     No      No     No   Yes    341
## 163    Yes      No       Yes         No     No      No    Yes    No   1135
## 164    Yes      No       Yes         No     No      No    Yes   Yes    169
## 165    Yes      No       Yes         No     No     Yes     No    No    784
## 166    Yes      No       Yes         No     No     Yes     No   Yes    192
## 167    Yes      No       Yes         No     No     Yes    Yes    No    280
## 168    Yes      No       Yes         No     No     Yes    Yes   Yes     86
## 169    Yes      No       Yes         No    Yes      No     No    No    120
## 170    Yes      No       Yes         No    Yes      No     No   Yes     28
## 171    Yes      No       Yes         No    Yes      No    Yes    No     70
## 172    Yes      No       Yes         No    Yes      No    Yes   Yes     24
## 173    Yes      No       Yes         No    Yes     Yes     No    No     79
## 174    Yes      No       Yes         No    Yes     Yes     No   Yes     36
## 175    Yes      No       Yes         No    Yes     Yes    Yes    No     35
## 176    Yes      No       Yes         No    Yes     Yes    Yes   Yes     24
## 177    Yes      No       Yes        Yes     No      No     No    No   2102
## 178    Yes      No       Yes        Yes     No      No     No   Yes    189
## 179    Yes      No       Yes        Yes     No      No    Yes    No    511
## 180    Yes      No       Yes        Yes     No      No    Yes   Yes     94
## 181    Yes      No       Yes        Yes     No     Yes     No    No    566
## 182    Yes      No       Yes        Yes     No     Yes     No   Yes    135
## 183    Yes      No       Yes        Yes     No     Yes    Yes    No    145
## 184    Yes      No       Yes        Yes     No     Yes    Yes   Yes     60
## 185    Yes      No       Yes        Yes    Yes      No     No    No     81
## 186    Yes      No       Yes        Yes    Yes      No     No   Yes     22
## 187    Yes      No       Yes        Yes    Yes      No    Yes    No     47
## 188    Yes      No       Yes        Yes    Yes      No    Yes   Yes     20
## 189    Yes      No       Yes        Yes    Yes     Yes     No    No     78
## 190    Yes      No       Yes        Yes    Yes     Yes     No   Yes     39
## 191    Yes      No       Yes        Yes    Yes     Yes    Yes    No     26
## 192    Yes      No       Yes        Yes    Yes     Yes    Yes   Yes     20
## 193    Yes     Yes        No         No     No      No     No    No    921
## 194    Yes     Yes        No         No     No      No     No   Yes    131
## 195    Yes     Yes        No         No     No      No    Yes    No    236
## 196    Yes     Yes        No         No     No      No    Yes   Yes     73
## 197    Yes     Yes        No         No     No     Yes     No    No    159
## 198    Yes     Yes        No         No     No     Yes     No   Yes     73
## 199    Yes     Yes        No         No     No     Yes    Yes    No     68
## 200    Yes     Yes        No         No     No     Yes    Yes   Yes     32
## 201    Yes     Yes        No         No    Yes      No     No    No     33
## 202    Yes     Yes        No         No    Yes      No     No   Yes     12
## 203    Yes     Yes        No         No    Yes      No    Yes    No     12
## 204    Yes     Yes        No         No    Yes      No    Yes   Yes      8
## 205    Yes     Yes        No         No    Yes     Yes     No    No      8
## 206    Yes     Yes        No         No    Yes     Yes     No   Yes     14
## 207    Yes     Yes        No         No    Yes     Yes    Yes    No      4
## 208    Yes     Yes        No         No    Yes     Yes    Yes   Yes      9
## 209    Yes     Yes        No        Yes     No      No     No    No    462
## 210    Yes     Yes        No        Yes     No      No     No   Yes     89
## 211    Yes     Yes        No        Yes     No      No    Yes    No     93
## 212    Yes     Yes        No        Yes     No      No    Yes   Yes     29
## 213    Yes     Yes        No        Yes     No     Yes     No    No    123
## 214    Yes     Yes        No        Yes     No     Yes     No   Yes     44
## 215    Yes     Yes        No        Yes     No     Yes    Yes    No     29
## 216    Yes     Yes        No        Yes     No     Yes    Yes   Yes     16
## 217    Yes     Yes        No        Yes    Yes      No     No    No     16
## 218    Yes     Yes        No        Yes    Yes      No     No   Yes      9
## 219    Yes     Yes        No        Yes    Yes      No    Yes    No     11
## 220    Yes     Yes        No        Yes    Yes      No    Yes   Yes      3
## 221    Yes     Yes        No        Yes    Yes     Yes     No    No      9
## 222    Yes     Yes        No        Yes    Yes     Yes     No   Yes     10
## 223    Yes     Yes        No        Yes    Yes     Yes    Yes    No     11
## 224    Yes     Yes        No        Yes    Yes     Yes    Yes   Yes      7
## 225    Yes     Yes       Yes         No     No      No     No    No   1017
## 226    Yes     Yes       Yes         No     No      No     No   Yes    275
## 227    Yes     Yes       Yes         No     No      No    Yes    No    445
## 228    Yes     Yes       Yes         No     No      No    Yes   Yes    154
## 229    Yes     Yes       Yes         No     No     Yes     No    No    374
## 230    Yes     Yes       Yes         No     No     Yes     No   Yes    192
## 231    Yes     Yes       Yes         No     No     Yes    Yes    No    153
## 232    Yes     Yes       Yes         No     No     Yes    Yes   Yes    112
## 233    Yes     Yes       Yes         No    Yes      No     No    No     62
## 234    Yes     Yes       Yes         No    Yes      No     No   Yes     38
## 235    Yes     Yes       Yes         No    Yes      No    Yes    No     45
## 236    Yes     Yes       Yes         No    Yes      No    Yes   Yes     32
## 237    Yes     Yes       Yes         No    Yes     Yes     No    No     54
## 238    Yes     Yes       Yes         No    Yes     Yes     No   Yes     64
## 239    Yes     Yes       Yes         No    Yes     Yes    Yes    No     35
## 240    Yes     Yes       Yes         No    Yes     Yes    Yes   Yes     45
## 241    Yes     Yes       Yes        Yes     No      No     No    No    976
## 242    Yes     Yes       Yes        Yes     No      No     No   Yes    280
## 243    Yes     Yes       Yes        Yes     No      No    Yes    No    306
## 244    Yes     Yes       Yes        Yes     No      No    Yes   Yes    149
## 245    Yes     Yes       Yes        Yes     No     Yes     No    No    403
## 246    Yes     Yes       Yes        Yes     No     Yes     No   Yes    230
## 247    Yes     Yes       Yes        Yes     No     Yes    Yes    No    143
## 248    Yes     Yes       Yes        Yes     No     Yes    Yes   Yes    102
## 249    Yes     Yes       Yes        Yes    Yes      No     No    No     78
## 250    Yes     Yes       Yes        Yes    Yes      No     No   Yes     49
## 251    Yes     Yes       Yes        Yes    Yes      No    Yes    No     44
## 252    Yes     Yes       Yes        Yes    Yes      No    Yes   Yes     43
## 253    Yes     Yes       Yes        Yes    Yes     Yes     No    No     76
## 254    Yes     Yes       Yes        Yes    Yes     Yes     No   Yes     79
## 255    Yes     Yes       Yes        Yes    Yes     Yes    Yes    No     35
## 256    Yes     Yes       Yes        Yes    Yes     Yes    Yes   Yes     52

G1. ASTHMA

table1 <- table(BRFSS$ASTHMA, BRFSS$MICHD)
table1

##      
##           No    Yes
##   No  221067  19845
##   Yes  33911   4749

ggplot(BRFSS, aes(x = MICHD , group = ASTHMA)) + 
  geombar_facet +
  geomtext_facet  +
  scale_fill_manual(values = c("gainsboro", "goldenrod4")) +
    labs(y = "Percent of respondents", fill="MICHD") +
    facet_grid(~ ASTHMA) +
    scale_y_continuous(labels = scales::percent) +
  themebar_facet

prop.test(table(BRFSS$ASTHMA, BRFSS$MICHD), correct = FALSE)

## 
##  2-sample test for equality of proportions without continuity
##  correction
## 
## data:  table(BRFSS$ASTHMA, BRFSS$MICHD)
## X-squared = 679.92, df = 1, p-value < 2.2e-16
## alternative hypothesis: two.sided
## 95 percent confidence interval:
##  0.03701430 0.04391704
## sample estimates:
##    prop 1    prop 2 
## 0.9176255 0.8771599

G2. PULMOND

ggplot(BRFSS, aes(x= MICHD , group = PULMOND)) + 
  geombar_facet +
  geomtext_facet  +
  scale_fill_manual(values= c("gainsboro", "goldenrod4")) +
    labs(y = "Percent of respondents", fill="MICHD") +
    facet_grid(~ PULMOND) +
  themebar_facet

prop.test(table(BRFSS$PULMOND, BRFSS$MICHD), correct = FALSE)

## 
##  2-sample test for equality of proportions without continuity
##  correction
## 
## data:  table(BRFSS$PULMOND, BRFSS$MICHD)
## X-squared = 10375, df = 1, p-value < 2.2e-16
## alternative hypothesis: two.sided
## 95 percent confidence interval:
##  0.1958027 0.2076973
## sample estimates:
##    prop 1    prop 2 
## 0.9280610 0.7263111

G3. ARTHRITIS

ggplot(BRFSS, aes(x = MICHD , group = ARTHRITIS)) + 
  geombar_facet +
  geomtext_facet  +
  scale_fill_manual(values = c("gainsboro", "goldenrod4")) +
    labs(y = "Percent of respondents", fill = "MICHD") +
    facet_grid(~ ARTHRITIS) +
    scale_y_continuous(labels = scales::percent) +
  themebar_facet

prop.test(table(BRFSS$ARTHRITIS, BRFSS$MICHD), correct=FALSE)

## 
##  2-sample test for equality of proportions without continuity
##  correction
## 
## data:  table(BRFSS$ARTHRITIS, BRFSS$MICHD)
## X-squared = 7715.3, df = 1, p-value < 2.2e-16
## alternative hypothesis: two.sided
## 95 percent confidence interval:
##  0.09723545 0.10231039
## sample estimates:
##    prop 1    prop 2 
## 0.9453475 0.8455746

G4. DEPRESSION

ggplot(BRFSS, aes(x = MICHD, group = DEPRESSION)) + 
  geombar_facet +
  geomtext_facet  +
  scale_fill_manual(values= c("gainsboro", "goldenrod4")) +
    labs(y = "Percent of respondents", fill = "MICHD") +
    facet_grid(~ DEPRESSION) +
    scale_y_continuous(labels = scales::percent)+
  themebar_facet

prop.test(table(BRFSS$DEPRESSION, BRFSS$MICHD), correct = FALSE)

## 
##  2-sample test for equality of proportions without continuity
##  correction
## 
## data:  table(BRFSS$DEPRESSION, BRFSS$MICHD)
## X-squared = 761.52, df = 1, p-value < 2.2e-16
## alternative hypothesis: two.sided
## 95 percent confidence interval:
##  0.03484365 0.04080930
## sample estimates:
##    prop 1    prop 2 
## 0.9191460 0.8813195

G5.KIDNEY

ggplot(BRFSS, aes(x = MICHD , group = KIDNEY)) + 
  geombar_facet +
  geomtext_facet +
  scale_fill_manual(values = c("gainsboro", "goldenrod4")) +
    labs(y = "Percent of respondents", fill = "MICHD") +
    facet_grid(~ KIDNEY) +
    scale_y_continuous(labels = scales::percent) +
  themebar_facet

prop.test(table(BRFSS$KIDNEY, BRFSS$MICHD), correct=FALSE)

## 
##  2-sample test for equality of proportions without continuity
##  correction
## 
## data:  table(BRFSS$KIDNEY, BRFSS$MICHD)
## X-squared = 5868.8, df = 1, p-value < 2.2e-16
## alternative hypothesis: two.sided
## 95 percent confidence interval:
##  0.2114302 0.2294740
## sample estimates:
##    prop 1    prop 2 
## 0.9199546 0.6995025

G6. DIABETE3 ==> DIABETE

ggplot(BRFSS, aes(x = MICHD , group = DIABETE)) + 
  geombar_facet +
  geomtext_facet +
  scale_fill_manual(values = c("gainsboro", "goldenrod4")) +
    labs(y = "Percent of respondents", fill = "MICHD") +
    facet_grid(~ DIABETE) +
  themebar_facet

prop.test(table(BRFSS$DIABETE, BRFSS$MICHD), correct = FALSE)

## 
##  2-sample test for equality of proportions without continuity
##  correction
## 
## data:  table(BRFSS$DIABETE, BRFSS$MICHD)
## X-squared = 8616.9, df = 1, p-value < 2.2e-16
## alternative hypothesis: two.sided
## 95 percent confidence interval:
##  0.1390752 0.1474029
## sample estimates:
##    prop 1    prop 2 
## 0.9321083 0.7888693

G7. CHCSCNCR,CHCOCNCR ==> CANCER

ggplot(BRFSS, aes(x = MICHD , group = CANCER)) + 
  geombar_facet +
  geomtext_facet +
  scale_fill_manual(values = c("gainsboro", "goldenrod4")) +
    labs(y = "Percent of respondents", fill = "MICHD") +
    facet_grid(~ CANCER) +
    scale_y_continuous(labels = scales::percent)+
  themebar_facet

prop.test(table(BRFSS$CANCER, BRFSS$MICHD), correct = FALSE)

## 
##  2-sample test for equality of proportions without continuity
##  correction
## 
## data:  table(BRFSS$CANCER, BRFSS$MICHD)
## X-squared = 3695.9, df = 1, p-value < 2.2e-16
## alternative hypothesis: two.sided
## 95 percent confidence interval:
##  0.08329296 0.09022248
## sample estimates:
##    prop 1    prop 2 
## 0.9267544 0.8399966

ggplot(BRFSS, aes(x = MICHD , group = STROKE)) + 
  geombar_facet +
  geomtext_facet +
  scale_fill_manual(values = c("gainsboro", "goldenrod4")) +
    labs(y = "Percent of respondents", fill = "MICHD") +
    facet_grid(~ STROKE) +
    scale_y_continuous(labels = scales::percent)+
  themebar_facet

Among pp who have MICHD

CHC <- c("ARTHRITIS", "ASHMA", " CANCER","DEPRESSION", "DIABETE","KIDNEY", "PULMON", "STROKE")
CHC_NO <- c(7.0,9.6,8.7,9.4,8.0,9.0,8.0,8.0)
CHC_YES <- c(17,14.9,17.9,14.3,23,32,30,36.0)

dataCHC <- cbind.data.frame(CHC, CHC_NO, CHC_YES)
dataCHC

##          CHC CHC_NO CHC_YES
## 1  ARTHRITIS    7.0    17.0
## 2      ASHMA    9.6    14.9
## 3     CANCER    8.7    17.9
## 4 DEPRESSION    9.4    14.3
## 5    DIABETE    8.0    23.0
## 6     KIDNEY    9.0    32.0
## 7     PULMON    8.0    30.0
## 8     STROKE    8.0    36.0

library(ggthemes)

## 
## Attaching package: 'ggthemes'

## The following object is masked from 'package:cowplot':
## 
##     theme_map

CHC_MICHD <- read.csv(file="C:/Users/user/Desktop/Capstone Project/Submission/R codes/Chronic health.csv", header=TRUE, sep=",")

CHC_MICHD

##    ï..ChronicHLTH CHC.status MICHD.NO MICHD.YES
## 1          ASTHMA         NO     90.4       9.6
## 2          ASTHMA        YES     85.1      14.9
## 3         PULMOND         NO     92.0       8.0
## 4         PULMOND        YES     70.0      30.0
## 5       ARTHRITIS         NO     93.0       7.0
## 6       ARTHRITIS        YES     83.0      17.0
## 7      DEPRESSION         NO     90.6       9.4
## 8      DEPRESSION        YES     85.7      14.3
## 9          KIDNEY         NO     91.0       9.0
## 10         KIDNEY        YES     68.0      32.0
## 11        DIABETE         NO     92.0       8.0
## 12        DIABETE        YES     77.0      23.0
## 13         CANCER         NO     91.3       8.7
## 14         CANCER        YES     82.1      17.9
## 15         STROKE         NO     92.0       8.0
## 16         STROKE        YES     64.0      36.0

ggplot(CHC_MICHD, aes(CHC.status, MICHD.YES)) +
  geom_bar(stat = "identity") +
  facet_wrap(~ï..ChronicHLTH, nrow = 2) +
  geom_text(aes(label = paste0(MICHD.YES, "%"), 
                y = MICHD.YES),
            vjust = 1.4, 
            size = 5, 
            color = "white") +
  theme(axis.text.x = element_text(angle = 90,  hjust = 1),
        axis.ticks = element_blank(),
        axis.text.y = element_blank()) +
  themebar_facet +
   aes(fill = as.factor(CHC.status)) +
  scale_fill_manual(values = c("#00AFBB", "goldenrod4")) +
  labs(x = "Status (Yes/No) of chronic health condition",
       y = "Percent of respondents (%)",
       caption = "Data source: BRFSS2018")

Among people having MICHD

ggplot(CHC_MICHD, aes(CHC.status,ï..ChronicHLTH, 
                      fill = MICHD.YES)) +
  geom_tile(show.legend = TRUE) +
  geom_text(aes(label = paste0(MICHD.YES, "%")), color = "white") +
  labs(x = "Chronic Health Conditions Status",
    y = "Chronic Health Conditions",
    color = "Having MICHD") +
  theme(axis.text = element_text(size = 12, color = "black"),
        axis.title = element_text(size = 12, face="bold"),
        plot.caption = element_text(color = "grey", size = 12, face = "italic"))

Oral Health (H) H1. RMVTETH4 ==> RMVTETH H2. LASTDEN4 ===> LASTDEN

H1. RMVTETH4 ==> RMVTETH

ggplot(BRFSS, aes(x = MICHD , group = RMVTETH)) + 
  geombar_facet +
  geomtext_facet +
  scale_fill_manual(values = c("gainsboro", "seagreen4")) +
    labs(y = "Percent of respondents", fill = "MICHD") +
    facet_grid(~ RMVTETH) +
    scale_y_continuous(labels = scales::percent)+
  themebar_facet

prop.test(table(BRFSS$RMVTETH, BRFSS$MICHD), correct = FALSE)

## 
##  4-sample test for equality of proportions without continuity
##  correction
## 
## data:  table(BRFSS$RMVTETH, BRFSS$MICHD)
## X-squared = 11997, df = 3, p-value < 2.2e-16
## alternative hypothesis: two.sided
## sample estimates:
##    prop 1    prop 2    prop 3    prop 4 
## 0.9553718 0.9040012 0.8193665 0.7600530

H2. LASTDEN4 ===> LASTDENTV

ggplot(BRFSS, aes(x= MICHD , group = LASTDENTV)) + 
  geombar_facet +
  geomtext_facet +
  scale_fill_manual(values= c("gainsboro", "seagreen4")) +
    labs(y = "Percent of respondents", fill="MICHD") +
    facet_grid(~ LASTDENTV) +
    scale_y_continuous(labels = scales::percent) +
  themebar_facet

prop.test(table(BRFSS$LASTDENTV, BRFSS$MICHD), correct=FALSE)

## 
##  4-sample test for equality of proportions without continuity
##  correction
## 
## data:  table(BRFSS$LASTDENTV, BRFSS$MICHD)
## X-squared = 2054.9, df = 3, p-value < 2.2e-16
## alternative hypothesis: two.sided
## sample estimates:
##    prop 1    prop 2    prop 3    prop 4 
## 0.9248640 0.9072394 0.8948507 0.8479726

MICHDrvmteeth <- data.frame(
  teeth_group = c("0","1-5",">= 6", "All"),
  Percentage = c(4.5, 9.6, 18.1, 24))
MICHDrvmteeth

##   teeth_group Percentage
## 1           0        4.5
## 2         1-5        9.6
## 3        >= 6       18.1
## 4         All       24.0

positions <- c("0","1-5",">= 6", "All")

ggplot(data = MICHDrvmteeth ,
           aes(x = positions, 
               y  = Percentage)) +
  geom_bar(stat = "identity",
           fill = "seagreen4",
           width = 0.6)+
  geom_text(aes(label = paste0(Percentage,"%")), 
            vjust = -0.5, 
            color = "black", 
            size = 4.5) +
  scale_x_discrete(limits = positions) +
  scale_y_continuous(limits = c(0,25)) +
  theme_minimal() +
  labs(x = "Number of teeth were removed",
       y = "Percentage (%)",
       caption = "Data source: BRFSS2018") +
  theme(axis.text = element_text(size = 12),
        axis.title = element_text(size = 12, face = "bold"),
        plot.caption = element_text(color = "grey44", size = 12, face = "italic"),
        legend.title = element_text(color = "black", size = 12),
        legend.text = element_text(size = 12),
        legend.position = "non")

MICHD_lastvdent <- data.frame(
  teeth_group = c("<1","1-<2","2-<5", "5+"),
  Percentage = c(7.5, 9.3, 10.5, 15.2))

positions <- c("<1","1-<2","2-<5", "5+")

ggplot(data = MICHD_lastvdent,
           aes(x = positions, 
               y  = Percentage)) +
  geom_bar(stat = "identity",
           fill = "seagreen4",
           width = 0.6) +
  geom_text(aes(label = paste0(Percentage,"%")), 
            vjust = -0.5, 
            color = "black", 
            size = 4.5) +
  scale_x_discrete(limits = positions) +
  scale_y_continuous(limits = c(0,20)) +
  theme_minimal() +
  labs(x = "Last time you visited dental clinic ",
       y = "Percentage (%)",
       caption = "Data source: BRFSS2018") +
  theme(axis.text = element_text(size = 12),
        axis.title = element_text(size = 12, face = "bold"),
        plot.caption = element_text(color = "grey44", size = 12, face = "italic"),
        legend.title = element_text(color = "black", size = 12),
        legend.text = element_text(size = 12),
        legend.position = "non")

MICAT (I) I1. X_BMI5CAT ===> BMICAT

ggplot(BRFSS, aes(x= MICHD , group = BMICAT)) + 
  geombar_facet +
  geomtext_facet +
  scale_fill_manual(values= c("gainsboro", "firebrick4")) +
    labs(y = "Percent of respondents", fill="MICHD") +
    facet_grid(~ BMICAT) +
    scale_y_continuous(labels = scales::percent) +
  themebar_facet

prop.test(table(BRFSS$BMICAT, BRFSS$MICHD), correct = FALSE)

## 
##  4-sample test for equality of proportions without continuity
##  correction
## 
## data:  table(BRFSS$BMICAT, BRFSS$MICHD)
## X-squared = 1051.4, df = 3, p-value < 2.2e-16
## alternative hypothesis: two.sided
## sample estimates:
##    prop 1    prop 2    prop 3    prop 4 
## 0.9099751 0.9359832 0.9104626 0.8920471

COMMUNITY (J)

ggplot(BRFSS, aes(x= MICHD , group = COMMUN)) + 
  geombar_facet +
  geomtext_facet +
  scale_fill_manual(values= c("gainsboro", "cyan4")) +
    labs(y = "Percent of respondents", fill="MICHD") +
    facet_grid(~ COMMUN) +
    scale_y_continuous(labels = scales::percent)+
  themebar_facet

prop.test(table(BRFSS$COMMUN,BRFSS$MICHD), correct = FALSE)

## 
##  3-sample test for equality of proportions without continuity
##  correction
## 
## data:  table(BRFSS$COMMUN, BRFSS$MICHD)
## X-squared = 335.84, df = 2, p-value < 2.2e-16
## alternative hypothesis: two.sided
## sample estimates:
##    prop 1    prop 2    prop 3 
## 0.8954441 0.8995739 0.9186046