library(riskCommunicator)
library(tidyverse)
library(skimr)
library(knitr)
library(ggthemes)
library(patchwork)
data(framingham, package = "riskCommunicator")

glimpse(framingham)
## Rows: 11,627
## Columns: 39
## $ RANDID   <dbl> 2448, 2448, 6238, 6238, 6238, 9428, 9428, 10552, 10552, 11252…
## $ SEX      <dbl> 1, 1, 2, 2, 2, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1…
## $ TOTCHOL  <dbl> 195, 209, 250, 260, 237, 245, 283, 225, 232, 285, 343, NA, 22…
## $ AGE      <dbl> 39, 52, 46, 52, 58, 48, 54, 61, 67, 46, 51, 58, 43, 49, 55, 6…
## $ SYSBP    <dbl> 106.0, 121.0, 121.0, 105.0, 108.0, 127.5, 141.0, 150.0, 183.0…
## $ DIABP    <dbl> 70.0, 66.0, 81.0, 69.5, 66.0, 80.0, 89.0, 95.0, 109.0, 84.0, …
## $ CURSMOKE <dbl> 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0…
## $ CIGPDAY  <dbl> 0, 0, 0, 0, 0, 20, 30, 30, 20, 23, 30, 30, 0, 0, 0, 0, 0, 20,…
## $ BMI      <dbl> 26.97, NA, 28.73, 29.43, 28.50, 25.34, 25.34, 28.58, 30.18, 2…
## $ DIABETES <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0…
## $ BPMEDS   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0…
## $ HEARTRTE <dbl> 80, 69, 95, 80, 80, 75, 75, 65, 60, 85, 90, 74, 77, 120, 86, …
## $ GLUCOSE  <dbl> 77, 92, 76, 86, 71, 70, 87, 103, 89, 85, 72, NA, 99, 86, 81, …
## $ educ     <dbl> 4, 4, 2, 2, 2, 1, 1, 3, 3, 3, 3, 3, 2, 2, 2, 1, 1, 2, 2, 2, 1…
## $ PREVCHD  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0…
## $ PREVAP   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0…
## $ PREVMI   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ PREVSTRK <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ PREVHYP  <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1…
## $ TIME     <dbl> 0, 4628, 0, 2156, 4344, 0, 2199, 0, 1977, 0, 2072, 4285, 0, 2…
## $ PERIOD   <dbl> 1, 3, 1, 2, 3, 1, 2, 1, 2, 1, 2, 3, 1, 2, 3, 1, 2, 1, 2, 3, 1…
## $ HDLC     <dbl> NA, 31, NA, NA, 54, NA, NA, NA, NA, NA, NA, NA, NA, NA, 46, N…
## $ LDLC     <dbl> NA, 178, NA, NA, 141, NA, NA, NA, NA, NA, NA, NA, NA, NA, 135…
## $ DEATH    <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ ANGINA   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0…
## $ HOSPMI   <dbl> 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ MI_FCHD  <dbl> 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0…
## $ ANYCHD   <dbl> 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0…
## $ STROKE   <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ CVD      <dbl> 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0…
## $ HYPERTEN <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ TIMEAP   <dbl> 8766, 8766, 8766, 8766, 8766, 8766, 8766, 2956, 2956, 8766, 8…
## $ TIMEMI   <dbl> 6438, 6438, 8766, 8766, 8766, 8766, 8766, 2956, 2956, 8766, 8…
## $ TIMEMIFC <dbl> 6438, 6438, 8766, 8766, 8766, 8766, 8766, 2956, 2956, 8766, 8…
## $ TIMECHD  <dbl> 6438, 6438, 8766, 8766, 8766, 8766, 8766, 2956, 2956, 8766, 8…
## $ TIMESTRK <dbl> 8766, 8766, 8766, 8766, 8766, 8766, 8766, 2089, 2089, 8766, 8…
## $ TIMECVD  <dbl> 6438, 6438, 8766, 8766, 8766, 8766, 8766, 2089, 2089, 8766, 8…
## $ TIMEDTH  <dbl> 8766, 8766, 8766, 8766, 8766, 8766, 8766, 2956, 2956, 8766, 8…
## $ TIMEHYP  <dbl> 8766, 8766, 8766, 8766, 8766, 8766, 8766, 0, 0, 4285, 4285, 4…
framinghamSub <-framingham %>% select(RANDID:DIABETES) %>% 
  mutate(SEX = case_when(SEX == 1 ~ "Male"
                         ,SEX == 2 ~ "Female",
                         TRUE ~ as.character(NA)),
         CURSMOKE = case_when(CURSMOKE == 0 ~ "No",
                              CURSMOKE == 1 ~ "Yes",
                              TRUE ~ as.character(NA)))
skim(framinghamSub)
Data summary
Name framinghamSub
Number of rows 11627
Number of columns 10
_______________________
Column type frequency:
character 2
numeric 8
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
SEX 0 1 4 6 0 2 0
CURSMOKE 0 1 2 3 0 2 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
RANDID 0 1.00 5004740.92 2900877.44 2448.00 2474378.00 5006008.00 7472730.00 9999312.0 ▇▇▇▇▇
TOTCHOL 409 0.96 241.16 45.37 107.00 210.00 238.00 268.00 696.0 ▅▇▁▁▁
AGE 0 1.00 54.79 9.56 32.00 48.00 54.00 62.00 81.0 ▂▇▇▅▁
SYSBP 0 1.00 136.32 22.80 83.50 120.00 132.00 149.00 295.0 ▆▇▁▁▁
DIABP 0 1.00 83.04 11.66 30.00 75.00 82.00 90.00 150.0 ▁▅▇▁▁
CIGPDAY 79 0.99 8.25 12.19 0.00 0.00 0.00 20.00 90.0 ▇▂▁▁▁
BMI 52 1.00 25.88 4.10 14.43 23.09 25.48 28.07 56.8 ▃▇▁▁▁
DIABETES 0 1.00 0.05 0.21 0.00 0.00 0.00 0.00 1.0 ▇▁▁▁▁
framinghamSub %>% ggplot(aes(x = SYSBP,
                            y = DIABP)) + 
  geom_point(alpha = 0.2) + 
  facet_grid(.~SEX)

mapping cigarettes per day

scatter <- framinghamSub %>% ggplot(aes(x = SYSBP,
                            y = DIABP,
                            size = CIGPDAY,
                            color = SEX)) + 
  geom_point(alpha = 0.2) + 
  facet_grid(.~SEX) + 
  scale_color_colorblind() + 
  labs(title = "Systolic by diastolic blood pressure",
       x = "Systolic BP (mg/dL",
      y = "Diastolic BP (mg/dL)",
      caption = "Data source: Framingham Heart STudy & the reiskCommunicator package" ) + 
  guides(color = FALSE) +
  geom_smooth(se = FALSE, method = "lm", size = 1) +
  theme_bw() +
  theme(legend.position = "bottom") 

side by side Box plot

next we create side by side box plots

ggplot(data = framinghamSub,
       aes(x = CURSMOKE,
           y = TOTCHOL,
           fill = CURSMOKE)) + 
  geom_boxplot() + 
  facet_grid(.~SEX) +
  scale_fill_viridis_d() +
  labs(title = "Total cholesterol by smoking status",
       x = "Curent smoker",
       y = "Serum Total Cholesterol",
       caption = "Data source: Framingham Heart Study & the riskCommunicater package") +
  theme_bw() +
  theme(text = element_text(face = "bold"), 
        legend.position = "none",
        axis.title = element_text(size = 14)) 

##line chart

linechart<-framinghamSub %>% ggplot() +
  stat_summary(aes(x = AGE,
                   y = CIGPDAY,
               group = SEX, color = SEX),
               geom = "line", size = 1, fun.y = mean) +
  labs(title = "Average cigarettes per day by age and sex",
       x = "Age",
       y = "Average cigarettes per day" ,
       color = "Sex") + 
  scale_y_continuous(breaks = c(0,4,8,12,16)) +
  scale_color_viridis_d() + 
  theme_bw() 

combine plots

now combining scatter and line plots with patchwork package

linechart / scatter

##faceted bar chart creating a new variable based on the intervals defined by John Hopkins Hospital

framinghamSub <- framinghamSub %>% 
  mutate(CholesterolCat = case_when(TOTCHOL < 200 ~ "Normal",
                                    TOTCHOL >= 200 &  TOTCHOL < 240 ~ "Borderline high",
                                    TOTCHOL > 240 ~ "High",
                         TRUE ~ as.character(NA)))
framinghamSub %>% filter(AGE >= 40, !is.na(CholesterolCat)) %>% ggplot(aes(x = CholesterolCat)) + 
  geom_bar() 

Now we order the bars based on the ordering of the categories.

framinghamSub %>% filter(AGE >= 40, !is.na(CholesterolCat)) %>% ggplot(aes(x = fct_relevel(CholesterolCat, "Normal", "Borderline high", "High"), fill = CholesterolCat)) + 
  facet_grid(.~SEX) +
  labs(title = "Distribution of cholestoerol levels",
       x = "Cholesterol level",
       y = "Count",
       caption = "Data source: Framingham Heart Study & the riskCommunicater package") +
  scale_fill_viridis_d() + 
  geom_bar(color = "black") +
  theme_bw() +
  theme(legend.position = "none")

flip to a horizontal bar chart

framinghamSub %>% filter(AGE >= 40, !is.na(CholesterolCat)) %>% ggplot(aes(x = fct_relevel(CholesterolCat, "Normal", "Borderline high", "High"), fill = CholesterolCat)) + 
  facet_grid(.~SEX) +
  labs(title = "Distribution of cholestoerol levels",
       x = "Cholesterol level",
       y = "Count",
       caption = "Data source: Framingham Heart Study & the riskCommunicater package") +
  scale_fill_viridis_d() + 
  geom_bar(color = "black") +
  coord_flip() +
  theme_bw() +
  theme(legend.position = "none")