library(riskCommunicator)
library(tidyverse)
library(skimr)
library(knitr)
library(ggthemes)
library(patchwork)
data(framingham, package = "riskCommunicator")
glimpse(framingham)
## Rows: 11,627
## Columns: 39
## $ RANDID <dbl> 2448, 2448, 6238, 6238, 6238, 9428, 9428, 10552, 10552, 11252…
## $ SEX <dbl> 1, 1, 2, 2, 2, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1…
## $ TOTCHOL <dbl> 195, 209, 250, 260, 237, 245, 283, 225, 232, 285, 343, NA, 22…
## $ AGE <dbl> 39, 52, 46, 52, 58, 48, 54, 61, 67, 46, 51, 58, 43, 49, 55, 6…
## $ SYSBP <dbl> 106.0, 121.0, 121.0, 105.0, 108.0, 127.5, 141.0, 150.0, 183.0…
## $ DIABP <dbl> 70.0, 66.0, 81.0, 69.5, 66.0, 80.0, 89.0, 95.0, 109.0, 84.0, …
## $ CURSMOKE <dbl> 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0…
## $ CIGPDAY <dbl> 0, 0, 0, 0, 0, 20, 30, 30, 20, 23, 30, 30, 0, 0, 0, 0, 0, 20,…
## $ BMI <dbl> 26.97, NA, 28.73, 29.43, 28.50, 25.34, 25.34, 28.58, 30.18, 2…
## $ DIABETES <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0…
## $ BPMEDS <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0…
## $ HEARTRTE <dbl> 80, 69, 95, 80, 80, 75, 75, 65, 60, 85, 90, 74, 77, 120, 86, …
## $ GLUCOSE <dbl> 77, 92, 76, 86, 71, 70, 87, 103, 89, 85, 72, NA, 99, 86, 81, …
## $ educ <dbl> 4, 4, 2, 2, 2, 1, 1, 3, 3, 3, 3, 3, 2, 2, 2, 1, 1, 2, 2, 2, 1…
## $ PREVCHD <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0…
## $ PREVAP <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0…
## $ PREVMI <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ PREVSTRK <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ PREVHYP <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1…
## $ TIME <dbl> 0, 4628, 0, 2156, 4344, 0, 2199, 0, 1977, 0, 2072, 4285, 0, 2…
## $ PERIOD <dbl> 1, 3, 1, 2, 3, 1, 2, 1, 2, 1, 2, 3, 1, 2, 3, 1, 2, 1, 2, 3, 1…
## $ HDLC <dbl> NA, 31, NA, NA, 54, NA, NA, NA, NA, NA, NA, NA, NA, NA, 46, N…
## $ LDLC <dbl> NA, 178, NA, NA, 141, NA, NA, NA, NA, NA, NA, NA, NA, NA, 135…
## $ DEATH <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ ANGINA <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0…
## $ HOSPMI <dbl> 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ MI_FCHD <dbl> 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0…
## $ ANYCHD <dbl> 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0…
## $ STROKE <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ CVD <dbl> 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0…
## $ HYPERTEN <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ TIMEAP <dbl> 8766, 8766, 8766, 8766, 8766, 8766, 8766, 2956, 2956, 8766, 8…
## $ TIMEMI <dbl> 6438, 6438, 8766, 8766, 8766, 8766, 8766, 2956, 2956, 8766, 8…
## $ TIMEMIFC <dbl> 6438, 6438, 8766, 8766, 8766, 8766, 8766, 2956, 2956, 8766, 8…
## $ TIMECHD <dbl> 6438, 6438, 8766, 8766, 8766, 8766, 8766, 2956, 2956, 8766, 8…
## $ TIMESTRK <dbl> 8766, 8766, 8766, 8766, 8766, 8766, 8766, 2089, 2089, 8766, 8…
## $ TIMECVD <dbl> 6438, 6438, 8766, 8766, 8766, 8766, 8766, 2089, 2089, 8766, 8…
## $ TIMEDTH <dbl> 8766, 8766, 8766, 8766, 8766, 8766, 8766, 2956, 2956, 8766, 8…
## $ TIMEHYP <dbl> 8766, 8766, 8766, 8766, 8766, 8766, 8766, 0, 0, 4285, 4285, 4…
framinghamSub <-framingham %>% select(RANDID:DIABETES) %>%
mutate(SEX = case_when(SEX == 1 ~ "Male"
,SEX == 2 ~ "Female",
TRUE ~ as.character(NA)),
CURSMOKE = case_when(CURSMOKE == 0 ~ "No",
CURSMOKE == 1 ~ "Yes",
TRUE ~ as.character(NA)))
skim(framinghamSub)
| Name | framinghamSub |
| Number of rows | 11627 |
| Number of columns | 10 |
| _______________________ | |
| Column type frequency: | |
| character | 2 |
| numeric | 8 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| SEX | 0 | 1 | 4 | 6 | 0 | 2 | 0 |
| CURSMOKE | 0 | 1 | 2 | 3 | 0 | 2 | 0 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| RANDID | 0 | 1.00 | 5004740.92 | 2900877.44 | 2448.00 | 2474378.00 | 5006008.00 | 7472730.00 | 9999312.0 | ▇▇▇▇▇ |
| TOTCHOL | 409 | 0.96 | 241.16 | 45.37 | 107.00 | 210.00 | 238.00 | 268.00 | 696.0 | ▅▇▁▁▁ |
| AGE | 0 | 1.00 | 54.79 | 9.56 | 32.00 | 48.00 | 54.00 | 62.00 | 81.0 | ▂▇▇▅▁ |
| SYSBP | 0 | 1.00 | 136.32 | 22.80 | 83.50 | 120.00 | 132.00 | 149.00 | 295.0 | ▆▇▁▁▁ |
| DIABP | 0 | 1.00 | 83.04 | 11.66 | 30.00 | 75.00 | 82.00 | 90.00 | 150.0 | ▁▅▇▁▁ |
| CIGPDAY | 79 | 0.99 | 8.25 | 12.19 | 0.00 | 0.00 | 0.00 | 20.00 | 90.0 | ▇▂▁▁▁ |
| BMI | 52 | 1.00 | 25.88 | 4.10 | 14.43 | 23.09 | 25.48 | 28.07 | 56.8 | ▃▇▁▁▁ |
| DIABETES | 0 | 1.00 | 0.05 | 0.21 | 0.00 | 0.00 | 0.00 | 0.00 | 1.0 | ▇▁▁▁▁ |
framinghamSub %>% ggplot(aes(x = SYSBP,
y = DIABP)) +
geom_point(alpha = 0.2) +
facet_grid(.~SEX)
mapping cigarettes per day
scatter <- framinghamSub %>% ggplot(aes(x = SYSBP,
y = DIABP,
size = CIGPDAY,
color = SEX)) +
geom_point(alpha = 0.2) +
facet_grid(.~SEX) +
scale_color_colorblind() +
labs(title = "Systolic by diastolic blood pressure",
x = "Systolic BP (mg/dL",
y = "Diastolic BP (mg/dL)",
caption = "Data source: Framingham Heart STudy & the reiskCommunicator package" ) +
guides(color = FALSE) +
geom_smooth(se = FALSE, method = "lm", size = 1) +
theme_bw() +
theme(legend.position = "bottom")
next we create side by side box plots
ggplot(data = framinghamSub,
aes(x = CURSMOKE,
y = TOTCHOL,
fill = CURSMOKE)) +
geom_boxplot() +
facet_grid(.~SEX) +
scale_fill_viridis_d() +
labs(title = "Total cholesterol by smoking status",
x = "Curent smoker",
y = "Serum Total Cholesterol",
caption = "Data source: Framingham Heart Study & the riskCommunicater package") +
theme_bw() +
theme(text = element_text(face = "bold"),
legend.position = "none",
axis.title = element_text(size = 14))
##line chart
linechart<-framinghamSub %>% ggplot() +
stat_summary(aes(x = AGE,
y = CIGPDAY,
group = SEX, color = SEX),
geom = "line", size = 1, fun.y = mean) +
labs(title = "Average cigarettes per day by age and sex",
x = "Age",
y = "Average cigarettes per day" ,
color = "Sex") +
scale_y_continuous(breaks = c(0,4,8,12,16)) +
scale_color_viridis_d() +
theme_bw()
now combining scatter and line plots with patchwork package
linechart / scatter
##faceted bar chart creating a new variable based on the intervals
defined by John Hopkins Hospital
framinghamSub <- framinghamSub %>%
mutate(CholesterolCat = case_when(TOTCHOL < 200 ~ "Normal",
TOTCHOL >= 200 & TOTCHOL < 240 ~ "Borderline high",
TOTCHOL > 240 ~ "High",
TRUE ~ as.character(NA)))
framinghamSub %>% filter(AGE >= 40, !is.na(CholesterolCat)) %>% ggplot(aes(x = CholesterolCat)) +
geom_bar()
Now we order the bars based on the ordering of the categories.
framinghamSub %>% filter(AGE >= 40, !is.na(CholesterolCat)) %>% ggplot(aes(x = fct_relevel(CholesterolCat, "Normal", "Borderline high", "High"), fill = CholesterolCat)) +
facet_grid(.~SEX) +
labs(title = "Distribution of cholestoerol levels",
x = "Cholesterol level",
y = "Count",
caption = "Data source: Framingham Heart Study & the riskCommunicater package") +
scale_fill_viridis_d() +
geom_bar(color = "black") +
theme_bw() +
theme(legend.position = "none")
flip to a horizontal bar chart
framinghamSub %>% filter(AGE >= 40, !is.na(CholesterolCat)) %>% ggplot(aes(x = fct_relevel(CholesterolCat, "Normal", "Borderline high", "High"), fill = CholesterolCat)) +
facet_grid(.~SEX) +
labs(title = "Distribution of cholestoerol levels",
x = "Cholesterol level",
y = "Count",
caption = "Data source: Framingham Heart Study & the riskCommunicater package") +
scale_fill_viridis_d() +
geom_bar(color = "black") +
coord_flip() +
theme_bw() +
theme(legend.position = "none")