Intermediate ggplot2

Learning Objectives

  • Conduct exploratory data analysis

  • Use geom_point() and geom_smooth()

  • Use complete themes in ggplot2

  • Customize theme elements in ggplot2

  • Create and customize bar charts in ggplot2

  • Facet and combine plots with the patchwork package

Let’s load some R packages to start

library(riskCommunicator)
library(tidyverse)
library(skimr)
library(knitr)
library(ggthemes)
library(ggpubr)
library(patchwork)

Let’s load the FHS data set from the riskCommunicator package

# Load the framingham data set that ships with the riskCommunicator package
data("framingham", package = "riskCommunicator")

Select the first 10 variables from the Framingham dataset and store it as a new data frame called framinghamSub using the select() function.

# Keep only the first ten columns of framingham (selected by position)
framinghamSub <- dplyr::select(framingham, 1:10)

Update the SEX variable to have the values “Male” and “Female” rather than 1 and 2.

# Subset the first ten columns and add a labelled sex column (newSex).
# SEX codes 1 and 2 become "Male" and "Female"; any other value stays missing.
framinghamSub <- framingham |>
  dplyr::select(1:10) |>
  mutate(
    newSex = case_when(
      SEX == 1 ~ "Male",
      SEX == 2 ~ "Female",
      # A typed NA keeps every right-hand side character, which dplyr
      # releases before 1.1.0 require.
      TRUE ~ NA_character_
    )
  )

Update the CURSMOKE variable to have the values “Yes” and “No” rather than 1 and 0 using the mutate() and case_when() functions. This should be your new data set to be used for the rest of the assignment.

# Subset the first ten columns and add labelled versions of SEX and CURSMOKE.
# This is the data set used by the rest of the document.
framinghamSub <- framingham |>
  dplyr::select(1:10) |>
  mutate(
    # SEX: 1 -> "Male", 2 -> "Female"
    newSex = case_when(
      SEX == 1 ~ "Male",
      SEX == 2 ~ "Female",
      # Unknown or missing codes stay missing instead of becoming a
      # made-up "Other" category.
      TRUE ~ NA_character_
    ),
    # CURSMOKE: 1 -> "Yes", 0 -> "No"
    newCurrentSmoker = case_when(
      CURSMOKE == 1 ~ "Yes",
      CURSMOKE == 0 ~ "No",
      TRUE ~ NA_character_
    )
  )

Use the skim() function from the skimr package to explore other characteristics of the subset of the data.

# Summarise every column of the subset with skimr
framinghamSub |>
  skim()
Data summary
Name framinghamSub
Number of rows 11627
Number of columns 12
_______________________
Column type frequency:
character 2
numeric 10
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
newSex 0 1 4 6 0 2 0
newCurrentSmoker 0 1 2 3 0 2 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
RANDID 0 1.00 5004740.92 2900877.44 2448.00 2474378.00 5006008.00 7472730.00 9999312.0 ▇▇▇▇▇
SEX 0 1.00 1.57 0.50 1.00 1.00 2.00 2.00 2.0 ▆▁▁▁▇
TOTCHOL 409 0.96 241.16 45.37 107.00 210.00 238.00 268.00 696.0 ▅▇▁▁▁
AGE 0 1.00 54.79 9.56 32.00 48.00 54.00 62.00 81.0 ▂▇▇▅▁
SYSBP 0 1.00 136.32 22.80 83.50 120.00 132.00 149.00 295.0 ▆▇▁▁▁
DIABP 0 1.00 83.04 11.66 30.00 75.00 82.00 90.00 150.0 ▁▅▇▁▁
CURSMOKE 0 1.00 0.43 0.50 0.00 0.00 0.00 1.00 1.0 ▇▁▁▁▆
CIGPDAY 79 0.99 8.25 12.19 0.00 0.00 0.00 20.00 90.0 ▇▂▁▁▁
BMI 52 1.00 25.88 4.10 14.43 23.09 25.48 28.07 56.8 ▃▇▁▁▁
DIABETES 0 1.00 0.05 0.21 0.00 0.00 0.00 0.00 1.0 ▇▁▁▁▁

Set a default ggplot theme for the entire document to a complete theme of your choice from the ggplot2 package or the ggthemes package.

# Make theme_few() from ggthemes the default theme for all later plots
ggthemes::theme_few() |>
  theme_set()

Scatter plot

Make a scatter plot between diastolic (DIABP) and systolic (SYSBP) blood pressure with a “facet” by the sex of the participant (SEX). Also, manually set the alpha aesthetic to be 0.2.

# Diastolic vs. systolic blood pressure, one panel per sex
ggplot(framinghamSub, aes(x = SYSBP, y = DIABP)) +
  geom_point() +
  facet_grid(cols = vars(newSex)) +
  labs(
    x = "Systolic BP (mmHg)",
    y = "Diastolic BP (mmHg)",
    title = "Diastolic by systolic blood-pressure",
    caption = "Data source: Framingham Heart Study"
  )

Manually set alpha aesthetic to be 0.2

# Same scatter plot, with semi-transparent points to reduce overplotting
ggplot(framinghamSub, aes(x = SYSBP, y = DIABP)) +
  geom_point(alpha = 0.2) +
  facet_grid(cols = vars(newSex)) +
  labs(
    x = "Systolic BP (mmHg)",
    y = "Diastolic BP (mmHg)",
    title = "Diastolic by systolic blood-pressure",
    caption = "Data source: Framingham Heart Study"
  )

Data points size, coloring and legend customization

Also include the size of the data points as mapped by the number of cigarettes smoked per day (CIGPDAY), add a color-blind friendly palette for coloring the points, and position the legend at the bottom of the plot. Add a guides(color = 'none') layer to suppress the legend since it is redundant.

# Scatter plot with point size mapped to cigarettes per day and a
# colour-blind-friendly palette by sex
framinghamSub |>
  ggplot(aes(x = SYSBP, y = DIABP,
             size = CIGPDAY,
             color = newSex)) +
  geom_point(alpha = 0.2) +
  scale_color_colorblind() +
  facet_grid(. ~ newSex) +
  labs(title = "Diastolic by systolic blood-pressure",
       y = "Diastolic BP (mmHg)",
       x = "Systolic BP (mmHg)",
       size = "Cigarettes smoked per day",
       caption = "Data source: Framingham Heart Study") +
  # Colour repeats what the facet strips already show, so drop its legend
  guides(color = "none") +
  theme(legend.position = "bottom")

Simple linear regression line

Add a line of best fit corresponding to a simple linear regression model fit separately for males and females using geom_smooth().

# Scatter plot with a simple linear regression line in each sex panel
framinghamSub |>
  ggplot(aes(x = SYSBP, y = DIABP, color = newSex)) +
  # Size is mapped on the point layer only, so the regression line layer
  # does not inherit a size aesthetic it cannot use.
  geom_point(aes(size = CIGPDAY), alpha = 0.2) +
  # linewidth replaces size for line geoms (ggplot2 >= 3.4.0)
  geom_smooth(method = "lm", color = "dodgerblue", se = FALSE,
              linewidth = 1) +
  scale_color_colorblind() +
  facet_grid(. ~ newSex) +
  labs(title = "Diastolic by systolic blood-pressure",
       y = "Diastolic BP (mmHg)",
       x = "Systolic BP (mmHg)",
       size = "Cigarettes smoked per day",
       caption = "Data source: Framingham Heart Study") +
  # Colour repeats what the facet strips already show, so drop its legend
  guides(color = "none") +
  theme(legend.position = "bottom")

Adding the regression equations

Add the estimated regression equations to each subplot using the ggpubr package and adding a stat_regline_equation(label.x = 210, label.y = 40, size = 3.2) layer.

# Scatter plot with per-sex regression lines and their fitted equations.
# Stored as my_scatter so it can be combined with other plots later.
my_scatter <- framinghamSub |>
  ggplot(aes(x = SYSBP, y = DIABP, color = newSex)) +
  # Size is mapped on the point layer only, so the regression line layer
  # does not inherit a size aesthetic it cannot use.
  geom_point(aes(size = CIGPDAY), alpha = 0.2) +
  # linewidth replaces size for line geoms (ggplot2 >= 3.4.0)
  geom_smooth(method = "lm", color = "dodgerblue", se = FALSE,
              linewidth = 1) +
  scale_color_colorblind() +
  facet_grid(. ~ newSex) +
  labs(title = "Diastolic by systolic blood-pressure",
       y = "Diastolic BP (mmHg)",
       x = "Systolic BP (mmHg)",
       size = "Cigarettes smoked per day",
       caption = "Data source: Framingham Heart Study") +
  # Colour repeats what the facet strips already show, so drop its legend
  guides(color = "none") +
  # Regression equation label from ggpubr, placed at (210, 40) in data units
  stat_regline_equation(label.x = 210, label.y = 40, size = 2.5) +
  theme(legend.position = "bottom")
my_scatter

Side-by-side box-plot

Next, create a side-by-side box-plot where the y-axis is total cholesterol (TOTCHOL) and the x-axis is current smoking status (CURSMOKE). Increase the font size for the axes and title text in the plot.

# Side-by-side box plots of total cholesterol by current smoking status
framinghamSub |>
  ggplot(aes(x = newCurrentSmoker,
             y = TOTCHOL)) +
  geom_boxplot() +
  labs(title = "Total Cholesterol by Smoking Status",
       x = "Current Smoker",
       # Cholesterol concentration is reported in mg/dL
       y = "Serum total cholesterol (mg/dL)",
       caption = "Data source: Framingham Heart Study and the riskCommunicator R package") +
  # Larger, blue plot title
  theme(plot.title = element_text(family = "", color = "blue", size = 16))

Color the boxes, customize legend, title and axis titles

Color the boxes based on smoking status by manually specifying the colors to be “mediumseagreen” and “royalblue”, remove the legend, and make the title and axis titles bold.

# Side-by-side box plots of total cholesterol, filled by smoking status
framinghamSub |>
  ggplot(aes(x = newCurrentSmoker,
             y = TOTCHOL,
             fill = newCurrentSmoker)) +
  geom_boxplot() +
  scale_fill_manual(values = c("mediumseagreen",
                               "royalblue")) +
  labs(title = "Total Cholesterol by Smoking Status",
       x = "Current Smoker",
       # Cholesterol concentration is reported in mg/dL
       y = "Serum total cholesterol (mg/dL)",
       caption = "Data source: Framingham Heart Study and the riskCommunicator R package") +
  theme(plot.title = element_text(family = "", color = "blue", size = 16)) +
  # The fill legend repeats the x axis, so hide it; `title` is the parent
  # element of the plot, axis and legend titles, so this makes them bold.
  theme(legend.position = "none",
        title = element_text(face = "bold"))

Facet box plots by the sex

In a new plot, modify the side-by-side box-plots we created to be faceted by the sex of the participant using the facet_grid() function and columns to break up the subplots.

# Box plots of total cholesterol by smoking status, one panel per sex
framinghamSub |>
  ggplot(aes(x = newCurrentSmoker,
             y = TOTCHOL,
             fill = newCurrentSmoker)) +
  geom_boxplot() +
  scale_fill_manual(values = c("mediumseagreen",
                               "royalblue")) +
  labs(title = "Total Cholesterol by Smoking Status",
       x = "Current Smoker",
       # Cholesterol concentration is reported in mg/dL
       y = "Serum total cholesterol (mg/dL)",
       caption = "Data source: Framingham Heart Study and the riskCommunicator R package") +
  # Sex in columns
  facet_grid(. ~ newSex) +
  theme(plot.title = element_text(family = "", color = "blue", size = 16)) +
  # The fill legend repeats the x axis, so hide it; `title` is the parent
  # element of the plot, axis and legend titles, so this makes them bold.
  theme(legend.position = "none",
        title = element_text(face = "bold"))

Average cigarettes per day by age for each sex graph

Make a line graph that shows the average cigarettes per day (CIGPDAY) by age (AGE), with separate lines by the sex of the participant (SEX).

# Line chart of the mean number of cigarettes per day at each age, one line
# per sex. Stored as my_linechart so it can be combined with other plots.
my_linechart <- framinghamSub |>
  ggplot(aes(x = AGE,
             y = CIGPDAY,
             group = newSex,
             color = newSex)) +
  # `fun` replaces the deprecated `fun.y` (ggplot2 >= 3.3.0) and `linewidth`
  # replaces `size` for line geoms (ggplot2 >= 3.4.0).
  stat_summary(fun = mean,
               geom = "line",
               linewidth = 1) +
  # Equivalent to seq(0, 16, by = 4)
  scale_y_continuous(breaks = c(0, 4, 8, 12, 16)) +
  scale_color_colorblind() +
  labs(title = "Average Cigarettes Per Day by Age and Sex",
       x = "Age(years)",
       y = "Average number \n of cigarettes per day",
       color = "Sex")
my_linechart

patchwork for combining plots

# patchwork's `/` operator stacks the line chart above the scatter plot
my_linechart / my_scatter

# beepr::beep(sound = "shotgun")

Faceted bar chart

# Classify total cholesterol into three categories:
#   below 200 -> "Normal", 200 up to (not including) 240 -> "Borderline high",
#   240 and above -> "High". Missing TOTCHOL stays missing.
framinghamSub <- framinghamSub |>
  mutate(
    CholesterolCat = case_when(
      TOTCHOL < 200 ~ "Normal",
      TOTCHOL >= 200 & TOTCHOL < 240 ~ "Borderline high",
      # `>=` so a value of exactly 240 is classified instead of becoming NA
      TOTCHOL >= 240 ~ "High",
      TRUE ~ NA_character_
    )
  )

Creating a bar chart

Create a bar chart displaying the number of participants falling in each cholesterol category based on Johns Hopkins’ definitions using geom_bar(). Also, remove people under 40 and those without recorded cholesterol levels (missing values for CholesterolCat) from the plot by using the code filter(AGE >= 40, !is.na(CholesterolCat)) when piping the data into each subsequent ggplot() call.

# Bar chart of participant counts per cholesterol category, one panel per sex.
# Participants under 40 or without a cholesterol category are excluded.
framinghamSub |>
  filter(AGE >= 40, !is.na(CholesterolCat)) |>
  ggplot(aes(x = CholesterolCat,
             fill = CholesterolCat)) +
  geom_bar(color = "black") +
  scale_fill_viridis_d() +
  labs(title = "Distribution of Cholesterol Levels",
       y = "Count",
       x = "Cholesterol Level",
       caption = "Data source: Framingham Heart Study and the riskCommunicator R package") +
  facet_grid(. ~ newSex) +
  # Fill repeats the x axis categories, so the legend is hidden
  theme(legend.position = "none")

Recreate the bar chart, this time reordering the categories to show Normal, Borderline high, and then High from left to right using the fct_relevel() function.

# Bar chart of cholesterol categories ordered Normal, Borderline high, High.
# Participants under 40 or without a cholesterol category are excluded.
framinghamSub |>
  filter(AGE >= 40, !is.na(CholesterolCat)) |>
  # Relevel once so the x axis and the fill colours share the same order
  mutate(CholesterolCat = fct_relevel(CholesterolCat,
                                      "Normal", "Borderline high", "High")) |>
  ggplot(aes(x = CholesterolCat,
             fill = CholesterolCat)) +
  geom_bar(color = "black") +
  scale_fill_viridis_d() +
  labs(title = "Distribution of Cholesterol Levels",
       y = "Count",
       x = "Cholesterol Level",
       caption = "Data source: Framingham Heart Study and the riskCommunicator R package") +
  facet_grid(. ~ newSex) +
  # Fill repeats the x axis categories, so the legend is hidden
  theme(legend.position = "none")

Remove the extra space between the bars and the horizontal axis using scale_y_continuous().

# Ordered bar chart with the bars sitting directly on the horizontal axis.
# Participants under 40 or without a cholesterol category are excluded.
framinghamSub |>
  filter(AGE >= 40, !is.na(CholesterolCat)) |>
  # Relevel once so the x axis and the fill colours share the same order
  mutate(CholesterolCat = fct_relevel(CholesterolCat,
                                      "Normal", "Borderline high", "High")) |>
  ggplot(aes(x = CholesterolCat,
             fill = CholesterolCat)) +
  geom_bar(color = "black") +
  scale_fill_viridis_d() +
  labs(title = "Distribution of Cholesterol Levels",
       y = "Count",
       x = "Cholesterol Level",
       caption = "Data source: Framingham Heart Study and the riskCommunicator R package") +
  # No padding below zero; 10% padding above the tallest bar
  scale_y_continuous(expand = expansion(mult = c(0, 0.1))) +
  facet_grid(. ~ newSex) +
  # Fill repeats the x axis categories, so the legend is hidden
  theme(legend.position = "none")

Lastly, use the coord_flip() function to turn the bar chart into a horizontal bar chart instead.

# Horizontal version of the ordered bar chart, one panel per sex.
# Participants under 40 or without a cholesterol category are excluded.
framinghamSub |>
  filter(AGE >= 40, !is.na(CholesterolCat)) |>
  # Relevel once so the category axis and the fill colours share the same order
  mutate(CholesterolCat = fct_relevel(CholesterolCat,
                                      "Normal", "Borderline high", "High")) |>
  ggplot(aes(x = CholesterolCat,
             fill = CholesterolCat)) +
  geom_bar(color = "black") +
  scale_fill_viridis_d() +
  labs(title = "Distribution of Cholesterol Levels",
       y = "Count",
       x = "Cholesterol Level",
       caption = "Data source: Framingham Heart Study and the riskCommunicator R package") +
  # No padding below zero; 10% padding beyond the longest bar
  scale_y_continuous(expand = expansion(mult = c(0, 0.1))) +
  facet_grid(. ~ newSex) +
  # Swap the axes so the bars run horizontally
  coord_flip() +
  # Fill repeats the category axis, so the legend is hidden
  theme(legend.position = "none")

Create a line chart showing the median BMI by age for smokers and non-smokers for those between the ages of 40 and 70, inclusive. Hint: use a scale_color_manual() layer to match the colors.

# Line chart of median BMI at each age (40 to 70 inclusive), one line per
# smoking status. Rows with missing BMI are dropped before plotting.
framinghamSub |>
  filter(AGE >= 40, AGE <= 70, !is.na(BMI)) |>
  ggplot(aes(x = AGE,
             y = BMI,
             group = newCurrentSmoker,
             color = newCurrentSmoker)) +
  # `fun` replaces the deprecated `fun.y` (ggplot2 >= 3.3.0) and `linewidth`
  # replaces `size` for line geoms (ggplot2 >= 3.4.0).
  stat_summary(fun = median,
               geom = "line",
               linewidth = 1) +
  scale_y_continuous(breaks = seq(20, 40, by = 0.5)) +
  # Single colour scale: the earlier scale_color_colorblind() layer was
  # overridden by this one, so it has been removed.
  scale_color_manual(values = c("mediumseagreen",
                                "royalblue")) +
  labs(title = "Median BMI by Age and Smoking Status",
       x = "Age(years)",
       y = "Median BMI",
       color = "newCurrentSmoker") +
  theme(legend.position = "right")