Enhanced Data Visualization in R Using ggplot2

Ronald Wesonga (Ph.D)

25 August 2022

Introduction

The ggplot2 package

What is a grammar of graphics?

Elements of grammar of graphics

Uganda NCD Baseline Data

See the survey documentation for details about the Uganda National Baseline Survey data:

# Load the Uganda NCD baseline survey extract used throughout this document.
ncdUG <- read.csv("~/Documents/EASI/Workshop/2022/SSP/ncdUG2014.csv")

# Recode the categorical columns as factors with short display labels.
# Columns are referenced as ncdUG$<name> instead of via attach(): attaching
# the data frame before and again after recoding leaves two copies on the
# search path, so a bare name such as `cvd` silently refers to whichever copy
# masks the other. The plotting code shown here always passes ncdUG to
# ggplot() explicitly, so no attached copy is required.
# Values not listed in `levels` become NA.
ncdUG$residence <- factor(ncdUG$residence, levels = c("Urban", "Rural"))

ncdUG$hypertension <- factor(
  ncdUG$hypertension,
  levels = c(
    "Normal Blood Pressure",
    "Hypertension or Taking Medication for Hypertension"
  ),
  labels = c("Normal", "Hypertensive")
)

ncdUG$cvd <- factor(
  ncdUG$cvd,
  levels = c("NO CVD", "CVD"),
  labels = c("Normal", "CVD")
)

ncdUG$mstatus <- factor(
  ncdUG$mstatus,
  levels = c("Single", "Married", "Separated/Divorced", "Widowed"),
  labels = c("Single", "Married", "Divorced", "Widowed")
)

ncdUG$heduc <- factor(
  ncdUG$heduc,
  levels = c("none", "primary", "secondary", "university+"),
  labels = c("None", "Primary", "Secondary", "University")
)

# NOTE(review): the level strings (including the "glocuse" spelling)
# presumably mirror the raw values in the CSV and are left untouched;
# only the displayed label "Prediabetic" has had its spelling corrected.
ncdUG$diabetes <- factor(
  ncdUG$diabetes,
  levels = c(
    "blood glocuse < 6.1",
    "blood glocuse >=6.1 AND < 7.1",
    "blood glocuse >=7.1 or took meds today"
  ),
  labels = c("Normal", "Prediabetic", "Diabetic")
)

ncdUG$smoke <- factor(
  ncdUG$smoke,
  levels = c(0, 1),
  labels = c("None", "Smoker")
)

ncdUG$hregion <- factor(
  ncdUG$hregion,
  levels = c("Northern", "Eastern", "Central", "Western")
)

The ggplot() function and aesthetics

# Data and aesthetic mappings go to ggplot(); geom_point() adds the point layer.
ggplot(data = ncdUG, mapping = aes(x = diastolic, y = systolic)) +
  geom_point()
## Warning: Removed 81 rows containing missing values (geom_point).

Layers and overriding aesthetics

# Data and mappings only: with no geom layer added, nothing is drawn on the axes.
ggplot(data = ncdUG, mapping = aes(x = diastolic, y = systolic))

# Scatter plot of systolic against diastolic pressure,
# with a rug plot coloured by age.
# x = diastolic and y = systolic are set in ggplot(), so every layer inherits
# them; color = age is given only inside geom_rug(), so it affects the rug alone.
ggplot(data = ncdUG, mapping = aes(x = diastolic, y = systolic)) +
  geom_point() +
  geom_rug(mapping = aes(color = age))
## Warning: Removed 81 rows containing missing values (geom_point).

Aesthetics

Mapping vs setting

Map aesthetics to variables inside the aes() function. By mapping, we mean the aesthetic will vary as the variable varies. For example, mapping x=time causes the position of the plotted data to vary with values of variable “time”. Similarly, mapping color=group causes the color of objects to vary with values of variable “group”.

# Mapping: color is tied to the age variable inside aes(), so it varies by row.
ggplot(data = ncdUG, mapping = aes(x = diastolic, y = systolic)) +
  geom_point(mapping = aes(color = age))
## Warning: Removed 81 rows containing missing values (geom_point).

# Setting: color is fixed to "green" outside aes(), so every point is green.
ggplot(data = ncdUG, mapping = aes(x = diastolic, y = systolic)) +
  geom_point(color = "green")
## Warning: Removed 81 rows containing missing values (geom_point).

Geoms

Geoms and aesthetics

Histogram

# Histogram of age using the default binning.
ggplot(data = ncdUG, mapping = aes(x = age)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Density Plots

# Density estimate of age for the whole sample.
ggplot(data = ncdUG, mapping = aes(x = age)) +
  geom_density()

# One density curve per marital status, distinguished by line color.
ggplot(data = ncdUG, mapping = aes(x = age, color = mstatus)) +
  geom_density()

Boxplots

# Boxplots of age, one box per marital status.
ggplot(data = ncdUG, mapping = aes(x = mstatus, y = age)) +
  geom_boxplot()

Bar Plots

# Bar chart counting respondents at each education level.
ggplot(data = ncdUG, mapping = aes(x = heduc)) +
  geom_bar()

# Same bars, with each bar's fill split by region.
ggplot(data = ncdUG, mapping = aes(x = heduc, fill = hregion)) +
  geom_bar()

Scatter Plot

# Plain scatter plot: diastolic on x, systolic on y.
ggplot(data = ncdUG, mapping = aes(x = diastolic, y = systolic)) +
  geom_point()
## Warning: Removed 81 rows containing missing values (geom_point).

# Same scatter with three further variables mapped to point appearance:
# age -> color, fvservings -> transparency, bmi -> point size.
ggplot(
  data = ncdUG,
  mapping = aes(
    x = diastolic,
    y = systolic,
    color = age,
    alpha = fvservings,
    size = bmi
  )
) +
  geom_point()
## Warning: Removed 298 rows containing missing values (geom_point).

Line graphs

# Separate lines for each diabetes category via the group aesthetic
# (all lines share the same appearance).
ggplot(
  data = ncdUG,
  mapping = aes(x = fvservings, y = diastolic, group = diabetes)
) +
  geom_line()
## Warning: Removed 13 row(s) containing missing values (geom_path).

# No grouping: a single line through all observations.
ggplot(data = ncdUG, mapping = aes(x = fvservings, y = diastolic)) +
  geom_line()
## Warning: Removed 12 row(s) containing missing values (geom_path).

# Mapping color to diabetes both splits the lines and colors them.
ggplot(
  data = ncdUG,
  mapping = aes(x = fvservings, y = diastolic, color = diabetes)
) +
  geom_line()
## Warning: Removed 13 row(s) containing missing values (geom_path).

Statistics

# Summarise diastolic (y) at each value of fvservings (x).
# No summary function is supplied, so stat_summary() falls back to its default.
ggplot(data = ncdUG, mapping = aes(x = fvservings, y = diastolic)) +
  stat_summary()
## Warning: Removed 91 rows containing non-finite values (stat_summary).
## No summary function supplied, defaulting to `mean_se()`
## Warning: Removed 23 rows containing missing values (geom_segment).

Scales

# Default palette: ggplot2 picks the colors for the heduc levels.
ggplot(
  data = ncdUG,
  mapping = aes(x = fvservings, y = diastolic, color = heduc)
) +
  geom_point()
## Warning: Removed 91 rows containing missing values (geom_point).

# Manual palette: scale_color_manual() supplies one color per heduc level.
ggplot(
  data = ncdUG,
  mapping = aes(x = fvservings, y = diastolic, color = heduc)
) +
  geom_point() +
  scale_color_manual(values = c("red", "yellow", "green", "blue"))
## Warning: Removed 91 rows containing missing values (geom_point).

Scale functions for the axes

# Starting point: scatter with a manual color palette and default y-axis ticks.
ggplot(
  data = ncdUG,
  mapping = aes(x = fvservings, y = diastolic, color = heduc)
) +
  geom_point() +
  scale_color_manual(values = c("red", "orange", "green", "blue"))
## Warning: Removed 91 rows containing missing values (geom_point).

# Put tick marks every 10 units from 40 to 200 along the y-axis using the
# breaks argument of scale_y_continuous(). seq() generates the same values as
# typing them out, without the risk of a skipped or mistyped entry.
ggplot(
  data = ncdUG,
  mapping = aes(x = fvservings, y = diastolic, color = heduc)
) +
  geom_point() +
  scale_color_manual(values = c("red", "orange", "green", "blue")) +
  scale_y_continuous(breaks = seq(40, 200, by = 10))
## Warning: Removed 91 rows containing missing values (geom_point).

# Relabel the tick marks in tens of units (40 -> 4, ..., 200 -> 20) using
# labels; breaks and labels must have the same length (17 values each).
ggplot(
  data = ncdUG,
  mapping = aes(x = fvservings, y = diastolic, color = heduc)
) +
  geom_point() +
  scale_color_manual(values = c("red", "orange", "green", "blue")) +
  scale_y_continuous(
    breaks = seq(40, 200, by = 10),
    labels = seq(4, 20, by = 1)
  )
## Warning: Removed 91 rows containing missing values (geom_point).

# Retitle the y-axis with the name argument so the title states the new units.
ggplot(
  data = ncdUG,
  mapping = aes(x = fvservings, y = diastolic, color = heduc)
) +
  geom_point() +
  scale_color_manual(values = c("red", "orange", "green", "blue")) +
  scale_y_continuous(
    breaks = seq(40, 200, by = 10),
    labels = seq(4, 20, by = 1),
    name = "Diastolic(Tens of Units)"
  )
## Warning: Removed 91 rows containing missing values (geom_point).

Modifying axis limits and titles

# Axis limits: give the lower and upper bound to one of the limits functions.
# Here the x-axis (fvservings) is shown from 0 to 30.
ggplot(
  data = ncdUG,
  mapping = aes(x = fvservings, y = diastolic, color = heduc)
) +
  geom_point() +
  xlim(0, 30)
## Warning: Removed 91 rows containing missing values (geom_point).

# labs() sets the plot title, the axis titles and the legend (guide) titles.

ggplot(
  data = ncdUG,
  mapping = aes(x = fvservings, y = diastolic, color = heduc)
) +
  geom_point() +
  labs(
    x = "FV Servings",
    y = "Diastolic",
    color = "Education",
    title = "Diastolic vs FV servings by Education"
  )
## Warning: Removed 91 rows containing missing values (geom_point).

Guides visualize scales

# guides(color = "none") suppresses the legend for the color scale,
# so the plot is drawn without a legend on the right.
ggplot(
  data = ncdUG,
  mapping = aes(x = diastolic, y = systolic, color = age)
) +
  geom_point() +
  guides(color = "none")
## Warning: Removed 81 rows containing missing values (geom_point).

Coordinate systems

Coordinate systems define the planes on which objects are positioned in space on the plot. Most plots use Cartesian coordinate systems, as do all the plots in the seminar. Nevertheless, ggplot2 provides multiple coordinate systems, including polar, flipped Cartesian and map projections.

Faceting (paneling)

# facet_wrap() draws one panel per level of heduc, laid out as a wrapped ribbon.
ggplot(data = ncdUG, mapping = aes(x = fvservings, y = diastolic)) +
  geom_point() +
  facet_wrap(facets = ~heduc)
## Warning: Removed 91 rows containing missing values (geom_point).

Themes

Themes control elements of the graph not related to the data. For example:

background color
size of fonts
gridlines
color of labels

To modify these, we use the theme() function, which has a large number of arguments called theme elements, which control various non-data elements of the graph.

Some example theme() arguments and what aspect of the graph they control:

axis.line : lines forming x-axis and y-axis
axis.line.x: just the line for x-axis
legend.position: positioning of the legend on the graph
panel.background: the background of the graph
panel.border: the border around the graph
title: all titles on the graph

Specifying theme() Elements

# The x- and y-axes are lines, both controlled by the theme() argument
# axis.line; their visual properties, such as color and size (thickness),
# are given as arguments to element_line().
# NOTE(review): ggplot2 >= 3.4.0 deprecates `size` for line elements in favour
# of `linewidth`; `size` is kept to match the ggplot2 version that produced
# the output shown in this document.

ggplot(
  data = ncdUG,
  mapping = aes(x = diastolic, y = systolic, color = age)
) +
  geom_point() +
  theme(
    axis.line = element_line(color = "black", size = 2) # size in mm
  )
## Warning: Removed 81 rows containing missing values (geom_point).

# The background of the graph, controlled by the theme() argument
# panel.background, is a rectangle, so its fill color and border color are
# specified with element_rect().

ggplot(
  data = ncdUG,
  mapping = aes(x = diastolic, y = systolic, color = age)
) +
  geom_point() +
  theme(
    axis.line = element_line(color = "black", size = 2),
    # in element_rect(), color is the border color
    panel.background = element_rect(fill = "white", color = "gray")
  )
## Warning: Removed 81 rows containing missing values (geom_point).

# element_text() controls properties such as the font family or face
# ("bold", "italic", "bold.italic") of text elements like title, which
# covers the titles of both axes.
ggplot(
  data = ncdUG,
  mapping = aes(x = diastolic, y = systolic, color = age)
) +
  geom_point() +
  theme(
    axis.line = element_line(color = "black", size = 2),
    panel.background = element_rect(fill = "white", color = "gray"),
    title = element_text(family = "serif", face = "bold")
  )
## Warning: Removed 81 rows containing missing values (geom_point).

# Some theme() arguments take plain values rather than element_ functions,
# e.g. legend.position, which accepts "none", "left", "right", "bottom"
# and "top".
ggplot(
  data = ncdUG,
  mapping = aes(x = diastolic, y = systolic, color = age)
) +
  geom_point() +
  theme(
    axis.line = element_line(color = "black", size = 2),
    panel.background = element_rect(fill = "white", color = "gray"),
    title = element_text(family = "serif", face = "bold"),
    legend.position = "bottom"
  )
## Warning: Removed 81 rows containing missing values (geom_point).

Changing the overall look with complete themes

The ggplot2 package provides a few complete themes which make several changes to the overall background look of the graphic (see the ggplot2 "Complete themes" reference page for a full description). Examples:

- theme_bw()
- theme_light()
- theme_dark()
- theme_classic()

The themes usually adjust the color of the background and most of the lines that make up the non-data portion of the graph.

# theme_classic() gives a look similar to base R graphics.

ggplot(
  data = ncdUG,
  mapping = aes(x = diastolic, y = systolic, color = age)
) +
  geom_point() +
  theme_classic()
## Warning: Removed 81 rows containing missing values (geom_point).

# theme_dark() changes the look dramatically.

ggplot(
  data = ncdUG,
  mapping = aes(x = diastolic, y = systolic, color = age)
) +
  geom_point() +
  theme_dark()
## Warning: Removed 81 rows containing missing values (geom_point).

Saving plots to files

# With only a file name, ggsave() writes the last displayed plot (here as PDF).
ggsave(filename = "plot.pdf")
## Saving 8 x 6 in image
## Warning: Removed 81 rows containing missing values (geom_point).
# When working with many graphs, keep each one in its own R object.
p <- ggplot(data = ncdUG, mapping = aes(x = diastolic, y = systolic)) +
  geom_point()
# The plot argument of ggsave() then selects which stored plot to write,
# instead of the last one displayed.
ggsave(filename = "diasys.png", plot = p)
## Saving 8 x 6 in image
## Warning: Removed 81 rows containing missing values (geom_point).