Reference lines
- Pass the yintercept argument to geom_hline() to draw a horizontal reference line.
- “h” for horizontal.
# One horizontal reference line at y = 4
ggplot(iris, aes(Sepal.Length, Petal.Length)) +
  geom_point() +
  geom_hline(yintercept = 4)

# Several horizontal reference lines: pass a vector of y positions
ggplot(iris, aes(Sepal.Length, Petal.Length)) +
  geom_point() +
  geom_hline(yintercept = c(2.5, 5))

- Pass the xintercept argument to geom_vline() to draw a vertical reference line.
- “v” for vertical.
# One vertical reference line at x = 5.5
ggplot(iris, aes(Sepal.Length, Petal.Length)) +
  geom_point() +
  geom_vline(xintercept = 5.5)

# Several vertical reference lines: pass a vector of x positions
ggplot(iris, aes(Sepal.Length, Petal.Length)) +
  geom_point() +
  geom_vline(xintercept = c(5.5, 6.5))

- Some arguments that you may pass to customize your reference lines are linetype, color, and size (the line thickness; ggplot2 3.4.0 and later name this argument linewidth).
- A list of the available linetypes may be found in the ggplot2 aesthetic specifications vignette, i.e. vignette("ggplot2-specs").
# Dashed reference lines through the mean of each plotted variable.
# `linewidth` sets the line thickness; it supersedes the `size` argument,
# which ggplot2 3.4.0 deprecated for line geoms.
ggplot(data = iris, aes(x = Sepal.Length, y = Petal.Length)) +
  geom_point() +
  geom_hline(
    yintercept = mean(iris$Petal.Length),
    linetype = "dashed", color = "red", linewidth = 0.5
  ) +
  geom_vline(
    xintercept = mean(iris$Sepal.Length),
    linetype = "dashed", color = "blue", linewidth = 0.5
  )

Outliers
- A common statistical definition of an outlier is any value that lies below Q1-(1.5*IQR) or above Q3+(1.5*IQR), where IQR = Q3-Q1 is the interquartile range.
- ggplot2 boxplots will display these outliers as individual data points.
- You may be curious to know the identity/values of these data points.
# Add ID column: a running row number placed in front of the iris columns.
# seq_len(nrow(.)) is used instead of 1:dim(.)[1] because it stays correct
# (an empty sequence) even when the data frame has zero rows.
iris2 <- cbind(ID = seq_len(nrow(iris)), iris)
head(iris2)
## ID Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 1 5.1 3.5 1.4 0.2 setosa
## 2 2 4.9 3.0 1.4 0.2 setosa
## 3 3 4.7 3.2 1.3 0.2 setosa
## 4 4 4.6 3.1 1.5 0.2 setosa
## 5 5 5.0 3.6 1.4 0.2 setosa
## 6 6 5.4 3.9 1.7 0.4 setosa
# Boxplot of sepal width, one box per species
ggplot(iris2, aes(Species, Sepal.Width)) +
  geom_boxplot()

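- Notice that the boxplot shows 4 outlier points here (one of these points is actually two observations with the same value, as we will see below) and we wish to annotate each of them.
- Here, we will annotate them with their respective IDs.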
- We will use the geom_text_repel() function from the ggrepel package to annotate these outliers.
# Create function to calculate lower boundary
#
# Returns the lower outlier fence of a numeric vector: Q1 - 1.5 * IQR.
# The first quartile is requested from quantile() explicitly instead of being
# picked out of summary() by position, and the result is an unnamed scalar.
#
# x: numeric vector (must not contain NA, as IQR() would fail)
lo <- function(x) {
  quantile(x, probs = 0.25, names = FALSE) - 1.5 * IQR(x)
}
# Create function to calculate upper boundary
#
# Returns the upper outlier fence of a numeric vector: Q3 + 1.5 * IQR.
# The third quartile is requested from quantile() explicitly instead of being
# picked out of summary() by position, and the result is an unnamed scalar.
#
# y: numeric vector (must not contain NA, as IQR() would fail)
up <- function(y) {
  quantile(y, probs = 0.75, names = FALSE) + 1.5 * IQR(y)
}
# Lower outlier boundary of Sepal.Width, computed separately per species
LO <- aggregate(x = iris2$Sepal.Width, by = list(iris2$Species), FUN = lo)
LO
## Group.1 x
## 1 setosa 2.4875
## 2 versicolor 1.8125
## 3 virginica 2.2375
# Upper outlier boundary of Sepal.Width, computed separately per species
HI <- aggregate(x = iris2$Sepal.Width, by = list(iris2$Species), FUN = up)
HI
## Group.1 x
## 1 setosa 4.3875
## 2 versicolor 3.7125
## 3 virginica 3.7375
# Annotate ID's of outliers for every species in a single vectorised pass.
# Each row's boundaries are looked up in LO/HI by species *name* (Group.1)
# rather than by hard-coded row position, so the result does not depend on the
# order in which aggregate() listed the groups, and no per-species copy of the
# code is needed.
lower <- LO$x[match(iris2$Species, LO$Group.1)]
upper <- HI$x[match(iris2$Species, HI$Group.1)]
is_outlier <- iris2$Sepal.Width < lower | iris2$Sepal.Width > upper
# Rows stay in their original order, so no rbind()/re-sorting step is required
merged <- iris2
# Outliers are labelled with their ID, all other rows get an empty label
# (the column is therefore character)
merged$Labels <- ifelse(is_outlier, merged$ID, "")
head(merged, n = 20)
## ID Sepal.Length Sepal.Width Petal.Length Petal.Width Species Labels
## 1 1 5.1 3.5 1.4 0.2 setosa
## 2 2 4.9 3.0 1.4 0.2 setosa
## 3 3 4.7 3.2 1.3 0.2 setosa
## 4 4 4.6 3.1 1.5 0.2 setosa
## 5 5 5.0 3.6 1.4 0.2 setosa
## 6 6 5.4 3.9 1.7 0.4 setosa
## 7 7 4.6 3.4 1.4 0.3 setosa
## 8 8 5.0 3.4 1.5 0.2 setosa
## 9 9 4.4 2.9 1.4 0.2 setosa
## 10 10 4.9 3.1 1.5 0.1 setosa
## 11 11 5.4 3.7 1.5 0.2 setosa
## 12 12 4.8 3.4 1.6 0.2 setosa
## 13 13 4.8 3.0 1.4 0.1 setosa
## 14 14 4.3 3.0 1.1 0.1 setosa
## 15 15 5.8 4.0 1.2 0.2 setosa
## 16 16 5.7 4.4 1.5 0.4 setosa 16
## 17 17 5.4 3.9 1.3 0.4 setosa
## 18 18 5.1 3.5 1.4 0.3 setosa
## 19 19 5.7 3.8 1.7 0.3 setosa
## 20 20 5.1 3.8 1.5 0.3 setosa
# (Finally) draw the boxplot and let ggrepel place the outlier labels;
# rows whose label is an empty string get no visible text
library(ggrepel)
ggplot(merged, aes(Species, Sepal.Width)) +
  geom_boxplot() +
  geom_text_repel(mapping = aes(label = Labels))

- One nice thing about the geom_text_repel() function is that it prevents overlaps in the labelling.
- Notice here that the virginica group has 2 outliers with the exact same value, i.e. 3.8. geom_text_repel() nicely prevents the overlapping of their labels.
- Now, instead of IDs, you may want to annotate your outliers with their original values.
# Create a label column holding the Sepal.Width value of each outlier,
# for every species in a single vectorised pass.
# Each row's boundaries are looked up in LO/HI by species *name* (Group.1)
# rather than by hard-coded row position, so the result does not depend on the
# order in which aggregate() listed the groups, and no per-species copy of the
# code is needed.
lower <- LO$x[match(iris2$Species, LO$Group.1)]
upper <- HI$x[match(iris2$Species, HI$Group.1)]
is_outlier <- iris2$Sepal.Width < lower | iris2$Sepal.Width > upper
# Rows stay in their original order, so no rbind()/re-sorting step is required
merged <- iris2
# Outliers are labelled with their value, all other rows get an empty label
# (the column is therefore character)
merged$Labels <- ifelse(is_outlier, merged$Sepal.Width, "")
head(merged, n = 20)
## ID Sepal.Length Sepal.Width Petal.Length Petal.Width Species Labels
## 1 1 5.1 3.5 1.4 0.2 setosa
## 2 2 4.9 3.0 1.4 0.2 setosa
## 3 3 4.7 3.2 1.3 0.2 setosa
## 4 4 4.6 3.1 1.5 0.2 setosa
## 5 5 5.0 3.6 1.4 0.2 setosa
## 6 6 5.4 3.9 1.7 0.4 setosa
## 7 7 4.6 3.4 1.4 0.3 setosa
## 8 8 5.0 3.4 1.5 0.2 setosa
## 9 9 4.4 2.9 1.4 0.2 setosa
## 10 10 4.9 3.1 1.5 0.1 setosa
## 11 11 5.4 3.7 1.5 0.2 setosa
## 12 12 4.8 3.4 1.6 0.2 setosa
## 13 13 4.8 3.0 1.4 0.1 setosa
## 14 14 4.3 3.0 1.1 0.1 setosa
## 15 15 5.8 4.0 1.2 0.2 setosa
## 16 16 5.7 4.4 1.5 0.4 setosa 4.4
## 17 17 5.4 3.9 1.3 0.4 setosa
## 18 18 5.1 3.5 1.4 0.3 setosa
## 19 19 5.7 3.8 1.7 0.3 setosa
## 20 20 5.1 3.8 1.5 0.3 setosa
# (Finally) draw the boxplot with the outlier values as repelled text labels
ggplot(merged, aes(Species, Sepal.Width)) +
  geom_boxplot() +
  geom_text_repel(mapping = aes(label = Labels))

Specific points
- You may wish to annotate specific points that do not belong to any particular grouping, e.g. outliers or pre-defined groups.
- This oftentimes happens when you have a scatterplot and you wish to bring your audience’s attention to one or a few specific data points of interest.
- The principle is very similar to that discussed for annotating outliers in a boxplot, i.e. creating a “Labels” column.
# Retrieve ID's of interest
ID <- c(20, 40, 60, 80, 100)
# Create reference table of chosen data points.
# %in% compares the IDs as numbers; building a regular expression from them
# would depend on how each number happens to be formatted as text.
ref <- iris2[iris2$ID %in% ID, ]
# Add a label column to identify these data points
ref$Labels <- ref$ID
ref
## ID Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 20 20 5.1 3.8 1.5 0.3 setosa
## 40 40 5.1 3.4 1.5 0.2 setosa
## 60 60 5.2 2.7 3.9 1.4 versicolor
## 80 80 5.7 2.6 3.5 1.0 versicolor
## 100 100 5.7 2.8 4.1 1.3 versicolor
## Labels
## 20 20
## 40 40
## 60 60
## 80 80
## 100 100
# Attach the labels to the full data frame with a left join on ID
library(plyr)
iris2new <- join(
  x = iris2,
  y = ref[, c("ID", "Labels")],
  by = "ID",
  type = "left"
)
head(iris2new, 20)
## ID Sepal.Length Sepal.Width Petal.Length Petal.Width Species Labels
## 1 1 5.1 3.5 1.4 0.2 setosa NA
## 2 2 4.9 3.0 1.4 0.2 setosa NA
## 3 3 4.7 3.2 1.3 0.2 setosa NA
## 4 4 4.6 3.1 1.5 0.2 setosa NA
## 5 5 5.0 3.6 1.4 0.2 setosa NA
## 6 6 5.4 3.9 1.7 0.4 setosa NA
## 7 7 4.6 3.4 1.4 0.3 setosa NA
## 8 8 5.0 3.4 1.5 0.2 setosa NA
## 9 9 4.4 2.9 1.4 0.2 setosa NA
## 10 10 4.9 3.1 1.5 0.1 setosa NA
## 11 11 5.4 3.7 1.5 0.2 setosa NA
## 12 12 4.8 3.4 1.6 0.2 setosa NA
## 13 13 4.8 3.0 1.4 0.1 setosa NA
## 14 14 4.3 3.0 1.1 0.1 setosa NA
## 15 15 5.8 4.0 1.2 0.2 setosa NA
## 16 16 5.7 4.4 1.5 0.4 setosa NA
## 17 17 5.4 3.9 1.3 0.4 setosa NA
## 18 18 5.1 3.5 1.4 0.3 setosa NA
## 19 19 5.7 3.8 1.7 0.3 setosa NA
## 20 20 5.1 3.8 1.5 0.3 setosa 20
# Turn the NA labels into empty strings
iris2new$Labels <- replace(iris2new$Labels, is.na(iris2new$Labels), "")
# (Finally) annotate the selected points on the scatterplot
ggplot(iris2new, aes(Sepal.Length, Petal.Length)) +
  geom_point() +
  geom_text_repel(mapping = aes(label = Labels))

- Here, it is not immediately clear which data point each label refers to.
- The geom_label_repel() function would be able to distinguish these points with their corresponding labels.
# Same scatterplot, with the labels drawn as boxed labels
ggplot(iris2new, aes(Sepal.Length, Petal.Length)) +
  geom_point() +
  geom_label_repel(mapping = aes(label = Labels))

- Notice how nicely the labels are surrounded by a box, which accentuates the labels.
- Furthermore, the labels nicely avoid obscuring the other data points on the scatterplot.
- The function even uses straight lines as an extension to refer to hard-to-reach data points in “crowded” areas.