Overlaying Plots

library(ggplot2)

Histogram + density curve

  • We will be using the Wage dataset from the ISLR package for this section.
  • Combine geom_histogram() and geom_line() functions to overlay a density curve on a histogram.
# Load the Wage data from the ISLR package, then inspect its class,
# dimensions and structure
library(ISLR)
data(Wage)
class(Wage)
dim(Wage)
str(Wage)
## [1] "data.frame"
## [1] 3000   11
## 'data.frame':    3000 obs. of  11 variables:
##  $ year      : int  2006 2004 2003 2003 2005 2008 2009 2008 2006 2004 ...
##  $ age       : int  18 24 45 43 50 54 44 30 41 52 ...
##  $ maritl    : Factor w/ 5 levels "1. Never Married",..: 1 1 2 2 4 2 2 1 1 2 ...
##  $ race      : Factor w/ 4 levels "1. White","2. Black",..: 1 1 1 3 1 1 4 3 2 1 ...
##  $ education : Factor w/ 5 levels "1. < HS Grad",..: 1 4 3 4 2 4 3 3 3 2 ...
##  $ region    : Factor w/ 9 levels "1. New England",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ jobclass  : Factor w/ 2 levels "1. Industrial",..: 1 2 1 2 2 2 1 2 2 2 ...
##  $ health    : Factor w/ 2 levels "1. <=Good","2. >=Very Good": 1 2 1 2 1 2 2 1 2 2 ...
##  $ health_ins: Factor w/ 2 levels "1. Yes","2. No": 2 2 1 1 1 1 1 1 1 1 ...
##  $ logwage   : num  4.32 4.26 4.88 5.04 4.32 ...
##  $ wage      : num  75 70.5 131 154.7 75 ...
# Evaluate the normal density at every observed age, using the sample mean
# and standard deviation of age as the distribution parameters
age2 <- dnorm(Wage$age, mean = mean(Wage$age), sd = sd(Wage$age))

# Plot: histogram on the density scale with the normal curve drawn on top.
# after_stat(density) replaces the `..density..` dot-dot notation, which
# ggplot2 deprecated in version 3.4.0.
ggplot(data = Wage) +
  geom_histogram(mapping = aes(x = age, y = after_stat(density))) +
  geom_line(mapping = aes(x = age, y = age2))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Add some aesthetics: blue bars with black outlines and a red density curve.
# after_stat(density) replaces the deprecated `..density..` notation.
ggplot(data = Wage) +
  geom_histogram(
    mapping = aes(x = age, y = after_stat(density)),
    fill = "blue",
    color = "black"
  ) +
  geom_line(mapping = aes(x = age, y = age2), color = "red")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Boxplot + dotplot

  • We will be creating our own dataset for this section.
  • Combine geom_boxplot() and geom_jitter() functions to overlay the individual data points on a boxplot.
  • Pass the height and width arguments to the geom_jitter() function to control how much vertical and horizontal jitter is applied to the data points.
# Create dataset: 30 observations, each assigned at random to group A, B or C
# and given a value drawn from a normal distribution (mean 10, sd 2).
# Fix the RNG seed so the simulated data are the same on every run.
set.seed(123)
col1 <- sample(toupper(letters[1:3]), size = 30, replace = TRUE)
col2 <- rnorm(n = 30, mean = 10, sd = 2)
df2 <- data.frame(Group = col1, Value = col2)
head(df2)
##   Group     Value
## 1     C 10.115253
## 2     C 11.455424
## 3     C 10.837462
## 4     C 10.616167
## 5     A 13.835783
## 6     C  9.290844
# Plot: one boxplot per group with the individual observations jittered on top
ggplot(df2, aes(x = Group, y = Value)) +
  geom_boxplot() +
  geom_jitter()

# Make the data points more compact by shrinking the jitter width and height
# to 0.1
ggplot(df2, aes(x = Group, y = Value)) +
  geom_boxplot() +
  geom_jitter(width = 0.1, height = 0.1)

# Add some aesthetics: blue boxes with red jittered points.
# outlier.shape = NA suppresses the boxplot's own outlier markers: every
# observation is already drawn by geom_jitter(), and white markers would
# still show up against ggplot2's default grey panel background.
ggplot(data = df2) +
  geom_boxplot(
    mapping = aes(x = Group, y = Value),
    fill = "blue",
    outlier.shape = NA
  ) +
  geom_jitter(
    mapping = aes(x = Group, y = Value),
    height = 0.1,
    width = 0.1,
    color = "red"
  )

Scatterplot + smooth curve

  • We will be using the iris dataset from base R for this section.
  • Combine geom_point() and geom_smooth() functions to overlay a smooth curve on a scatterplot.
# Load the built-in iris data, then inspect its class, dimensions and
# structure
data(iris)
class(iris)
dim(iris)
str(iris)
## [1] "data.frame"
## [1] 150   5
## 'data.frame':    150 obs. of  5 variables:
##  $ Sepal.Length: num  5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
##  $ Sepal.Width : num  3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
##  $ Petal.Length: num  1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
##  $ Petal.Width : num  0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
##  $ Species     : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
# Plot: scatterplot of petal length against sepal length with a smoothed
# trend line drawn on top
ggplot(iris, aes(x = Sepal.Length, y = Petal.Length)) +
  geom_point() +
  geom_smooth()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'