Advanced ggplotting requires many extra libraries. They are loaded as needed.

Start with loading some basic libraries

library(dplyr)
library(ggplot2)

Scatterplot

Load data from ggplot2 package

# Pull the `midwest` table (one row per county, see the structure printed
# below) out of the ggplot2 package and show a compact summary of its columns.
data(list = "midwest", package = "ggplot2")
str(midwest)

## Classes 'tbl_df', 'tbl' and 'data.frame':    437 obs. of  28 variables:
##  $ PID                 : int  561 562 563 564 565 566 567 568 569 570 ...
##  $ county              : chr  "ADAMS" "ALEXANDER" "BOND" "BOONE" ...
##  $ state               : chr  "IL" "IL" "IL" "IL" ...
##  $ area                : num  0.052 0.014 0.022 0.017 0.018 0.05 0.017 0.027 0.024 0.058 ...
##  $ poptotal            : int  66090 10626 14991 30806 5836 35688 5322 16805 13437 173025 ...
##  $ popdensity          : num  1271 759 681 1812 324 ...
##  $ popwhite            : int  63917 7054 14477 29344 5264 35157 5298 16519 13384 146506 ...
##  $ popblack            : int  1702 3496 429 127 547 50 1 111 16 16559 ...
##  $ popamerindian       : int  98 19 35 46 14 65 8 30 8 331 ...
##  $ popasian            : int  249 48 16 150 5 195 15 61 23 8033 ...
##  $ popother            : int  124 9 34 1139 6 221 0 84 6 1596 ...
##  $ percwhite           : num  96.7 66.4 96.6 95.3 90.2 ...
##  $ percblack           : num  2.575 32.9 2.862 0.412 9.373 ...
##  $ percamerindan       : num  0.148 0.179 0.233 0.149 0.24 ...
##  $ percasian           : num  0.3768 0.4517 0.1067 0.4869 0.0857 ...
##  $ percother           : num  0.1876 0.0847 0.2268 3.6973 0.1028 ...
##  $ popadults           : int  43298 6724 9669 19272 3979 23444 3583 11323 8825 95971 ...
##  $ perchsd             : num  75.1 59.7 69.3 75.5 68.9 ...
##  $ percollege          : num  19.6 11.2 17 17.3 14.5 ...
##  $ percprof            : num  4.36 2.87 4.49 4.2 3.37 ...
##  $ poppovertyknown     : int  63628 10529 14235 30337 4815 35107 5241 16455 13081 154934 ...
##  $ percpovertyknown    : num  96.3 99.1 95 98.5 82.5 ...
##  $ percbelowpoverty    : num  13.15 32.24 12.07 7.21 13.52 ...
##  $ percchildbelowpovert: num  18 45.8 14 11.2 13 ...
##  $ percadultpoverty    : num  11.01 27.39 10.85 5.54 11.14 ...
##  $ percelderlypoverty  : num  12.44 25.23 12.7 6.22 19.2 ...
##  $ inmetro             : int  0 0 0 1 0 0 0 0 0 1 ...
##  $ category            : chr  "AAR" "LHR" "AAR" "ALU" ...

A basic scatterplot with different colors for each state and size of points proportional to population density:

# Basic scatterplot of county area against total population.
# Points are coloured by state and sized by population density; a loess
# smoother (with its standard-error band, se = TRUE) is drawn on top.
ggplot(midwest, aes(x = area, y = poptotal)) +
  geom_point(aes(col = state, size = popdensity)) +
  geom_smooth(method = "loess", se = TRUE)

Add pre-defined ranges for x and y axes, add proper titles etc

# Same scatterplot, restricted to fixed axis ranges and fully labelled.
# NOTE: xlim()/ylim() set scale limits, so observations outside the ranges
# are dropped (ggplot2 warns about the removed rows) before the loess curve
# is fitted. Use coord_cartesian() instead if only a visual zoom is wanted.
ggplot(midwest, aes(x = area, y = poptotal)) +
  geom_point(aes(col = state, size = popdensity)) +
  geom_smooth(method = "loess", se = TRUE) +
  xlim(c(0, 0.1)) +
  ylim(c(0, 500000)) +
  labs(
    title = "Scatterplot",
    subtitle = "Area Vs Population",
    x = "Area",
    y = "Population",
    caption = "Source: midwest"
  )

Scatterplot with geom_encircle

Encircle a special region of the scatterplot using geom_encircle() from the ggalt package.

library(ggplot2)
library(ggalt)  # for geom_encircle()
library(dplyr)  # for filter() and %>%; library() errors if the package is missing

# The points that are going to be encircled: counties with a total population
# in (350000, 500000] and an area strictly between 0.01 and 0.1.
midwest_select <- midwest %>%
  filter(
    poptotal > 350000 &
      poptotal <= 500000 &
      area > 0.01 &
      area < 0.1
  )

# Scatterplot with the selected counties outlined in green.
ggplot(midwest, aes(x = area, y = poptotal)) +
  geom_point(aes(col = state, size = popdensity)) +  # draw points
  geom_smooth(method = "loess", se = TRUE) +         # draw smoothing line
  xlim(c(0, 0.1)) +
  ylim(c(0, 500000)) +
  geom_encircle(                                     # encircle the selection
    aes(x = area, y = poptotal),
    data = midwest_select,
    color = "green",
    size = 2,       # line thickness
    expand = 0.06
  ) +
  labs(
    title = "Scatterplot + Encircle",
    subtitle = "Area Vs Population",
    x = "Area",
    y = "Population",
    caption = "Source: midwest"
  )

Scatterplot with geom_count

A Scatterplot where size of points show the extent of overlapping of data. geom_count() is a variant of geom_point():

# Counts plot of city vs highway mileage: geom_count() draws one point per
# distinct (cty, hwy) pair and sizes it by the number of overlapping rows.
data(mpg, package = "ggplot2")
ggplot(data = mpg, mapping = aes(x = cty, y = hwy)) +
  geom_count(col = "tomato3", show.legend = FALSE) +
  labs(
    title = "Counts Plot",
    subtitle = "mpg: city vs highway mileage",
    x = "cty",
    y = "hwy"
  )

Let's plot the same data using geom_point(). Notice that the overlapping of points is not captured at all (it can be revealed using geom_jitter()).

# The same city vs highway mileage data drawn with plain geom_point(), for
# comparison with the counts plot: every row is drawn at the same size.
data(mpg, package = "ggplot2")
ggplot(data = mpg, mapping = aes(x = cty, y = hwy)) +
  geom_point(col = "tomato2", show.legend = FALSE) +
  labs(
    title = "geom_point Plot",
    subtitle = "mpg: city vs highway mileage",
    x = "cty",
    y = "hwy"
  )

Marginal histogram/boxplot

Used to show relationship + distribution in the same plot

library(ggExtra)  # for ggMarginal()
data(mpg, package = "ggplot2")

# Rows with hwy >= 35 and cty > 27.
# NOTE(review): not referenced by the plots in this chunk; kept in case later
# code relies on it.
mpg_select <- mpg[mpg$hwy >= 35 & mpg$cty > 27, ]

# Base plot: counts of (cty, hwy) pairs plus a linear fit without a band.
g <- ggplot(mpg, aes(cty, hwy)) +
  geom_count(show.legend = FALSE) +
  geom_smooth(method = "lm", se = FALSE)

# Add marginal distributions of both axes, first as histograms, then as
# density curves.
ggMarginal(g, type = "histogram", fill = "transparent")
ggMarginal(g, type = "density", fill = "transparent")

Correlogram

Examine the correlation of multiple continuous variables present in the same data frame. Let's plot a basic one.

library(ggplot2)
library(ggcorrplot)

# Load the data
data(mtcars)

# Basic correlogram: lower triangle of the mtcars correlation matrix, drawn
# as circles with the correlation value printed on each cell.
ggcorrplot(
  cor(mtcars),
  method = "circle",
  type = "lower",
  lab = TRUE,
  lab_size = 3
)

A decorated plot

library(ggplot2)
library(ggcorrplot)

# Load the data
data(mtcars)

# Pairwise correlations of all mtcars columns, rounded to one decimal place.
corr <- round(cor(mtcars), digits = 1)

# Decorated correlogram: lower triangle only, circles with printed values,
# a custom three-colour gradient, a title and the black-and-white theme.
ggcorrplot(
  corr,
  method = "circle",
  type = "lower",
  hc.order = TRUE,
  lab = TRUE,
  lab_size = 3,
  colors = c("tomato2", "white", "springgreen3"),
  title = "Correlogram of mtcars",
  ggtheme = theme_bw
)

Diverging bars

Diverging Bars is a bar chart that can handle both negative and positive values. A z-score (aka a standard score) indicates how many standard deviations an element is from the mean. A z-score can be calculated from the following formula:

z = (X - u) / d

where z is the z-score, X is the value of the element, u is the population mean, and d is the standard deviation

If the elements are approximately normally distributed and the set is large, about 68% of the elements have a z-score between -1 and 1; about 95% have a z-score between -2 and 2; and about 99.7% have a z-score between -3 and 3.

data("mtcars")  # load a fresh copy of the data set

# Keep the car names (stored as row names) in a regular column.
mtcars$`car name` <- rownames(mtcars)

# Normalize/standardize mileage: z-score of mpg, rounded to 2 decimals.
mtcars$mpg_z <- round((mtcars$mpg - mean(mtcars$mpg)) / sd(mtcars$mpg), 2)

# Flag each car as "above" or "below" average mileage (character column).
# The flag is derived from the raw mpg rather than the rounded z-score, so a
# car marginally below the mean is not labelled "above" just because its
# z-score rounds to 0.
mtcars$mpg_type <- ifelse(mtcars$mpg < mean(mtcars$mpg), "below", "above")

# Sort rows by z-score, lowest first.
mtcars <- mtcars[order(mtcars$mpg_z), ]

# Convert the names to a factor whose levels follow the sorted row order, so
# that plots keep this ordering instead of sorting names alphabetically.
mtcars$`car name` <- factor(mtcars$`car name`, levels = mtcars$`car name`)

# Diverging bar chart of the standardised mileage.
# Expects mtcars to already carry the `car name`, mpg_z and mpg_type columns
# created in the preparation step above.
ggplot(mtcars, aes(x = `car name`, y = mpg_z, label = mpg_z)) +
  # geom_col() is geom_bar() with stat = "identity"; width = .5 draws bars
  # narrower than the default.
  geom_col(aes(fill = mpg_type), width = .5) +
  # display the mpg_z value for each bar in black
  geom_text(color = "black", size = 3) +
  # Labels are matched to breaks by position, so the breaks are spelled out
  # instead of relying on the alphabetical order of the mpg_type values.
  scale_fill_manual(
    name = "Mileage",
    breaks = c("above", "below"),
    labels = c("Above Average", "Below Average"),
    values = c("above" = "green", "below" = "tomato")
  ) +
  labs(
    title = "Diverging Bars",
    subtitle = "Normalised mileage from 'mtcars'"
  ) +
  # flip the coordinates so the car names are listed horizontally and are
  # easy to read
  coord_flip()

Ordered Bar Chart

Ordered Bar Chart is a Bar Chart that is ordered by the Y axis variable

# Mean city mileage per manufacturer, with readable column names.
cty_mpg <- setNames(
  aggregate(mpg$cty, by = list(mpg$manufacturer), FUN = mean),
  c("make", "mileage")
)

# Sort ascending by mileage, then freeze that order in the factor levels;
# otherwise the x axis would list the makes alphabetically.
cty_mpg <- cty_mpg[order(cty_mpg$mileage), ]
cty_mpg$make <- factor(cty_mpg$make, levels = cty_mpg$make)

# Draw the ordered bars. geom_col() is equivalent to
# geom_bar(stat = "identity", width = .5, fill = "tomato3").
ggplot(cty_mpg, aes(x = make, y = mileage)) +
  geom_col(width = .5, fill = "tomato3") +
  labs(
    caption = "source: mpg",
    y = "Mileage",
    x = "Make",
    subtitle = "Make Vs Avg. Mileage",
    title = "Ordered Bar Chart"
  ) +
  theme(axis.text.x = element_text(angle = 65, vjust = 0.7))

Or this can be achieved using dplyr and the forcats::fct_reorder() function, as shown below. The value of each bar is also shown as text.

require(forcats)    #for factor re-ordering

## Loading required package: forcats

# Ordered bar chart built in a single pipeline: average city mileage per
# manufacturer, bars ordered by that average, value printed above each bar.
mpg %>%
  group_by(manufacturer) %>%
  summarise(Mileage = mean(cty)) %>%
  # forcats::fct_reorder() converts manufacturer to a factor and orders its
  # levels by Mileage in one go. The two-step alternative is
  #   mutate(Make2 = factor(manufacturer, levels = manufacturer)) %>%
  #   mutate(Make = fct_reorder(Make2, Mileage))
  mutate(Make = fct_reorder(manufacturer, Mileage)) %>%
  ggplot(mapping = aes(x = Make, y = Mileage)) +
  geom_col(width = 0.5, fill = "tomato2") +
  labs(
    title = "Ordered bar chart",
    subtitle = "Make vs Avg. Mileage",
    x = "Make",
    y = "Mileage"
  ) +
  # Print the value for each bar as well. sprintf() does the rounding to one
  # decimal itself; rounding to two decimals first would round twice and can
  # shift the printed digit.
  geom_text(
    aes(label = sprintf("%0.1f", Mileage)),
    color = "black",
    size = 4,
    vjust = -0.5
  ) +
  theme(axis.text.x = element_text(angle = 65, vjust = 0.7))

Diverging Lollipop Chart

Lollipop chart shows the same information as bar chart and diverging bar.

# Diverging lollipop chart of the standardised mileage: a large black dot at
# each car's mpg_z, joined to zero by a black stem, with the value printed
# in white on the dot.
# Reads the `car name` and mpg_z columns of mtcars, so those must already
# exist when this runs.
ggplot(mtcars, aes(x = `car name`, y = mpg_z, label = mpg_z)) +
  geom_point(stat = "identity", fill = "black", size = 6) +
  geom_segment(
    aes(
      x = `car name`,
      xend = `car name`,
      y = 0,
      yend = mpg_z
    ),
    color = "black"
  ) +
  geom_text(color = "white", size = 2) +
  labs(
    subtitle = "Normalized mileage from 'mtcars': Lollipop",
    title = "Diverging Lollipop Chart"
  ) +
  ylim(-2.5, 2.5) +
  coord_flip()

ggplot2 advanced - Part 1

Ahmed

Dec 13, 2017