# Set your working directory
### REPLACE WITH YOUR FILE PATH
# The relative file names used later in this script (e.g. 'WorldBank_GDP.csv')
# are resolved against this directory, so point it at the folder with the data.
setwd('/home/nicole/Documents/Teaching/Class_Documents/Lecture2')
# WINDOWS: setwd("c:/Documents/my/working/directory")
# MAC: setwd("/Users/yourname/working/directory")

Basic Plotting with ggplot2

ggplot2 is a powerful and easy-to-use plotting package from the tidyverse (a set of R packages designed to work together and be easy to use).

Many ggplot2 resources are available for free online:

Graph objects in ggplot2

  • DATA to be visualized
  • GEOMetric objects that appear on the plot
  • AESthetic mappings from data to visual component
  • STATistics transform data on the way to visualization
  • COORDinates organize location of geometric objects
  • SCALEs define the range of values for aesthetics
  • FACETs group into subplots

ggplot2 uses Tidy Data. What is it?

  • Each variable forms a column
  • Each observation forms a row
  • Each observational unit forms a table

For the examples here we will use a simple dataset from the World Bank containing GDP per capita and Net Foreign Direct Investment.

# Load the World Bank dataset from the working directory, then preview
# the first 25 rows to check the available columns
df <- read.csv(file = "WorldBank_GDP.csv")
head(x = df, n = 25)
  • On the x axis -> Year
  • On the y axis -> GDP_pc or net_FDI
  • Shape, color, or group -> Country.Name

Syntax

The minimum required for a plot to draw is:

  • Data - the data frame you are referring to
  • Aesthetics - specifies the x and y of the plot, and optionally the ‘color’, ‘group’, or ‘fill’
  • Geom - the type of plot you wish to specify

Common types of geoms:

  • geom_line() - line plot
  • geom_density() - density plot
  • geom_histogram() - histogram
  • geom_point()- scatter plot
  • geom_boxplot() - boxplot
  • geom_bar() - bar plot
  • geom_smooth() - add in a regression line for point data

You can also add in geoms that annotate the plot:

  • geom_hline() - horizontal line
  • geom_vline() - vertical line
  • geom_text() - add text to any location on graph

See all the options in the ggplot2 reference files

# Require ggplot2
library(ggplot2)

# GDP per capita over time as a line chart: one colored line per country
ggplot(
  data = df,
  mapping = aes(
    x = Year,
    y = GDP_pc,
    group = Country.Name,
    color = Country.Name
  )
) +
  geom_line()

# The same data and mappings drawn as a scatter plot instead
ggplot(
  data = df,
  mapping = aes(
    x = Year,
    y = GDP_pc,
    group = Country.Name,
    color = Country.Name
  )
) +
  geom_point()

One Variable, Multiple Groups

Each geom has its own requirements - see the Help tab in the side console to look up what the geom needs. You can also assign a plot to a variable for re-use using <- and layer multiple plots over one another.

For stat, additional transformations like means, sums, etc.:

  • All respect stat = ‘identity’
  • Ex: geom_histogram uses stat_bin() to group observations
  • You can use stat_summary() for generating different statistics

For position, you adjust location of objects:

  • ‘dodge’, ‘stack’, ‘jitter’
# A jittered scatterplot: position = "jitter" adds a little random noise
# to each point so that overlapping observations stay visible
p <- ggplot(df[df$Country.Name %in% c("Mexico", "Argentina", "Brazil"), ],
            aes(x = Year,
                y = GDP_pc,
                group = Country.Name,
                color = Country.Name)) +
  geom_point(position = "jitter")

# A grouped barplot: stat = "identity" plots GDP_pc as given (no counting),
# position = "dodge" places each country's bar side by side within a year
p1 <- ggplot(df[df$Country.Name %in% c("Mexico", "Argentina", "Brazil"), ],
             aes(x = Year,
                 y = GDP_pc,
                 fill = Country.Name)) +
  geom_bar(stat = "identity", position = "dodge")

# A stacked barplot over time (stacking is geom_bar()'s default position)
p2 <- ggplot(df[df$Country.Name %in% c("Mexico", "Argentina", "Brazil"), ],
             aes(x = Year,
                 y = GDP_pc,
                 fill = Country.Name)) +
  geom_bar(stat = "identity")

# A percentage barplot over time: position = "fill" rescales each year's
# stack to sum to 1, so the bars show shares rather than levels
p3 <- ggplot(df[df$Country.Name %in% c("Mexico", "Argentina", "Brazil"), ],
             aes(x = Year,
                 y = GDP_pc,
                 fill = Country.Name)) +
  geom_bar(stat = "identity",
           position = "fill")

# Mean GDP per capita across the three countries for each year, with
# error bars from mean_se (mean plus/minus one standard error).
# `fun` replaces the `fun.y` argument, which ggplot2 3.3.0 deprecated.
p4 <- ggplot(df[df$Country.Name %in% c("Mexico", "Argentina", "Brazil"), ],
             aes(x = Year)) +
  stat_summary(fun = mean,
               geom = "line",
               aes(y = GDP_pc)) +
  stat_summary(fun.data = mean_se,
               geom = "errorbar",
               aes(y = GDP_pc),
               size = .5,
               alpha = .7,
               color = "Black")

# Printing out the plots
p

p1

p2

p3

p4

Use facet_wrap() or facet_grid() to create multiple graphs based on group, rather than having each group on the same plot.

# One panel per country: points joined by lines, split with facet_wrap()
ggplot(
  data = df[df$Country.Name %in% c("Mexico", "Argentina", "Brazil"), ],
  mapping = aes(
    x = Year,
    y = GDP_pc,
    group = Country.Name,
    color = Country.Name
  )
) +
  geom_point() +
  geom_line() +
  facet_wrap(facets = ~Country.Name)

Two Variables

To create two way graphs, you can either use data in long format (as above) or create individual geoms for each variable.

You can also create simple regression lines using geom_smooth(). With fewer than 1,000 observations, geom_smooth() defaults to method ‘loess’, which fits a smooth curve using local regression.

# Cross-section for 2012: net FDI against GDP per capita,
# with each country drawn in its own color
ggplot(
  data = df[df$Year %in% 2012, ],
  mapping = aes(x = GDP_pc, y = net_FDI, color = Country.Name)
) +
  geom_point(stat = "identity", position = "identity", size = 3)

# The same 2012 cross-section without the color mapping,
# overlaid with a fitted trend from geom_smooth()
ggplot(
  data = df[df$Year %in% 2012, ],
  mapping = aes(x = GDP_pc, y = net_FDI)
) +
  geom_point(stat = "identity", position = "identity", size = 3) +
  geom_smooth()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

Style

Labels and Color

To add in customized labels:

  • ggtitle(“New main title”): Adds a main title above the plot
  • xlab(“New X axis label”): Changes the X axis label
  • ylab(“New Y axis label”): Changes the Y axis label
  • labs(title = “New main title”, x = “New X axis label”, y = “New Y axis label”): Changes main title and axis labels

You can also change the color of any geom or stat by adding in the argument ‘color=YourColor’

You can also change the color of your legend labels depending on your AES settings using

  • scale_color_manual(values = c(“color1”, “color2”,….))
  • scale_fill_manual(values = c(“color1”, “color2”,….))
  • scale_discrete_manual(aesthetics = “colour”, values = c(“color1”, “color2”,….))
# Add labels to and change color lines of previous plot
# (mean GDP per capita per year with standard-error bars).
# `fun` replaces the `fun.y` argument, which ggplot2 3.3.0 deprecated.
p <- ggplot(df[df$Country.Name %in% c("Mexico", "Argentina", "Brazil"), ],
            aes(x = Year)) +
  stat_summary(fun = mean,
               geom = "line",
               aes(y = GDP_pc),
               color = "Navy") +
  stat_summary(fun.data = mean_se,
               geom = "errorbar",
               aes(y = GDP_pc),
               size = .5,
               alpha = .7,
               color = "grey25") +
  # labs() sets the main title and overrides the default y axis label
  labs(title = "Mean GDP Per Capita: Mexico, Argentina, Brazil",
       y = "Mean GDP Per Capita(US$)")

# Print the labelled plot
p

Themes

theme() can change text size, edit legends, set colors, etc.

Styles:

  • theme_bw()
  • theme_dark()
  • theme_gray()
  • theme_light()
  • theme_minimal()
  • theme_void()
  • theme_classic()
library(ggthemes)

# Some Examples
# Each line adds a complete theme to the saved plot `p` and prints the result;
# `p` itself is not reassigned.
p + theme_classic()

p + theme_minimal()

# You can even make it look like STATA output
# (theme_stata() and scale_color_stata() come from the ggthemes package)
p + theme_stata() + scale_color_stata()

Outputting plots

Use the ggsave() function to output your plots

# Write plot `p` to my_plot.png in the working directory as a PNG
# (width 10, height 8, rendered at 100 dpi)
ggsave("my_plot.png", plot = p, device = "png",
       width = 10, height = 8, dpi = 100)