This file provides supporting code for an introduction to graphing with ggplot2.

Types of graphs

Bar Graphs and Histograms for a Single Variable

# Bar chart of cgv_dem: the height of each bar is the number of rows that
# fall in that category (geom_bar counts rows by default).
ggplot(inequality, aes(x = cgv_dem)) +
  geom_bar()
## Warning: Removed 2 rows containing non-finite values (stat_count).

# Bar chart of cgv_dem showing each category's share of all rows rather than
# its count.
# `..prop..` is a groupwise proportion. group = 1 puts every row in a single
# group so the shares are taken over the whole data set; without it, a
# discrete x (e.g. cgv_dem stored as a factor) places each bar in its own
# group and every bar is drawn at height 1.
ggplot(data = inequality) +
  geom_bar(mapping = aes(x = cgv_dem, y = ..prop.., group = 1))
## Warning: Removed 2 rows containing non-finite values (stat_count).

# Histogram of wb_gdppc. `bins` sets how many intervals (bars) the range of
# the variable is divided into.
ggplot(inequality, aes(x = wb_gdppc)) +
  geom_histogram(bins = 12)
## Warning: Removed 8 rows containing non-finite values (stat_bin).

Bar Graphs Comparing Two Variables

# Bar chart with an explicit y variable: stat = "identity" plots the colbrit
# values as given instead of counting rows, and the values sharing a cgv_dem
# category are stacked into one bar.
ggplot(inequality, aes(x = cgv_dem, y = colbrit)) +
  geom_bar(stat = "identity")
## Warning: Removed 2 rows containing missing values (position_stack).

# Shows the number of dictatorships/democracies that were British colonies.

Scatterplots

# Scatterplot of wb_gini (y) against wb_gdppc (x), one point per row.
ggplot(inequality, aes(x = wb_gdppc, y = wb_gini)) +
  geom_point()
## Warning: Removed 152 rows containing missing values (geom_point).

Line Graph

# Line graph: observations are connected in order of their x value (wb_gdppc).
ggplot(inequality, aes(x = wb_gdppc, y = wb_gini)) +
  geom_line()
## Warning: Removed 23 rows containing missing values (geom_path).

Best Fit Line Graph

# Smoothed best-fit curve of wb_gini on wb_gdppc. No method is given, so
# geom_smooth() picks its default smoother and reports which one it used.
ggplot(inequality, aes(x = wb_gdppc, y = wb_gini)) +
  geom_smooth()
## `geom_smooth()` using method = 'loess'
## Warning: Removed 152 rows containing non-finite values (stat_smooth).

## Formatting

Color

Representing data with color

In bar graphs

# Bar chart of colbrit by cgv_dem in which the fill colour of each bar is
# also driven by cgv_dem.
ggplot(inequality, aes(x = cgv_dem, y = colbrit, fill = cgv_dem)) +
  geom_bar(stat = "identity")
## Warning: Removed 2 rows containing missing values (position_stack).

# Same bars, but each stacked segment is filled according to that row's
# wb_gdppc value.
ggplot(data = inequality) +
  geom_bar(
    mapping = aes(x = cgv_dem, y = colbrit, fill = wb_gdppc),
    stat = "identity"
  )
## Warning: Removed 2 rows containing missing values (position_stack).

In scatterplots

# Scatterplot of wb_gini against wb_gdppc with point colour mapped to cgv_dem.
ggplot(inequality, aes(x = wb_gdppc, y = wb_gini, color = cgv_dem)) +
  geom_point()
## Warning: Removed 152 rows containing missing values (geom_point).

Choosing colors for your color variable

By Palette

For a full list of Color Brewer palettes, go to: http://www.sthda.com/sthda/RDoc/figure/text-mining/word-cloud-generator-rcolorbrewer-palettes.png

# Store cgv_dem as a factor so that it is treated as a discrete variable;
# scale_color_brewer() only works with discrete colour variables.
inequality[["cgv_dem"]] <- as.factor(inequality[["cgv_dem"]])

# Scatterplot coloured by cgv_dem using the "Spectral" Color Brewer palette.
ggplot(inequality, aes(x = wb_gdppc, y = wb_gini, color = cgv_dem)) +
  geom_point() +
  scale_color_brewer(palette = "Spectral")
## Warning: Removed 152 rows containing missing values (geom_point).

Manually

Note that you set the colors for the values of the variable listed under “color”

# Scatterplot coloured by cgv_dem with hand-picked colours.
# The names in `values` are matched against the levels of the colour variable
# ("0" and "1"). Missing values are not a level, so a "NA" entry in `values`
# is never used; the colour for missing values is set with `na.value`.
ggplot(data = inequality) +
  geom_point(mapping = aes(x = wb_gdppc, y = wb_gini, color = cgv_dem)) +
  scale_color_manual(
    values = c("0" = "black", "1" = "red"),
    na.value = "yellow"
  )
## Warning: Removed 152 rows containing missing values (geom_point).

Changing the colors of lines, dots, etc.

Note that color is OUTSIDE the aes function, unlike above.

# Smoothed curve with fixed appearance: color, fill and size sit outside
# aes(), so they are constants applied to the layer (red line, blue
# confidence band, line size 2) rather than mappings to variables.
ggplot(inequality, aes(x = wb_gdppc, y = wb_gini)) +
  geom_smooth(
    color = "red",
    fill = "blue",
    size = 2
  )
## `geom_smooth()` using method = 'loess'
## Warning: Removed 152 rows containing non-finite values (stat_smooth).

Labels

Titles, Captions, and Axis Labels

# Smoothed curve of wb_gdppc on wb_gini with a title, subtitle, caption and
# axis labels supplied through labs().
ggplot(inequality, aes(x = wb_gini, y = wb_gdppc)) +
  geom_smooth() +
  labs(
    x = "Gini Coefficient",
    y = "GDP per capita (US$)",
    title = "The effect of Inequality on GDP",
    subtitle = "in 2000",
    caption = "Source: World Bank"
  )
## `geom_smooth()` using method = 'loess'
## Warning: Removed 152 rows containing non-finite values (stat_smooth).

Axis Scales

Overall range

# Scatterplot with the visible y range restricted to 0-10000.
# coord_cartesian() zooms the view; it does not drop data outside the range.
ggplot(inequality, aes(x = wb_gini, y = wb_gdppc)) +
  geom_point() +
  coord_cartesian(ylim = c(0, 10000))
## Warning: Removed 152 rows containing missing values (geom_point).

Breaks within the range

# Same zoomed scatterplot, with y-axis tick marks placed every 2000 units
# from 0 to 10000.
ggplot(inequality, aes(x = wb_gini, y = wb_gdppc)) +
  geom_point() +
  scale_y_continuous(breaks = seq(0, 10000, by = 2000)) +
  coord_cartesian(ylim = c(0, 10000))
## Warning: Removed 152 rows containing missing values (geom_point).

Points

To label all points in the scatterplot:

# Draw each observation as its country name instead of a point, using the
# same y-axis zoom (0-10000) and tick spacing (2000) as above.
ggplot(inequality, aes(x = wb_gini, y = wb_gdppc, label = country)) +
  geom_text() +
  scale_y_continuous(breaks = seq(0, 10000, by = 2000)) +
  coord_cartesian(ylim = c(0, 10000))
## Warning: Removed 152 rows containing missing values (geom_text).

library(ggrepel)
## Warning: package 'ggrepel' was built under R version 3.3.3
# Points plus country labels drawn with ggrepel, which moves the labels away
# from each other and from the points so that they do not overlap.
ggplot(inequality, aes(x = wb_gini, y = wb_gdppc)) +
  geom_point() +
  geom_label_repel(mapping = aes(label = country)) +
  scale_y_continuous(breaks = seq(0, 10000, by = 2000)) +
  coord_cartesian(ylim = c(0, 10000))
## Warning: Removed 152 rows containing missing values (geom_point).
## Warning: Removed 152 rows containing missing values (geom_label_repel).

To label outliers in the plot, you should subset the data:

# Scatterplot of all observations in which only the rows with wb_gdppc above
# 15000 receive a repelled country label: the label layer is given its own
# filtered copy of the data.
ggplot(data = inequality, mapping = aes(x = wb_gini, y = wb_gdppc)) +
  geom_point() +
  geom_text_repel(
    mapping = aes(label = country),
    data = subset(inequality, wb_gdppc > 15000)
  )
## Warning: Removed 152 rows containing missing values (geom_point).
## Warning: Removed 37 rows containing missing values (geom_text_repel).

Legends

Legend name:

# Scatterplot coloured by cgv_dem. The legend title is set by naming the
# aesthetic (colour) inside labs().
ggplot(inequality, aes(x = wb_gini, y = wb_gdppc, colour = cgv_dem)) +
  geom_point() +
  scale_y_continuous(breaks = seq(0, 10000, by = 2000)) +
  coord_cartesian(ylim = c(0, 10000)) +
  labs(
    x = "Gini Coefficient",
    y = "GDP per capita (US$)",
    colour = "Democracy",
    title = "The effect of inequality on GDP",
    subtitle = "in 2000",
    caption = "Source: World Bank, CGV"
  )
## Warning: Removed 152 rows containing missing values (geom_point).

Legend position:

  # Same plot with the legend moved below the panel through theme().
  ggplot(inequality, aes(x = wb_gini, y = wb_gdppc, colour = cgv_dem)) +
    geom_point() +
    scale_y_continuous(breaks = seq(0, 10000, by = 2000)) +
    coord_cartesian(ylim = c(0, 10000)) +
    labs(
      x = "Gini Coefficient",
      y = "GDP per capita (US$)",
      colour = "Democracy",
      title = "The effect of inequality on GDP",
      subtitle = "in 2000",
      caption = "Source: World Bank, CGV"
    ) +
    theme(legend.position = "bottom")
## Warning: Removed 152 rows containing missing values (geom_point).

To change the value labels shown in the legend, change the levels of the underlying variable:

# Relabel cgv_dem for display: 0 -> "No", 1 -> "Yes", missing -> "N/A".
# as.factor() leaves missing values out of the levels, so assigning three
# labels to it would only create an unused "N/A" level and the missing rows
# would stay NA. addNA() turns NA into a real (last) level first, so the third
# label is actually applied to the missing values.
inequality$cgv_dem <- addNA(as.factor(inequality$cgv_dem), ifany = TRUE)
levels(inequality$cgv_dem) <- c("No", "Yes", "N/A")
    
# Scatterplot coloured by cgv_dem; the legend entries are taken from the
# factor levels of cgv_dem, and the legend is placed below the panel.
ggplot(inequality, aes(x = wb_gini, y = wb_gdppc, colour = cgv_dem)) +
  geom_point() +
  scale_y_continuous(breaks = seq(0, 10000, by = 2000)) +
  coord_cartesian(ylim = c(0, 10000)) +
  labs(
    x = "Gini Coefficient",
    y = "GDP per capita (US$)",
    colour = "Democracy",
    title = "The effect of inequality on GDP",
    subtitle = "in 2000",
    caption = "Source: World Bank, CGV"
  ) +
  theme(legend.position = "bottom")
## Warning: Removed 152 rows containing missing values (geom_point).

Layering Graphs

To include both points and line, add them together:

# Two layers on one plot: the raw points and a smoothed curve. The mapping is
# declared once in ggplot() and inherited by both layers.
ggplot(inequality, aes(x = wb_gdppc, y = wb_gini)) +
  geom_point() +
  geom_smooth()
## `geom_smooth()` using method = 'loess'
## Warning: Removed 152 rows containing non-finite values (stat_smooth).
## Warning: Removed 152 rows containing missing values (geom_point).

To add a reference line:

# Scatterplot with a horizontal reference line at y = 0.
# The intercept is a constant, so it is passed as a parameter rather than
# inside aes(): a constant in aes() is evaluated against the plot data and
# draws one identical, overlapping line per row.
ggplot(data = inequality) +
  geom_point(mapping = aes(x = wb_gdppc, y = wb_gini)) +
  geom_hline(yintercept = 0)
## Warning: Removed 152 rows containing missing values (geom_point).

# Scatterplot with a vertical reference line at x = 0.
# The intercept is a constant, so it is passed as a parameter rather than
# inside aes(): a constant in aes() is evaluated against the plot data and
# draws one identical, overlapping line per row.
ggplot(data = inequality) +
  geom_point(mapping = aes(x = wb_gdppc, y = wb_gini)) +
  geom_vline(xintercept = 0)
## Warning: Removed 152 rows containing missing values (geom_point).

Graph a Regression

# Scatterplot of wb_gdppc against wb_gini with a fitted linear regression
# line (method = "lm", y ~ x), a y axis zoomed to 0-12000 with ticks every
# 2000, and descriptive labels.
ggplot(inequality, aes(x = wb_gini, y = wb_gdppc)) +
  geom_point() +
  geom_smooth(method = "lm", formula = y ~ x) +
  scale_y_continuous(breaks = seq(0, 12000, by = 2000)) +
  coord_cartesian(ylim = c(0, 12000)) +
  labs(
    x = "Gini Coefficient",
    y = "GDP per capita (US$)",
    title = "The effect of Inequality on GDP per capita",
    subtitle = "in 2000",
    caption = "Source: World Bank"
  )
## Warning: Removed 152 rows containing non-finite values (stat_smooth).
## Warning: Removed 152 rows containing missing values (geom_point).