Setup: install the tidyverse package

# install.packages("tidyverse")
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 3.5.2
## -- Attaching packages ------------------------------------------------------------------------------------ tidyverse 1.2.1 --
## v ggplot2 3.1.0     v purrr   0.3.0
## v tibble  2.0.1     v dplyr   0.7.8
## v tidyr   0.8.2     v stringr 1.3.1
## v readr   1.3.1     v forcats 0.4.0
## Warning: package 'ggplot2' was built under R version 3.5.2
## Warning: package 'tibble' was built under R version 3.5.2
## Warning: package 'tidyr' was built under R version 3.5.2
## Warning: package 'readr' was built under R version 3.5.2
## Warning: package 'purrr' was built under R version 3.5.2
## Warning: package 'dplyr' was built under R version 3.5.2
## Warning: package 'stringr' was built under R version 3.5.2
## Warning: package 'forcats' was built under R version 3.5.2
## -- Conflicts --------------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

Example Data: Housing prices

Let’s look at housing prices.

housing<-read_csv("C:/Users/atlew/Desktop/R Files/landdata-states.csv")
## Parsed with column specification:
## cols(
##   State = col_character(),
##   region = col_character(),
##   Date = col_double(),
##   Home.Value = col_double(),
##   Structure.Cost = col_double(),
##   Land.Value = col_double(),
##   Land.Share..Pct. = col_double(),
##   Home.Price.Index = col_double(),
##   Land.Price.Index = col_double(),
##   Year = col_double(),
##   Qrtr = col_double()
## )
head(housing[1:5])
## # A tibble: 6 x 5
##   State region  Date Home.Value Structure.Cost
##   <chr> <chr>  <dbl>      <dbl>          <dbl>
## 1 AK    West   2010.     224952         160599
## 2 AK    West   2010.     225511         160252
## 3 AK    West   2010.     225820         163791
## 4 AK    West   2010      224994         161787
## 5 AK    West   2008      234590         155400
## 6 AK    West   2008.     233714         157458

ggplot2 VS Base for simple graphs

Base graphics histogram example:

housing<-read_csv("C:/Users/atlew/Desktop/R Files/landdata-states.csv")
## Parsed with column specification:
## cols(
##   State = col_character(),
##   region = col_character(),
##   Date = col_double(),
##   Home.Value = col_double(),
##   Structure.Cost = col_double(),
##   Land.Value = col_double(),
##   Land.Share..Pct. = col_double(),
##   Home.Price.Index = col_double(),
##   Land.Price.Index = col_double(),
##   Year = col_double(),
##   Qrtr = col_double()
## )
head(housing[1:5])
## # A tibble: 6 x 5
##   State region  Date Home.Value Structure.Cost
##   <chr> <chr>  <dbl>      <dbl>          <dbl>
## 1 AK    West   2010.     224952         160599
## 2 AK    West   2010.     225511         160252
## 3 AK    West   2010.     225820         163791
## 4 AK    West   2010      224994         161787
## 5 AK    West   2008      234590         155400
## 6 AK    West   2008.     233714         157458
hist(housing$Home.Value) 

ggplot2 histogram example:

library(ggplot2)
 
ggplot(housing, aes(x= Home.Value)) +
   geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

##`stat_bin()` using `bins=30`. Pick better value with `binwidth`. 

ggplot2 Base graphics VS ggplot for more complex graphs:

Base graphics colored scatter plot example:

plot(Home.Value ~ Date,
     col=factor(State),
     data=filter(housing, State %in% c("MA", "TX")))
## Warning: package 'bindrcpp' was built under R version 3.5.2
legend("topleft", 
       legend=c("MA", "TX"),
       col=c("black", "red"),
       pch=1)

ggplot2 colored scatter plot example:

ggplot(filter(housing, State %in% c("MA", "TX")),
       aes(x=Date,
           y=Home.Value,
           color=State))+
  geom_point()

ggplot2 wins!

Geometric Objects And Aesthetics

You can get a list of available geometric objects using the code below:

help.search("geom_", package="ggplot2")
## starting httpd help server ... done

Points (Scatterplot)

Now that we know about geometric objects and aesthetic mapping, we can make a ggplot. geom_point requires mappings for x and y, all others are optional.

hp2001Q1<-filter(housing, Date==2001.25)
ggplot(hp2001Q1,
       aes(y=Structure.Cost, x=Land.Value))+
  geom_point(color="blue")

ggplot(hp2001Q1,
       aes(y=Structure.Cost, x=log(Land.Value))) +
  geom_point(color="blue")

Lines (Prediction Line)

A plot constructed with ggplot can have more than one geom. In that case the mappings established in the ggplot() call are plot defaults that can be added to or overridden. Our plot could use a regression line:

hp2001Q1$pred.SC<-predict(lm(Structure.Cost ~ log(Land.Value), data=hp2001Q1))
p1<-ggplot(hp2001Q1, aes(x=log(Land.Value), y=Structure.Cost))
p1 + geom_point(aes(color=Home.Value))+
  geom_line(aes(y=pred.SC))

Smoothers

Not all geometric objects are simple shapes–the smooth geom includes a line and a ribbon.

p1+
  geom_point(aes(color=Home.Value))+
  geom_smooth()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

## 'geom_smooth()' using method = 'loess' and formula 'y ~ x'

Text (Label Points)

Each geom accepts a particualar set of mappings;for example geom_text() accepts a labels mapping.

p1 +
  geom_text(aes(label=State), size=3)

## install.packages("ggrepel") 

library("ggrepel")
## Warning: package 'ggrepel' was built under R version 3.5.2
p1+
  geom_point() +
  geom_text_repel(aes(label=State), size=3)

Aesthetic Mapping VS Assignment

Note that variables are mapped to aesthetics with the aes() function, while fixed aesthetics are set outside the aes() call. This sometimes leads to confusion, as in this example:

p1+
  geom_point(aes(size=2), #incorrect! 2 is not a variable
             color="red") #this is fine--all points red

Mapping Variables To Other Aesthetics

Other aesthetics are mapped in the same way as x and y in the previous example.

p1+
  geom_point(aes(color=Home.Value, shape=region))
## Warning: Removed 1 rows containing missing values (geom_point).

##Warning: Removed 1 rows containing missing values (geom_point). 

Exercise I

The data for the exercises is available in the dataSets/EconomistData.csv file. Read it in with

dat<-read.csv("C:/Users/atlew/Desktop/R Files/EconomistData.csv")

These data consist of Human Development Index and Corruption Perception Index scores for several countries.

  1. Create a scatter plot with CPI on the x axis and HDI on the y axis.
  2. Color the points blue.
  3. Map the color of the the points to Region.
  4. Make the points bigger by setting size to 2
  5. Map the size of the points to HDI.Rank

Exercise I prototype :prototype:

  1. Create a scatter plot with CPI on the x axis and HDI on the y axis.
ggplot(dat, aes(x=CPI, y=HDI))+
  geom_point()

  1. Color the points in the previous plot blue.
ggplot(dat, aes(x=CPI, y=HDI))+
  geom_point(color="blue")

  1. Color the points in the previous plot according to Region.
ggplot(dat, aes(x=CPI, y=HDI, color=Region))+
  geom_point()

  1. Make the points bigger by setting size to 2
ggplot(dat, aes(x=CPI, y=HDI, color=Region))+
  geom_point(size=2)

  1. Make the points bigger by setting size to 2

Statistical Transformations

Statistical Transformations

Some plot types (such as scatterplots) do not require transformations–each point is plotted at x and y coordinates equal to the original value. Other plots, such as boxplots, histograms, prediction lines etc. require statistical transformations:

  • for a boxplot the y values must be transformed to the median and 1.5(IQR)
  • for a smoother smother the y values must be transformed into predicted values

Each geom has a default statistic, but these can be changed. For example, the default statistic for geom_bar is stat_bin:

args(geom_histogram)
## function (mapping = NULL, data = NULL, stat = "bin", position = "stack", 
##     ..., binwidth = NULL, bins = NULL, na.rm = FALSE, show.legend = NA, 
##     inherit.aes = TRUE) 
## NULL
args(stat_bin)
## function (mapping = NULL, data = NULL, geom = "bar", position = "stack", 
##     ..., binwidth = NULL, bins = NULL, center = NULL, boundary = NULL, 
##     breaks = NULL, closed = c("right", "left"), pad = FALSE, 
##     na.rm = FALSE, show.legend = NA, inherit.aes = TRUE) 
## NULL
## function (mapping=NULL, data=NULL, geom="bar", postition="stack")
## ..., binwidth=NULL, bins=NULL, center=NULL, boundary=NULL,
## breaks=NULL, closed=c("right", "left"), pad=FALSE,
## na.rm=FALSE, show.legend=NA, inherit.aes=TRUE)
## NULL

Setting Statistical Transformation Arguments

Arguments to stat_ functions can be passed through geom_ functions. This can be slightly annoying because in order to change it you have to first determine which stat the geom uses, then determine the arguments to that stat.

For example, here is the default histogram of Home.Value:

p2<-ggplot(housing, aes(x=Home.Value))
p2+geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `stat_bin()` using `bins=30`. Pick better value with `binwidth`.

can change it by passing the binwidth argument to the stat_bin function:

p2+geom_histogram(stat="bin", binwidth=4000) 

Changing The Statistical Transformation

Sometimes the default statistical transformation is not what you need. This is often the case with pre-summarized data:

housing.sum<-aggregate(housing["Home.Value"], housing["State"], FUN=mean)
rbind(head(housing.sum), tail(housing.sum))
##    State Home.Value
## 1     AK  147385.14
## 2     AL   92545.22
## 3     AR   82076.84
## 4     AZ  140755.59
## 5     CA  282808.08
## 6     CO  158175.99
## 46    VA  155391.44
## 47    VT  132394.60
## 48    WA  178522.58
## 49    WI  108359.45
## 50    WV   77161.71
## 51    WY  122897.25
ggplot(housing.sum, aes(x=State, y=Home.Value))+
  geom_bar()
## Error: stat_count() must not be used with a y aesthetic.

What is the problem with the previous plot? Basically we take binned and summarized data and ask ggplot to bin and summarize it again (remember, geom_bar defaults to stat = stat_count); obviously this will not work. We can fix it by telling geom_bar to use a different statistical transformation function:

ggplot(housing.sum, aes(x=State, y=Home.Value))+
  geom_bar(stat="identity") 

Exercise II

  1. Re-create a scatter plot with CPI on the x axis and HDI on the y axis (as you did in the previous exercise).
  2. Overlay a smoothing line on top of the scatter plot using geom_smooth.
  3. Overlay a smoothing line on top of the scatter plot using geom_smooth, but use a linear model for the predictions. Hint: see ?stat_smooth.
  4. Overlay a smoothing line on top of the scatter plot using geom_line. Hint: change the statistical transformation.
  5. BONUS: Overlay a smoothing line on top of the scatter plot using the default loess method, but make it less smooth. Hint: see ?loess.

Exercise II prototype :prototype:

  1. Re-create a scatter plot with CPI on the x axis and HDI on the y axis (as you did in the previous exercise).
ggplot(dat, aes(x=CPI, y=HDI))+
  geom_point() 

  1. Overlay a smoothing line on top of the scatter plot using geom_smooth
ggplot(dat, aes(x=CPI, y=HDI))+
  geom_point()+
  geom_smooth()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

  1. Overlay a smoothing line on top of the scatter plot using geom_smooth, but use a linear model for the predictions. Hint: see ?stat_smooth.
ggplot(dat, aes(x=CPI, y=HDI))+
  geom_point()+
  geom_smooth(method="lm")

  1. Overlay a loess (method = “loess”) smoothling line on top of the scatter plot using geom_line. Hint: change the statistical transformation.
ggplot(dat, aes(x=CPI, y=HDI))+
  geom_point()+
  geom_smooth(method="lm")+
  geom_line(method="loess")
## Warning: Ignoring unknown parameters: method

  1. BONUS: Overlay a smoothing line on top of the scatter plot using the loess method, but make it less smooth. Hint: see ?loess.

Scales

Scales: Controlling Aesthetic Mapping

Aesthetic mapping (i.e., with aes()) only says that a variable should be mapped to an aesthetic. It doesn’t say how that should happen. For example, when mapping a variable to shape with aes(shape = x) you don’t say what shapes should be used. Similarly, aes(color = z) doesn’t say what colors should be used. Describing what colors/shapes/sizes etc. to use is done by modifying the corresponding scale. In ggplot2 scales include

  • position
  • color and fill
  • size
  • shape
  • line type

Scales are modified with a series of functions using a scale_<aesthetic>_<type> naming scheme. Try typing scale_<tab> to see a list of scale modification functions.

Common Scale Arguments

The following arguments are common to most scales in ggplot2:

  • name: the first argument gives the axis or legend title
  • limits: the minimum and maximum of the scale
  • breaks: the points along the scale where labels should appear
  • labels: the labels that appear at each break

Specific scale functions may have additional arguments; for example, the scale_color_continuous function has arguments low and high for setting the colors at the low and high end of the scale.

Scale Modification Examples

Start by constructing a dotplot showing the distribution of home values by Date and State.

p3<-ggplot(housing,
           aes(x=State,
               y=Home.Price.Index))+
  theme(legend.position="top",
        axis.text=element_text(size=6))
(p4<-p3+geom_point(aes(color=Date),
                  alpha=0.5,
                  size=1.5,
                  poisition=position_jitter(width=0.25,height=0)))
## Warning: Ignoring unknown parameters: poisition

Now modify the breaks for the x axis and color scales

p4+scale_x_discrete(name="State Abbreviation")+
  scale_color_continuous(name="",
                         breaks=c(1976, 1994, 2013),
                         labels=c("'76", "'94", "'13"))

Next change the low and high values to blue and red:

p4+
  scale_x_discrete(name="State Abreviation")+
  scale_color_continuous(name="",
                         breaks=c(1976, 1994, 2013),
                         labels=c("'76", "'94", "'13"),
                         low="blue", high="red")

library(scales)
## Warning: package 'scales' was built under R version 3.5.2
## 
## Attaching package: 'scales'
## The following object is masked from 'package:purrr':
## 
##     discard
## The following object is masked from 'package:readr':
## 
##     col_factor
##
## Attaching package: 'scales'
## The following object is masked from 'package:purrr':
##
## discard
## The following object is masked from 'package:readr':
##
## col_factor
p4+
  scale_color_continuous(name="",
                         breaks=c(1976, 1994, 2013),
                         labels=c("'76", "'94", "'13"),
                         low=muted("blue"), high=muted("red"))

Using different color scales

ggplot2 has a wide variety of color scales; here is an example using scale_color_gradient2 to interpolate between three different colors.

p4+
  scale_color_gradient2(name="",
                        breaks=c(1976, 1994, 2013),
                        labels=c("'76", "'94", "'13"),
                        low=muted("blue"),
                        high=muted("red"),
                        mid="gray60",
                        midpoint=1994)

Exercise III

  1. Create a scatter plot with CPI on the x axis and HDI on the y axis. Color the points to indicate region.
  2. Modify the x, y, and color scales so that they have more easily-understood names (e.g., spell out “Human development Index” instead of “HDI”).
  3. Modify the color scale to use specific values of your choosing. Hint: see ?scale_color_manual.

Exercise III prototype :prototype:

  1. Create a scatter plot with CPI on the x axis and HDI on the y axis. Color the points to indicate region.
ggplot(dat, aes(x=CPI, y=HDI, color=Region))+
  geom_point() 

  1. Modify the x, y, and color scales so that they have more easily-understood names (e.g., spell out “Human development Index” instead of “HDI”).
ggplot(dat, aes(x=CPI, y=HDI, color=Region))+
  scale_x_discrete(name="Corruption Perception Index")+
  scale_y_discrete(name="Human Development Index")+
  scale_color_discrete(name="",
                       breaks=c("Americas", "Asia Pacific", "East EU Cemt Asia", "EU W. Europe", "MENA", "SSA"),
                       labels=c("Americas", "Asia Pacific", "East Europe Central Asia", "Western Europe", "Middle East and North Africa", "Sub-Saharah Africa"))+
  geom_point()

  1. Modify the color scale to use specific values of your choosing. Hint: see ?scale_color_manual.
ggplot(dat, aes(x=CPI, y=HDI, color=HDI))+
  scale_color_continuous(name="",
                         breaks=c(0.4, 0.6, 0.8),
                         labels=c("0.4", "0.6", "0.8"))+
  geom_point()

Faceting

Faceting

  • Faceting is ggplot2 parlance for small multiples
  • The idea is to create separate graphs for subsets of data
  • ggplot2 offers two functions for creating small multiples:
    1. facet_wrap(): define subsets as the levels of a single grouping variable
    2. facet_grid(): define subsets as the crossing of two grouping variables
  • Facilitates comparison among plots, not just of geoms within a plot

What is the trend in housing prices in each state?

  • Start by using a technique we already know–map State to color:
p5<-ggplot(housing, aes(x=Date,y=Home.Value))
p5+geom_line(aes(color=State))

There are two problems here–there are too many states to distinguish each one by color, and the lines obscure one another.

Faceting to the rescue

We can remedy the deficiencies of the previous plot by faceting by state rather than mapping state to color.

(p5<-p5+geom_line()+
   facet_wrap(~State, ncol=10)) 

There is also a facet_grid() function for faceting in two dimensions.

Themes

Themes

The ggplot2 theme system handles non-data plot elements such as

  • Axis labels
  • Plot background
  • Facet label backround
  • Legend appearance

Built-in themes include:

  • theme_gray() (default)
  • theme_bw()
  • theme_classc()
p5+theme_linedraw()

p5+theme_light() 

Overriding theme defaults

Specific theme elements can be overridden using theme(). For example:

p5+theme_minimal()+
  theme(text=element_text(color="turquoise"))

All theme options are documented in ?theme.

Creating and saving new themes

You can create new themes, as in the following example:

theme_new<-theme_bw()+
  theme(plot.background=element_rect(size=1, color="blue", fill="black"),
        text=element_text(size=12, family="Serif", color="ivory"),
        axis.text.y=element_text(color="purple"),
        axis.text.x=element_text(color="red"),
        panel.background=element_rect(fill="pink"),
        strip.background=element_rect(fill=muted("orange")))
p5+theme_new
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x,
## x$y, : font family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x,
## x$y, : font family not found in Windows font database

The #1 FAQ

Map Aesthetic To Different Columns

The most frequently asked question goes something like this: I have two variables in my data.frame, and I’d like to plot them as separate points, with different color depending on which variable it is. How do I do that?

Wrong

housing.byyear<-aggregate(cbind(Home.Value, Land.Value) ~Date, data=housing,mean)

Putting It All Together

Challenge: Recreate This Economist Graph

Graph source: http://www.economist.com/node/21541178

Building off of the graphics you created in the previous exercises, put the finishing touches to make it as close as possible to the original economist graph.

Challenge Solution :prototype:

Lets start by creating the basic scatter plot, then we can make a list of things that need to be added or changed. The basic plot loogs like this:

dat<-read_csv("EconomistData.csv")
## Parsed with column specification:
## cols(
##   Country = col_character(),
##   HDI.Rank = col_double(),
##   HDI = col_double(),
##   CPI = col_double(),
##   Region = col_character()
## )
pc1<-ggplot(dat, aes(x=CPI, y=HDI, color=Region))
pc1+ geom_point()

To complete this graph we need to:

  • [ ] add a trend line
  • [ ] change the point shape to open circle
  • [ ] change the order and labels of Region
  • [ ] label select points
  • [ ] fix up the tick marks and labels
  • [ ] move color legend to the top
  • [ ] title, label axes, remove legend title
  • [ ] theme the graph with no vertical guides
  • [ ] add model R2 (hard)
  • [ ] add sources note (hard)
  • [ ] final touches to make it perfect (use image editor for this)

Adding the trend line

Adding the trend line is not too difficult, though we need to guess at the model being displyed on the graph. A little bit of trial and error leads to

pc2<-pc1 +
  geom_smooth(mapping=aes(linetype="r2"),
              method="lm",
              formula=y~x+log(x), se=FALSE,
              color="red")
pc2+geom_point()

Notice that we put the geom_line layer first so that it will be plotted underneath the points, as was done on the original graph.

Use open points

This one is a little tricky. We know that we can change the shape with the shape argument, what what value do we set shape to? The example shown in ?shape can help us:

## A look at all 25 symbols
df2<-data.frame(x=1:5, y=1:25, z=1:25)
s<-ggplot(df2, aes(x=x, y=y))
s+geom_point(aes(shape=z), size=4)+scale_shape_identity()

## While all symbols have a foreground color, symbols 19-25 also take a background color (fill)
s+geom_point(aes(shape=z), size=4, color="red")+
  scale_shape_identity()

s+geom_point(aes(shape=z), size=4, color="red", fill="black")+
  scale_shape_identity()

This shows us that shape 1 is an open circle, so

pc2+
  geom_point(shape=1, size=4)

That is better, but unfortunately the size of the line around the points is much narrower than on the original.

(pc3<-pc2+geom_point(shape=1, size=2.5, stroke=1.25)) 

Labelling points

This one is tricky in a couple of ways. First, there is no attribute in the data that separates points that should be labelled from points that should not be. So the first step is to identify those points.

pointsToLabel<-c("Russia", "Venezuela", "Iraq", "Myanmar", "Sudan", "Afghanistan", "Congo", "Greece", "Argentina", "Brazil", "India", "Italy", "China", "South Africa", "Spain", "Botswana", "Cape Verde", "Bhutan", "Rwanda", "France", "United States", "Germany", "Britain", "Barbados", "Norway", "Japan", "New Zealand", "Singapore")

Now we can label these points using geom_text, like this:

(pc4<-pc3+
   geom_text(aes(label=Country),
             color="gray20",
             data=filter(dat, Country %in% pointsToLabel))) 

This more or less gets the information across, but the labels overlap in a most unpleasing fashion. We can use the ggrepel package to make things better, but if you want perfection you will probably have to do some hand-adjustment.

library("ggrepel")
(pc4<-pc3+
    geom_text_repel(aes(label=Country),
                    color="gray20",
                    data=filter(dat, Country %in% pointsToLabel),
                    force=10))

Change the region labels and order

Thinkgs are starting to come together. There are just a couple more things we need to add, and then all that will be left are themeing changes.

Comparing our graph to the original we notice that the labels and order of the Regions in the color legend differ. To correct this we need to change both the labels and order of the Region variable. We can do this with the factor function.

dat$Region<-factor(dat$Region,
                   levels=c("EU W. Europe",
                            "Americas",
                            "Asia Pacific",
                            "East EU Cemt Asia",
                            "MENA",
                            "SSA"),
                   labels=c("OCED",
                            "Americas",
                            "Asia &\nOceania",
                            "Central &\nEastern Europe",
                            "Middle East &\nnorth Africa",
                            "Sub-Saharn \nAfrica"))

Now when we construct the plot using these data the order should appear as it does in the original.

pc4$data<-dat
pc4

Add title and format axes

The next step is to add the title and format the axes. We do that using the scales system in ggplot2.

 library(grid)
(pc5<-pc4+
    scale_x_continuous(name="Corruption Perceptions Index, 2011 (10=least corrupt)",
                       limits=c(.9,10.5),
                       breaks=1:10)+
    scale_y_continuous(name="Human Development Index, 2011 (1=Best)",
                       limits=c(0.2, 1.0))+
    scale_color_manual(name="",
                       values=c("#24576D",
                                "#099DD7",
                                "#28AADC",
                                "#248E84",
                                "#F2583F",
                                "#96503F"))+
    ggtitle("Corruption and Human Development"))

Theme tweaks

Our graph is almost there. To finish up, we need to adjust some of the theme elements, and label the axes and legends. This part usually involves some trial and error as you figure out where things need to be positioned. To see what these various theme settings do you can change them and observe the results.

library(grid) # for the 'unit' function
(pc6<-pc5+
    theme_minimal()+ # start with a minimal theme and add what we need
    theme(text=element_text(color="gray20"),
          legend.position=c("top"), # position the legend in the upper left
          legend.direction="horizontal",
          legend.justification=0.1, # anchor point for the legend.position.
          legend.text=element_text(size=11, color="gray10"),
          axis.text=element_text(face="italic"),
          axis.title.x=element_text(vjust=-1), # move title away from axis
          axis.title.y=element_text(vjust=2), # move away for axis
          axis.ticks.y=element_blank(), # element_blank() is how we remove elements
          axis.line=element_line(color="gray40", size=0.5),
          axis.line.y=element_blank(),
          panel.grid.major=element_line(color="gray50", size=0.5),
          panel.grid.major.x=element_blank()
    ))

Add model R2 and source note

The last bit of information that we want to have on the graph is the variance explained by the model represented by the trend line. Lets fit that model and pull out the R2 first, then think about how to get it onto the graph.

mR2<-summary(lm(HDI~CPI+log(CPI), data=dat))$r.squared
mR2<-paste0(format(mR2, digits=2), "%")

OK, now that we’ve calculated the values, let’s think about how to get them on the graph. ggplot2 has an annotate function, but this is not convenient for adding elements outside the plot area. The grid package has nice functions for doing this, so we’ll use those.

And here it is, our final version!

p<-ggplot(dat,
          mapping=aes(x=CPI, y=HDI))+
  geom_smooth(mapping=aes(linetype="r2"),
              method="lm",
              formula=y~x+log(x), se=FALSE,
              color="red")+
  geom_point(mapping=aes(color=Region),
             shape=1,
             size=4,
             stroke=1.5)+
  geom_text_repel(mapping=aes(label=Country, alpha=labels),
                  color="gray20",
                  data=transform(dat,
                                 labels=Country %in% c("Russia",
                                                       "Venezuela",
                                                       "Iraq",
                                                       "Mayanmar",
                                                       "Sudan",
                                                       "Afghanistan",
                                                       "Congo",
                                                       "Greece",
                                                       "Argentina",
                                                       "Italy",
                                                       "Brazil",
                                                       "India",
                                                       "China",
                                                       "South Africa",
                                                       "Spain",
                                                       "Cape Verde",
                                                       "Bhutan",
                                                       "Rwanda",
                                                       "France",
                                                       "Botswana",
                                                       "US",
                                                       "Germany",
                                                       "Britain",
                                                       "Barbados",
                                                       "Japan",
                                                       "Norway",
                                                       "New Zealand",
                                                       "Singapore")))+
  scale_x_continuous(name="Corruption Perception Index, 2011 (10=least corrupt)",
                     limits=c(1.0, 10.0),
                     breaks=1:10)+
  scale_y_continuous(name="Human Development Index, 2011 (1=best)",
                     limits=c(0.2, 1.0),
                     breaks=seq(0.2, 1.0, by=0.1))+
  scale_color_manual(name="",
                     values=c("#24576D",
                              "#099DD7",
                              "#28AADC",
                              "#248E84",
                              "#F2583F",
                              "#96503F"),
                     guide=guide_legend(nrow=1, order=1))+
  scale_alpha_discrete(range=c(0,1),
                       guide=FALSE)+
  scale_linetype(name="",
                 breaks="r2",
                 labels=list(bquote(R^2==.(mR2))),
                 guide=guide_legend(override.aes=list(linetype=1, size=2, color="red"), order=2))+
  ggtitle("Corruption and human development")+
  labs(caption="Sources: Transparency International; UN Human Development Report")+
  theme_bw()+
  theme(panel.border=element_blank(),
        panel.grid=element_blank(),
        panel.grid.major.y=element_line(color="gray"),
        text=element_text(color="gray20"),
        axis.title.x=element_text(face="italic"),
        axis.title.y=element_text(face="italic"),
        legend.position="top",
        legend.direction="horizontal",
        legend.box="horizontal",
        legend.text=element_text(size=12),
        plot.caption=element_text(hjust=0),
        plot.title=element_text(size=16, face="bold"))
## Warning: Using alpha for a discrete variable is not advised.
## Warning: Using alpha for a discrete variable is not advised.
p

dev.off()
## null device 
##           1
## png
## 2

Comparing it to the original suggests that we’ve got most of the important elements.