STAT3000 091824

In this section, we will learn how to make presentation-ready plots using ggplot2 and base R. In particular, we will learn how to use ggplot to build plots suitable for presentations.

# Load multiple packages in one call. character.only = TRUE tells library()
# to treat each element as a package-name string. lapply() returns (and
# therefore prints) the value of each library() call, i.e. the attached
# packages after each load.
lapply(
  c("ggplot2", "readr", "tidyverse", "RColorBrewer"),
  library,
  character.only = TRUE
)

## [[1]]
## [1] "ggplot2"   "stats"     "graphics"  "grDevices" "utils"     "datasets" 
## [7] "methods"   "base"     
## 
## [[2]]
## [1] "readr"     "ggplot2"   "stats"     "graphics"  "grDevices" "utils"    
## [7] "datasets"  "methods"   "base"     
## 
## [[3]]
##  [1] "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"     "tidyr"    
##  [7] "tibble"    "tidyverse" "readr"     "ggplot2"   "stats"     "graphics" 
## [13] "grDevices" "utils"     "datasets"  "methods"   "base"     
## 
## [[4]]
##  [1] "RColorBrewer" "lubridate"    "forcats"      "stringr"      "dplyr"       
##  [6] "purrr"        "tidyr"        "tibble"       "tidyverse"    "readr"       
## [11] "ggplot2"      "stats"        "graphics"     "grDevices"    "utils"       
## [16] "datasets"     "methods"      "base"

# Read the child-mortality table. check.names = FALSE keeps the year headers
# (e.g. "1760") exactly as they appear in the file instead of rewriting them
# into syntactic R names.
mort_dataframe <- read.csv(
  file = "~/indicatordeadkids35.csv",
  check.names = FALSE
)
# readr alternative:
# mort_tibble <- read_csv("./indicatordeadkids35.csv")

# Inspect the column types and the first few values of every column.
str(mort_dataframe)

## 'data.frame':    197 obs. of  255 variables:
##  $     : chr  "Afghanistan" "Albania" "Algeria" "Angola" ...
##  $ 1760: num  NA NA NA NA NA NA NA NA NA NA ...
##  $ 1761: num  NA NA NA NA NA NA NA NA NA NA ...
##  $ 1762: num  NA NA NA NA NA NA NA NA NA NA ...
##  $ 1763: num  NA NA NA NA NA NA NA NA NA NA ...
##  $ 1764: num  NA NA NA NA NA NA NA NA NA NA ...
##  $ 1765: num  NA NA NA NA NA NA NA NA NA NA ...
##  $ 1766: num  NA NA NA NA NA NA NA NA NA NA ...
##  $ 1767: num  NA NA NA NA NA NA NA NA NA NA ...
##  $ 1768: num  NA NA NA NA NA NA NA NA NA NA ...
##  $ 1769: num  NA NA NA NA NA NA NA NA NA NA ...
##  $ 1770: num  NA NA NA NA NA NA NA NA NA NA ...
##  $ 1771: num  NA NA NA NA NA NA NA NA NA NA ...
##  $ 1772: num  NA NA NA NA NA NA NA NA NA NA ...
##  $ 1773: num  NA NA NA NA NA NA NA NA NA NA ...
##  $ 1774: num  NA NA NA NA NA NA NA NA NA NA ...
##  $ 1775: num  NA NA NA NA NA NA NA NA NA NA ...
##  $ 1776: num  NA NA NA NA NA NA NA NA NA NA ...
##  $ 1777: num  NA NA NA NA NA NA NA NA NA NA ...
##  $ 1778: num  NA NA NA NA NA NA NA NA NA NA ...
##  $ 1779: num  NA NA NA NA NA NA NA NA NA NA ...
##  $ 1780: num  NA NA NA NA NA NA NA NA NA NA ...
##  $ 1781: num  NA NA NA NA NA NA NA NA NA NA ...
##  $ 1782: num  NA NA NA NA NA NA NA NA NA NA ...
##  $ 1783: num  NA NA NA NA NA NA NA NA NA NA ...
##  $ 1784: num  NA NA NA NA NA NA NA NA NA NA ...
##  $ 1785: num  NA NA NA NA NA NA NA NA NA NA ...
##  $ 1786: num  NA NA NA NA NA NA NA NA NA NA ...
##  $ 1787: num  NA NA NA NA NA NA NA NA NA NA ...
##  $ 1788: num  NA NA NA NA NA NA NA NA NA NA ...
##  $ 1789: num  NA NA NA NA NA NA NA NA NA NA ...
##  $ 1790: num  NA NA NA NA NA NA NA NA NA NA ...
##  $ 1791: num  NA NA NA NA NA NA NA NA NA NA ...
##  $ 1792: num  NA NA NA NA NA NA NA NA NA NA ...
##  $ 1793: num  NA NA NA NA NA NA NA NA NA NA ...
##  $ 1794: num  NA NA NA NA NA NA NA NA NA NA ...
##  $ 1795: num  NA NA NA NA NA NA NA NA NA NA ...
##  $ 1796: num  NA NA NA NA NA NA NA NA NA NA ...
##  $ 1797: num  NA NA NA NA NA NA NA NA NA NA ...
##  $ 1798: num  NA NA NA NA NA NA NA NA NA NA ...
##  $ 1799: num  NA NA NA NA NA NA NA NA NA NA ...
##  $ 1800: num  NA 2.36 NA 4.44 3.7 ...
##  $ 1801: num  NA 2.36 NA 4.44 3.7 ...
##  $ 1802: num  NA 2.36 NA 4.44 3.7 ...
##  $ 1803: num  NA 2.36 NA 4.44 3.7 ...
##  $ 1804: num  NA 2.36 NA 4.44 3.7 ...
##  $ 1805: num  NA 2.36 NA 4.44 3.7 ...
##  $ 1806: num  NA 2.36 NA 4.44 3.7 ...
##  $ 1807: num  NA 2.36 NA 4.44 3.7 ...
##  $ 1808: num  NA 2.36 NA 4.44 3.7 ...
##  $ 1809: num  NA 2.36 NA 4.44 3.7 ...
##  $ 1810: num  NA 2.36 NA 4.44 3.7 ...
##  $ 1811: num  NA 2.36 NA 4.44 3.7 ...
##  $ 1812: num  NA 2.36 NA 4.45 3.7 ...
##  $ 1813: num  NA 2.36 NA 4.45 3.7 ...
##  $ 1814: num  NA 2.36 NA 4.45 3.7 ...
##  $ 1815: num  NA 2.36 NA 4.45 3.7 ...
##  $ 1816: num  NA 2.36 NA 4.45 3.7 ...
##  $ 1817: num  NA 2.36 NA 4.45 3.7 ...
##  $ 1818: num  NA 2.36 NA 4.45 3.7 ...
##  $ 1819: num  NA 2.36 NA 4.45 3.7 ...
##  $ 1820: num  NA 2.36 NA 4.45 3.7 ...
##  $ 1821: num  NA 2.36 NA 4.45 3.7 ...
##  $ 1822: num  NA 2.36 NA 4.45 3.7 ...
##  $ 1823: num  NA 2.36 NA 4.45 3.7 ...
##  $ 1824: num  NA 2.36 NA 4.45 3.7 ...
##  $ 1825: num  NA 2.36 NA 4.45 3.7 ...
##  $ 1826: num  NA 2.36 NA 4.45 3.7 ...
##  $ 1827: num  NA 2.36 NA 4.45 3.7 ...
##  $ 1828: num  NA 2.36 NA 4.45 3.7 ...
##  $ 1829: num  NA 2.36 NA 4.45 3.7 ...
##  $ 1830: num  NA 2.36 NA 4.45 3.7 ...
##  $ 1831: num  NA 2.36 NA 4.45 3.7 ...
##  $ 1832: num  NA 2.36 NA 4.45 3.7 ...
##  $ 1833: num  NA 2.36 NA 4.46 3.7 ...
##  $ 1834: num  NA 2.36 NA 4.46 3.7 ...
##  $ 1835: num  NA 2.36 NA 4.46 3.7 ...
##  $ 1836: num  NA 2.36 NA 4.46 3.7 ...
##  $ 1837: num  NA 2.36 NA 4.46 3.7 ...
##  $ 1838: num  NA 2.36 NA 4.46 3.7 ...
##  $ 1839: num  NA 2.36 NA 4.46 3.7 ...
##  $ 1840: num  NA 2.36 NA 4.46 3.7 ...
##  $ 1841: num  NA 2.36 NA 4.46 3.7 ...
##  $ 1842: num  NA 2.36 NA 4.46 3.7 ...
##  $ 1843: num  NA 2.36 NA 4.46 3.7 ...
##  $ 1844: num  NA 2.36 NA 4.46 3.7 ...
##  $ 1845: num  NA 2.36 NA 4.46 3.7 ...
##  $ 1846: num  NA 2.36 NA 4.46 3.7 ...
##  $ 1847: num  NA 2.36 NA 4.46 3.7 ...
##  $ 1848: num  NA 2.36 NA 4.46 3.7 ...
##  $ 1849: num  NA 2.36 NA 4.46 3.7 ...
##  $ 1850: num  NA 2.36 NA 4.46 3.7 ...
##  $ 1851: num  NA 2.36 NA 4.46 3.7 ...
##  $ 1852: num  NA 2.36 NA 4.46 3.7 ...
##  $ 1853: num  NA 2.36 NA 4.46 3.7 ...
##  $ 1854: num  NA 2.36 NA 4.46 3.7 ...
##  $ 1855: num  NA 2.36 NA 4.47 3.7 ...
##  $ 1856: num  NA 2.36 NA 4.47 3.7 ...
##  $ 1857: num  NA 2.36 NA 4.47 3.7 ...
##   [list output truncated]

We will need to prepare this dataset first by

giving the first column a descriptive column name,

# The first column (country names) is read in with an empty header;
# give it a descriptive name.
colnames(mort_dataframe)[1L] <- "country"
# equivalent for a data frame: names(mort_dataframe)[1] <- "country"
# dplyr alternative for the tibble version:
# rename(mort_tibble, country = "...1")

transform to a long version of the dataset,

# Reshape wide -> long: every column except `country` (one per year) is
# stacked into a pair of columns, `year` (the old column name) and `morts`
# (the value that was stored in it).
long <- mort_dataframe %>%
  pivot_longer(
    cols = -country,
    names_to = "year",
    values_to = "morts"
  )
str(long)

## tibble [50,038 × 3] (S3: tbl_df/tbl/data.frame)
##  $ country: chr [1:50038] "Afghanistan" "Afghanistan" "Afghanistan" "Afghanistan" ...
##  $ year   : chr [1:50038] "1760" "1761" "1762" "1763" ...
##  $ morts  : num [1:50038] NA NA NA NA NA NA NA NA NA NA ...

convert the new year column to numeric type.

# After pivoting, `year` holds the former column names and is therefore
# character; convert it to numeric so it can be used as a continuous axis.
long <- long %>%
  mutate(year = as.numeric(year))
str(long)

## tibble [50,038 × 3] (S3: tbl_df/tbl/data.frame)
##  $ country: chr [1:50038] "Afghanistan" "Afghanistan" "Afghanistan" "Afghanistan" ...
##  $ year   : num [1:50038] 1760 1761 1762 1763 1764 ...
##  $ morts  : num [1:50038] NA NA NA NA NA NA NA NA NA NA ...

Let’s do some analysis.

Compute the average mortality rate per country

# One row per country: mean of `morts` over all years, skipping NA values.
avg_country <- long %>%
  group_by(country) %>%
  summarize(am = mean(morts, na.rm = TRUE))

Compute the average world mortality rate per year

# One row per year: mean of `morts` over all countries, skipping NA values.
avg_year <- long %>%
  group_by(year) %>%
  summarize(ay = mean(morts, na.rm = TRUE))

Plotting with ggplot2

Now, we will learn how to use the more general ggplot function to create layered graphics.

When creating a plot, there are two essential attributes of the plot you need to specify: aesthetics and geoms

Aesthetics are mappings between the variables in the data and visual properties in the plots. Aesthetics are set in the aes() function and the most common aesthetics are

x
y
color
size
fill
shape
linetype
group

If you set these inside aes(), you map them to a variable in the data. If you want a single fixed value for all observations (e.g. every point drawn in red), set it outside aes(), as an argument to the geom.

The other essential element of a ggplot is a geom layer to determine how the data will be plotted.

geom_point - add points
geom_line - add lines
geom_density - add density plot
geom_histogram - add a histogram
geom_smooth - add a smoother
geom_boxplot - add a boxplot
geom_bar - add a bar chart
geom_tile - rectangles/heatmaps

Let’s look at the mortality rate in Sweden over time in a scatterplot.

# Keep only Sweden's rows, then create the base plot object holding the data
# and the aesthetic mappings (year on x, mortality on y). No geom layer has
# been added yet, so printing `g` shows only the empty axes.
sweden_long <- filter(long, country == "Sweden")
g <- ggplot(sweden_long, aes(x = year, y = morts))
g

# Scatterplot: one point per (year, morts) row.
g + geom_point()

# Line plot: the same observations connected in order of year.
g + geom_line()

# Layers are added with `+`: a line plus a smoothed trend drawn on top.
g + geom_line() + geom_smooth()

#### try others

# Filled area between the curve and the x axis.
g + geom_area()

# NOTE(review): violins summarise a distribution per group; with a continuous
# x such as `year` and no grouping this presumably draws a single violin,
# confirm that this is the intended illustration.
g + geom_violin()

If we want to change the data we are using in the plot, we need to make a new call to ggplot. For example, now let’s look at the mortality rates over time using line plots for each of the countries: United States, United Kingdom, Sweden, Afghanistan, Rwanda. To get a line for each country individually, we need to specify the group aesthetic and map it to the country variable.

# Restrict the long table to the five countries being compared.
sub <- filter(
  long,
  country %in% c(
    "United States", "United Kingdom", "Sweden", "Afghanistan", "Rwanda"
  )
)

# New data means a new ggplot() call. group = country makes geom_line() draw
# one line per country rather than a single path through all rows.
g <- ggplot(sub, aes(x = year, y = morts, group = country))
g + geom_line()

Note that we have a single plot with a trajectory over time of the mortality rates for each of these five countries, but we cannot tell which country corresponds to which line. We will see how to fix this by using color and a legend in the upcoming sections.

# Store the line plot with a title, subtitle and axis labels as `gg`
# so that later examples can build on it.
gg <- g +
  geom_line() +
  labs(
    title = "Child Mortality Rates",
    subtitle = "Stratified by Country",
    x = "Year",
    y = "Mortality Rate"
  )
gg

The x and y axis limits can be adjusted using the xlim() and ylim() functions to change the view of the plotting regions. For example, let’s zoom in on the years 1900-2000 for the bottom three lines. The mortality rates in this region appear to range from 0 to 1.5.

# Restrict the axes to 1900-2000 and 0-1.5. xlim()/ylim() set scale limits,
# so observations outside them are dropped (ggplot2 warns about removed
# rows); coord_cartesian() is the alternative that zooms without dropping.
gg +
  xlim(1900, 2000) +
  ylim(0, 1.5)

We may also want to change the position and appearance of the text appearing in the titles or axes. In order to make these changes, we need to use the theme function (see ?theme for all this function can do). theme controls most of the look and feel of the plot. The arguments passed to theme components are required to be set using special element_type() functions. There are four major types.

element_text(): used to set text element attributes such as labels and titles
element_line(): used to modify line based components such as the axis lines, major and minor grid lines, etc.
element_rect(): modifies rectangle components such as plot and panel background
element_blank(): draws nothing; assign it to a theme element to turn that element off.

Inside element_text we can set

size - adjusts size of the text
face - font face (“plain”, “italic”, “bold”, “bold.italic”)
family - font family
color - font color
hjust/vjust - horizontal/vertical justification (a number between 0 and 1)
lineheight - spacing between lines in multi-line text
angle - text rotation angle

# Restyle the text elements of `gg` through theme(); each element takes an
# element_text() describing how it should be drawn.
# NOTE(review): "American Typewriter" must be available to the graphics
# device; presumably a fallback font is used otherwise, verify locally.
gg +
  theme(
    # Main title: large, bold, coloured and centred.
    plot.title = element_text(
      family = "American Typewriter",
      face = "bold",
      size = 20,
      color = "tomato",
      hjust = 0.5,
      lineheight = 1.2
    ),
    # Subtitle: same family and weight, slightly smaller, centred.
    plot.subtitle = element_text(
      family = "American Typewriter",
      face = "bold",
      size = 15,
      hjust = 0.5
    ),
    # X axis title
    axis.title.x = element_text(size = 15, vjust = .5),
    # Y axis title
    axis.title.y = element_text(size = 15)
  )

## Plotting Characters, Line Types and Colors

# shape is set outside aes(), so the same plotting character (2, an open
# triangle) is used for every point.
ggplot(sweden_long, aes(x = year, y = morts)) +
  geom_point(shape = 2)

R also has 7 different line types that can be chosen by the numbers 0 to 6 or by name (e.g. “blank”, “solid”, “dashed”, etc.). We set this by using the linetype aesthetic.

# linetype is set outside aes(), so the whole line uses type 2 ("dashed").
ggplot(sweden_long, aes(x = year, y = morts)) +
  geom_line(linetype = 2)

# Mapping shape inside aes() gives each country its own plotting character
# and adds a legend.
gg +
  geom_point(aes(shape = country))

# Mapping linetype inside aes() gives each country its own line pattern
# and adds a legend.
gg +
  geom_line(aes(linetype = country))

We can also modify the colors used in the plots by using the color aesthetic or the fill aesthetic depending on the plot type.

# Mapping color inside aes() colours the points by country and adds a legend
# (`col` and `color` are interchangeable spellings of the same aesthetic).
gg +
  geom_point(aes(color = country))

It’s actually pretty hard to make a good color palette. Luckily, smart and artistic people have spent a lot more time thinking about this. The result is the RColorBrewer package

# Install RColorBrewer only when it is not already available, so that
# re-running the script does not reinstall the package every time.
if (!requireNamespace("RColorBrewer", quietly = TRUE)) {
  install.packages("RColorBrewer")
}
library(RColorBrewer)

# Colour the points by country with the ColorBrewer "Dark2" palette.
# "Dark2" is a qualitative palette (for unordered categories such as
# country), so `type` is given as "qual" rather than "seq"; the palette is
# selected by name, so the colours drawn are unchanged.
gg +
  geom_point(aes(color = country)) +
  scale_color_brewer(type = "qual", palette = "Dark2")

### Modifying a Legend

Concatenating strings

# paste() joins its arguments into strings; called with no arguments it
# returns an empty character vector.
paste()

## character(0)

# paste0() is paste() with no separator; called with no arguments it also
# returns an empty character vector.
paste0()

## character(0)

# Customise the legend through the colour scale: `name` sets the legend
# title and `labels` replaces the entry text with "Country 1" ... "Country 5"
# (built with paste()). "Dark2" is a qualitative palette, so `type` is
# "qual"; the palette is selected by name, so the colours are unchanged.
gg +
  geom_line(aes(color = country)) +
  scale_color_brewer(
    type = "qual",
    palette = "Dark2",
    name = "Country",
    labels = paste("Country", 1:5)
  )

 # Same legend title and labels as before, with the legend moved below the
 # plot via theme(legend.position = "bottom"). "Dark2" is a qualitative
 # palette, so `type` is "qual"; the colours drawn are unchanged.
 gg +
   geom_line(aes(color = country)) +
   scale_color_brewer(
     type = "qual",
     palette = "Dark2",
     name = "Country",
     labels = paste("Country", 1:5)
   ) +
   theme(legend.position = "bottom")

# Same legend title and labels, with the legend placed inside the plotting
# area at relative coordinates (0.85, 0.95). "Dark2" is a qualitative
# palette, so `type` is "qual"; the colours drawn are unchanged.
# NOTE(review): a numeric legend.position is deprecated as of ggplot2 3.5.0
# in favour of legend.position = "inside" plus legend.position.inside;
# switch once the minimum ggplot2 version for the course is confirmed.
gg +
  geom_line(aes(color = country)) +
  scale_color_brewer(
    type = "qual",
    palette = "Dark2",
    name = "Country",
    labels = paste("Country", 1:5)
  ) +
  theme(legend.position = c(0.85, 0.95))

## Drawing Multiple Plots in a Single Figure

# Split the plot into one panel per country. `gg` already contains a line
# layer; points and a coloured line layer are added on top of it.
gg +
  geom_point() +
  geom_line(aes(color = country)) +
  facet_wrap(vars(country))

We can adjust the layout and number of rows and columns by using nrow and ncol in facet_wrap.

# Same faceted plot built from scratch; nrow = 1 places all country panels
# side by side in a single row.
ggplot(sub, aes(x = year, y = morts)) +
  geom_point() +
  geom_line(aes(color = country)) +
  facet_wrap(vars(country), nrow = 1)

ggplot2 provides ggsave to save plots in a number of formats, such as .png or .pdf. This function saves the last plot that you displayed.

# Display the plot first: ggsave() is called without a `plot` argument, so
# it writes the most recently displayed plot.
gg

# Width and height are given in ggsave()'s default units (inches).
ggsave(filename = "./fitplot.png", width = 4, height = 4)

Some neat tricks

pipes: %>% The simple form of the forward pipe inserts the left-hand side as the first argument in the right-hand side call.

By default, the pipe passes the object on its left-hand side to the first argument of the function on the right-hand side. %>% allows you to change the placement with a . placeholder. For example, x %>% f(1) is equivalent to f(x, 1) but x %>% f(1, .) is equivalent to f(1, x).

Exercise

For these exercises, we will use the Charm City Circulator bus ridership dataset, Charm_City_Circulator_Ridership.csv. After changing the path to point to the dataset's location on your computer, use the following code to read in the dataset and transform it so that it is ready for plotting.

#library(lubridate)
#circ <- read.csv("./Charm_City_Circulator_Ridership.csv",check.names = FALSE)
#str(circ)
#circ <- mutate(circ,date = mdy(date))