I. Basic Graphics

We will be doing simplified reproductions of figures found in Storytelling with Data. When we learn more about ggplot we can personalize and polish our plots more.

1) Heatmapped Table

To create a heatmapped table you might want to use the following packages: ztable or tidyverse.

I had a lot of difficulty installing ztable, so I will recreate heatmapped tables using the tiling feature of ggplot2.

Load the example data
# Build the 5 x 4 example matrix, filled row by row, with the row and
# column labels attached in the same call
bars <- matrix(
  c(4, 3, 1, 1,
    5, 6, 3, 1,
    4, 5, 1, 2,
    4, 5, 3, 2,
    7, 6, 5, 3),
  ncol = 4,
  byrow = TRUE,
  dimnames = list(
    c("Cat1", "Cat2", "Cat3", "Cat4", "Cat5"),
    c("A", "B", "C", "D")
  )
)

# Show the labelled matrix
bars
##      A B C D
## Cat1 4 3 1 1
## Cat2 5 6 3 1
## Cat3 4 5 1 2
## Cat4 4 5 3 2
## Cat5 7 6 5 3
This data is NOT tidy

I use the melt function to reshape the data so that observations are rows and variables are columns. You can also use gather.

#install.packages("reshape2")
library(reshape2)
# Melt the matrix into long format: one row per (row label, column label) cell
barsM <- reshape2::melt(bars)

# Preview the first rows of the melted data
head(barsM)
##   Var1 Var2 value
## 1 Cat1    A     4
## 2 Cat2    A     5
## 3 Cat3    A     4
## 4 Cat4    A     4
## 5 Cat5    A     7
## 6 Cat1    B     3
Now create a tiled plot
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✔ ggplot2 3.3.5     ✔ purrr   0.3.4
## ✔ tibble  3.1.6     ✔ dplyr   1.0.7
## ✔ tidyr   1.1.4     ✔ stringr 1.4.0
## ✔ readr   2.1.1     ✔ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
# Heatmapped table: one tile per (Var1, Var2) cell, shaded by `value`
ggplot(barsM, aes(x = Var1, y = Var2)) + # x and y axes => Var1 and Var2
  # background colours are mapped according to the value column
  geom_tile(aes(fill = value)) +
  # print each cell's value on top of its tile; geom_text has no fill
  # aesthetic, so only the label is mapped here
  geom_text(aes(label = round(value, 2))) +
  scale_fill_gradient2(low = "white",
                       high = "forestgreen")

##### Here is another example from the textbook

## FIG 2.5 (data hosted on GitHub)
heat <- read.delim(
  "https://raw.githubusercontent.com/kitadasmalley/DATA502/main/FALL2022/Data/swdFig2.5.txt",
  header = TRUE
)

# Use the first column as row names, drop it from the data, then melt the
# remaining columns into long format
rownames(heat) <- heat[[1]]
heat <- heat[, -1]
headM <- melt(as.matrix(heat))

# Reverse the level order of Var1 (drawn on the vertical axis after coord_flip())
headM$Var1 <- factor(
  headM$Var1,
  levels = rev(levels(headM$Var1)),
  ordered = TRUE
)

ggplot(data = headM, mapping = aes(x = Var1, y = Var2)) +
  # tile colour follows the value column
  geom_tile(mapping = aes(fill = value)) +
  # cell value printed on each tile
  geom_text(mapping = aes(label = round(value, 2))) +
  scale_fill_gradient2(low = "white", high = "forestgreen") +
  coord_flip()

## NOTE: tidyr's gather function can do the same reshaping
heat <- read.delim(
  "https://raw.githubusercontent.com/kitadasmalley/DATA502/main/FALL2022/Data/swdFig2.5.txt",
  header = TRUE
)

# Stack every column except X into type/percent pairs
headG <- gather(heat, "type", "percent", -X)

2) Simple Text

Here is a simple example where text would suffice:

## RECREATE FIG2.2 (DATA ON GITHUB)
mothers<-read.csv("https://raw.githubusercontent.com/adamribaudo/storytelling-with-data-ggplot/master/data/FIG0202-3.csv",
                header=TRUE, 
                stringsAsFactors = FALSE)

# geom_col draws bars whose heights are taken directly from the y variable
# (Value), rather than counting individual observations
ggplot(mothers, aes(x=as.factor(Year), y=Value))+
  geom_col()+
  xlab("Year")+
  ylab("Percent")+
  theme_minimal()+
  ggtitle("Percent Stay-at-Home")

3) Scatter Plot

Import the data
## RECREATE FIG2.6 and FIG2.7
## (data hosted on GitHub)
drive <- read.csv(
  "https://raw.githubusercontent.com/adamribaudo/storytelling-with-data-ggplot/master/data/FIG0206-7.csv",
  header = TRUE,
  stringsAsFactors = FALSE
)

## Inspect the structure to see which columns and types were read in
str(drive)
## 'data.frame':    36 obs. of  2 variables:
##  $ Miles.Driven : chr  "1,100" "1,177" "1,239" "1,294" ...
##  $ Cost.Per.Mile: chr  "$2.40" "$2.80" "$2.20" "$2.50" ...
Remove Currency Symbols

When R sees non-numeric characters anywhere in a column, it will assume the column is composed of strings. We use gsub to find and replace these characters.

# Strip "$" and "," from the text columns so they can be converted to numbers
drive[["Cost.Per.Mile2"]] <- as.numeric(
  gsub("[\\$,]", "", drive[["Cost.Per.Mile"]])
)
drive[["Miles.Driven2"]] <- as.numeric(
  gsub("[\\$,]", "", drive[["Miles.Driven"]])
)

# Confirm the two new numeric columns
str(drive)
## 'data.frame':    36 obs. of  4 variables:
##  $ Miles.Driven  : chr  "1,100" "1,177" "1,239" "1,294" ...
##  $ Cost.Per.Mile : chr  "$2.40" "$2.80" "$2.20" "$2.50" ...
##  $ Cost.Per.Mile2: num  2.4 2.8 2.2 2.5 1.9 2 2.2 1.35 2 1.3 ...
##  $ Miles.Driven2 : num  1100 1177 1239 1294 1378 ...
Create above average variable
# Flag rows whose cost per mile is at or above the mean cost per mile
above <- mutate(drive, above = (Cost.Per.Mile2 >= mean(Cost.Per.Mile2)))
Plot your data
# Scatter plot of cost per mile against miles driven, coloured by the
# above/below-average flag
ggplot(above, aes(Miles.Driven2, Cost.Per.Mile2, color=above))+
  geom_point()+
  # dashed horizontal reference line at a cost per mile of 1.5
  geom_hline(yintercept=1.5, linetype="dashed", size=1)+
  # one large black marker at (mean miles, mean cost); annotate() draws it
  # once instead of repeating an identical point for every row of the data
  annotate("point",
           x=mean(above$Miles.Driven2),
           y=mean(above$Cost.Per.Mile2),
           color="black", size=5)+
  xlab("Miles driven per month")+
  ylab("Cost per mile")+
  theme_minimal()+
  theme(legend.position = "none")+
  ggtitle("Scatterplot: Miles Driven vs Cost Per Mile")

4) Line Graph

In this example, we need to use a date type variable. This can be done with the package lubridate.

## RECREATE FIG2.8
#install.packages("lubridate")
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
lineEx <- read.csv(
  "https://raw.githubusercontent.com/adamribaudo/storytelling-with-data-ggplot/master/data/FIG0209.csv",
  header = TRUE,
  stringsAsFactors = FALSE
)

# Build a date column from Year and Month, using day 1 of each month
lineDate <- mutate(lineEx, date = ymd(paste(Year, Month, 1)))

# Line graph of Avg over time
ggplot(data = lineDate, mapping = aes(x = date, y = Avg)) +
  geom_line() +
  theme_minimal()

5) Slope Graph

Warning: These data are not tidy. Use the gather function!

## RECREATE FIG2.10

slope <- read.csv(
  "https://raw.githubusercontent.com/adamribaudo/storytelling-with-data-ggplot/master/data/FIG0210-11.csv",
  header = TRUE,
  stringsAsFactors = FALSE
)

# Rename the columns so the two year columns are labelled by year
names(slope) <- c("Item", "2014", "2015")

# Reshape to one row per Item and year, then turn the percent text
# (e.g. values carrying a "%" sign) into a proportion
slopeT <- slope %>%
  gather("year", "percent", -Item) %>%
  mutate(percent2 = as.numeric(gsub("[\\%]", "", percent)) / 100)

# One line per Item connecting its values across the years
ggplot(data = slopeT, mapping = aes(x = year, y = percent2, group = Item)) +
  geom_line() +
  theme_minimal()

6) Bar Graphs

Let’s create some fake data for this:

# Same construction as the book: a numeric matrix of counts ...
bars <- matrix(
  c(4, 3, 1, 1,
    5, 6, 3, 1,
    4, 5, 1, 2,
    4, 5, 3, 2,
    7, 6, 5, 3),
  nrow = 5,
  byrow = TRUE
)

# ... with a category label column bound on the left. Binding text to
# numbers turns the whole matrix into character.
bars <- cbind(paste0("Cat", 1:5), bars)

colnames(bars) <- c("Category", LETTERS[1:4])

# Tidy the data: stack columns 2 to 5 into Letter/Count pairs
barsT <- gather(as.data.frame(bars), "Letter", "Count", 2:5)

# Count came from a character matrix, so convert it back to numeric
barsT$Count <- as.numeric(barsT$Count)

There are many different kinds of bar graphs:

A. Vanilla (Vertical)
# Vertical bars: Count on the y axis for each Category
ggplot(data = barsT, mapping = aes(x = Category, y = Count)) +
  geom_col()

B. Vertical Stacked Bar Chart
# Stacked bars: each Category bar is split by Letter
ggplot(data = barsT, mapping = aes(x = Category, y = Count, fill = Letter)) +
  geom_col()

B2. Vertical Stacked Bar Chart as Percentages
# Stacked bars with position "fill": every bar is scaled to the same height
ggplot(data = barsT, mapping = aes(x = Category, y = Count, fill = Letter)) +
  geom_col(position = "fill")

C. Vertical Side-by-side Bar Chart
# Side-by-side bars: one bar per Letter within each Category
ggplot(data = barsT, mapping = aes(x = Category, y = Count, fill = Letter)) +
  geom_col(position = "dodge")

D. Horizontal Bar Chart

Any of the plots above can be made horizontal!

# Same bars as the vanilla chart, with the axes flipped to run horizontally
ggplot(data = barsT, mapping = aes(x = Category, y = Count)) +
  geom_col() +
  coord_flip()

II. Esquisse

Since we are learning how to make graphics for the first time, I suggest using this helpful tool!

#install.packages("esquisse")
library(esquisse)

Please load the data for the examples that I have pre-processed

A. Mothers
## MOTHER EXAMPLE
mothers <- read.csv(
  "https://raw.githubusercontent.com/adamribaudo/storytelling-with-data-ggplot/master/data/FIG0202-3.csv",
  header = TRUE,
  stringsAsFactors = FALSE
)
B. Drive
## MILES DRIVEN
drive <- read.csv(
  "https://raw.githubusercontent.com/adamribaudo/storytelling-with-data-ggplot/master/data/FIG0206-7.csv",
  header = TRUE,
  stringsAsFactors = FALSE
)

# Add numeric copies of the two text columns with "$" and "," removed
drive <- drive %>%
  mutate(
    Cost.Per.Mile2 = as.numeric(gsub("[\\$,]", "", Cost.Per.Mile)),
    Miles.Driven2 = as.numeric(gsub("[\\$,]", "", Miles.Driven))
  )

# Flag rows whose cost per mile is at or above the mean cost per mile
above <- mutate(drive, above = (Cost.Per.Mile2 >= mean(Cost.Per.Mile2)))
C. Line Date
## PASSPORT WAIT
#install.packages("lubridate")
library(lubridate)

lineEx <- read.csv(
  "https://raw.githubusercontent.com/adamribaudo/storytelling-with-data-ggplot/master/data/FIG0209.csv",
  header = TRUE,
  stringsAsFactors = FALSE
)

# Build a date column from Year and Month, using day 1 of each month
lineDate <- mutate(lineEx, date = ymd(paste(Year, Month, 1)))
D. Slope T
## EMPLOYEE FEEDBACK
slope <- read.csv(
  "https://raw.githubusercontent.com/adamribaudo/storytelling-with-data-ggplot/master/data/FIG0210-11.csv",
  header = TRUE,
  stringsAsFactors = FALSE
)

# Rename the columns so the two year columns are labelled by year
names(slope) <- c("Item", "2014", "2015")

# Reshape to one row per Item and year, then strip the "%" sign and
# divide by 100 to get a proportion
slopeT <- slope %>%
  gather("year", "percent", -Item) %>%
  mutate(percent2 = as.numeric(gsub("[\\%]", "", percent)) / 100)

III: Let’s Practice

### MEALS
meals <- read.delim(
  "https://raw.githubusercontent.com/kitadasmalley/DATA502/main/FALL2022/Data/ch2_meals.txt",
  header = TRUE
)

# Inspect the columns that were read in
str(meals)
## 'data.frame':    10 obs. of  2 variables:
##  $ Campaign.Year: int  2010 2011 2012 2013 2014 2015 2016 2017 2018 2019
##  $ Meals.Served : int  40139 127020 168193 153115 202102 232897 277912 205350 233389 232797
### HEATMAP
# Single column of tiles (x fixed at 1), one tile per campaign year
ggplot(data = meals, mapping = aes(x = 1, y = Campaign.Year)) +
  # tile colour follows the number of meals served
  geom_tile(mapping = aes(fill = Meals.Served)) +
  # print the number of meals served on each tile
  geom_text(mapping = aes(label = round(Meals.Served, 2))) +
  scale_fill_gradient2(low = "white", high = "forestgreen") +
  theme_minimal()

### BAR GRAPH
# One bar per campaign year, with bar height equal to Meals.Served
ggplot(meals, aes(x=Campaign.Year, y=Meals.Served))+
  geom_bar(stat="identity")+
  # Campaign.Year is an integer on a continuous axis; label every year
  # explicitly, since default breaks can land on non-integer values
  scale_x_continuous(breaks=meals$Campaign.Year)+
  theme_minimal()

### LINE PLOT
# Meals served over the campaign years, drawn as a single line
ggplot(meals, aes(x=Campaign.Year, y=Meals.Served))+
  geom_line()+
  # Campaign.Year is an integer on a continuous axis; label every year
  # explicitly, since default breaks can land on non-integer values
  scale_x_continuous(breaks=meals$Campaign.Year)+
  theme_minimal()