Data Viz: Basic Graphics

I: Basic Graphics

We will be doing simplified reproductions of figures found in Storytelling with Data. When we learn more about ggplot we can personalize and polish our plots more.

1) Heatmapped Table

To create a heatmapped table you might want to use the following packages: ztable or tidyverse.

I had a lot of difficulty installing ztable, so I will recreate heatmapped tables using the tiling feature of ggplot2.
#### Load the example data

# Example counts: five categories (rows) by four groups (columns).
# Row and column labels are attached when the matrix is built.
bars <- matrix(
  c(
    4, 3, 1, 1,
    5, 6, 3, 1,
    4, 5, 1, 2,
    4, 5, 3, 2,
    7, 6, 5, 3
  ),
  nrow = 5,
  byrow = TRUE,
  dimnames = list(paste0("Cat", 1:5), LETTERS[1:4])
)

bars

##      A B C D
## Cat1 4 3 1 1
## Cat2 5 6 3 1
## Cat3 4 5 1 2
## Cat4 4 5 3 2
## Cat5 7 6 5 3

This data is NOT tidy

I use the melt function to reshape the data so that observations are rows and variables are columns.

#install.packages("reshape2")
library(reshape2)

## Warning: package 'reshape2' was built under R version 3.6.2

# Reshape the wide matrix into long format: one row per matrix cell,
# identified by its row name (Var1) and column name (Var2)
barsM <- reshape2::melt(bars)

# Now look at the data
head(barsM)

##   Var1 Var2 value
## 1 Cat1    A     4
## 2 Cat2    A     5
## 3 Cat3    A     4
## 4 Cat4    A     4
## 5 Cat5    A     7
## 6 Cat1    B     3

Now create a tiled plot

library(tidyverse)
# Heatmapped table: one tile per (Var1, Var2) cell, shaded by `value`
# and labelled with the value itself.
ggplot(barsM, aes(Var1, Var2)) + # x and y axes => Var1 and Var2
  geom_tile(aes(fill = value)) + # background colours are mapped according to the value column
  # geom_text() has no fill aesthetic, so only the label is mapped here
  # (mapping fill as well triggers an "Ignoring unknown aesthetics" warning)
  geom_text(aes(label = round(value, 2))) +
  scale_fill_gradient2(low = "white",
                       high = "forestgreen")

Here is another example from the textbook

## FIG 2.5
heat <- read.delim("/Users/hsmalley/Downloads/swdFig2.5.txt", header = TRUE)

# Use the first column as row names, then drop it from the data
rownames(heat) <- heat[, 1]
heat <- heat[, -1]

# Long format: one row per cell of the table
headM <- melt(as.matrix(heat))

# Relevel categories in reverse
headM$Var1 <- factor(
  headM$Var1,
  levels = rev(levels(headM$Var1)),
  ordered = TRUE
)

ggplot(data = headM, mapping = aes(x = Var1, y = Var2)) +
  # background colours are mapped according to the value column
  geom_tile(mapping = aes(fill = value)) +
  geom_text(mapping = aes(label = round(value, 2))) +
  scale_fill_gradient2(low = "white", high = "forestgreen") +
  # swap the axes so Var1 runs down the side of the table
  coord_flip()

## NOTE: You can also use the gather function in tidyr to do the manipulation
# Re-read the file because `heat` was modified above
heat <- read.delim("/Users/hsmalley/Downloads/swdFig2.5.txt", header = TRUE)

# Stack every column except X into type/percent pairs
headG <- gather(heat, "type", "percent", -X)

2) Simple Text

Here is a simple example where text would suffice:

## RECREATE FIG2.2
mothers<-read.csv("https://raw.githubusercontent.com/adamribaudo/storytelling-with-data-ggplot/master/data/FIG0202-3.csv",
                header=TRUE, 
                stringsAsFactors = FALSE)

# geom_col used when counts instead of indiv obs
# Year is converted to a factor so each year gets its own discrete bar
ggplot(mothers, aes(x=as.factor(Year), y=Value))+
  geom_col()+
  xlab("Year")+
  ylab("Percent")+
  theme_minimal()+
  ggtitle("Percent Stay-at-Home") # title typo ("Pecent") corrected

3) Scatter Plot

Import the data

## RECREATE FIG2.6 and FIG2.7
# Read the raw data; text columns are kept as character, not factors
drive <- read.csv(
  "https://raw.githubusercontent.com/adamribaudo/storytelling-with-data-ggplot/master/data/FIG0206-7.csv",
  header = TRUE,
  stringsAsFactors = FALSE
)

# Inspect the column types
str(drive)

## 'data.frame':    36 obs. of  2 variables:
##  $ Miles.Driven : chr  "1,100" "1,177" "1,239" "1,294" ...
##  $ Cost.Per.Mile: chr  "$2.40" "$2.80" "$2.20" "$2.50" ...

Remove Currency Symbols

# Strip "$" and "," from the text columns, then convert them to numbers.
# The cleaned values go into new columns; the originals are left untouched.
drive[["Cost.Per.Mile2"]] <- as.numeric(
  gsub("[\\$,]", "", drive[["Cost.Per.Mile"]])
)
drive[["Miles.Driven2"]] <- as.numeric(
  gsub("[\\$,]", "", drive[["Miles.Driven"]])
)

str(drive)

## 'data.frame':    36 obs. of  4 variables:
##  $ Miles.Driven  : chr  "1,100" "1,177" "1,239" "1,294" ...
##  $ Cost.Per.Mile : chr  "$2.40" "$2.80" "$2.20" "$2.50" ...
##  $ Cost.Per.Mile2: num  2.4 2.8 2.2 2.5 1.9 2 2.2 1.35 2 1.3 ...
##  $ Miles.Driven2 : num  1100 1177 1239 1294 1378 ...

Create above average variable

# Flag each row whose cost per mile is at or above the overall mean
above <- mutate(
  drive,
  above = Cost.Per.Mile2 >= mean(Cost.Per.Mile2)
)

Plot your data

# Scatterplot of cost per mile against miles driven, coloured by whether each
# point is at or above the mean cost per mile.
ggplot(above, aes(Miles.Driven2, Cost.Per.Mile2, color=above))+
  geom_point()+
  # Draw the reference line at the computed mean rather than a hard-coded
  # value, so it always agrees with the `above` flag and the marker below
  geom_hline(yintercept=mean(above$Cost.Per.Mile2), linetype="dashed", size=1)+
  # annotate() draws the average marker once; mapping the means through aes()
  # would recycle them and overplot one identical point per data row
  annotate("point",
           x=mean(above$Miles.Driven2),
           y=mean(above$Cost.Per.Mile2),
           color="black", size=5)+
  xlab("Miles driven per month")+
  ylab("Cost per mile")+
  theme_minimal()+
  theme(legend.position = "none")+
  ggtitle("Scatterplot: Miles Driven vs Cost Per Mile")

4) Line Graph

In this example, we need to use a date type variable. This can be done with the package lubridate.

## RECREATE FIG2.8
#install.packages("lubridate")
library(lubridate)

## Warning: package 'lubridate' was built under R version 3.6.2

## 
## Attaching package: 'lubridate'

## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union

lineEx <- read.csv(
  "https://raw.githubusercontent.com/adamribaudo/storytelling-with-data-ggplot/master/data/FIG0209.csv",
  header = TRUE,
  stringsAsFactors = FALSE
)

# Combine Year and Month with day 1 and parse the result as a date
lineDate <- mutate(lineEx, date = ymd(paste(Year, Month, 1)))

ggplot(data = lineDate, mapping = aes(x = date, y = Avg)) +
  geom_line() +
  theme_minimal()

5) Slope Graph

Warning: These data are not tidy. Use the gather function!

## RECREATE FIG2.10

slope <- read.csv(
  "https://raw.githubusercontent.com/adamribaudo/storytelling-with-data-ggplot/master/data/FIG0210-11.csv",
  header = TRUE,
  stringsAsFactors = FALSE
)

# needs some transformation
names(slope) <- c("Item", "2014", "2015")

# Stack the two year columns into year/percent pairs, one row per Item and year
slopeT <- gather(slope, "year", "percent", -Item)

# Drop the "%" sign and rescale to a proportion
slopeT$percent2 <- as.numeric(gsub("[\\%]", "", slopeT$percent)) / 100

# One line per Item connecting its values across years
ggplot(data = slopeT, mapping = aes(x = year, y = percent2, group = Item)) +
  geom_line() +
  theme_minimal()

6) Bar Graphs

Let’s create some fake data for this:

# This is how the book does it
# Binding a character column onto the counts turns the whole matrix into text.
bars <- cbind(
  paste0("Cat", 1:5),
  rbind(
    c(4, 3, 1, 1),
    c(5, 6, 3, 1),
    c(4, 5, 1, 2),
    c(4, 5, 3, 2),
    c(7, 6, 5, 3)
  )
)

colnames(bars) <- c("Category", LETTERS[1:4])

# But we need to make it tidy
# `bars` is a character matrix, so keep its columns as plain character vectors
# (stringsAsFactors = FALSE). On R < 4.0 the default would create factors,
# which makes gather() warn about dropped attributes when it stacks them.
barsT<-as.data.frame(bars, stringsAsFactors = FALSE)%>%
  gather("Letter", "Count", 2:5)

# The counts are still text at this point; convert them back to numbers
barsT$Count<-as.numeric(barsT$Count)

There are many different kinds of bar graphs

A. Vanilla (Vertical)

# Bar heights are taken directly from Count
ggplot(data = barsT, mapping = aes(x = Category, y = Count)) +
  geom_col()

B. Vertical Stacked Bar Chart

# Fill colour splits each bar into one segment per Letter
ggplot(data = barsT, mapping = aes(x = Category, y = Count, fill = Letter)) +
  geom_col()

B.2 Vertical Stacked Bar Chart as Percentages

# position = "fill" rescales every bar to the same total height
ggplot(data = barsT, mapping = aes(x = Category, y = Count, fill = Letter)) +
  geom_col(position = "fill")

C. Vertical Side-by-side Bar Chart

# position = "dodge" places the Letter bars next to each other
ggplot(data = barsT, mapping = aes(x = Category, y = Count, fill = Letter)) +
  geom_col(position = "dodge")

D. Horizontal Bar Chart

Any of the plots above can be made horizontal!

# Same bars as the vertical version, with the axes swapped
ggplot(data = barsT, mapping = aes(x = Category, y = Count)) +
  geom_col() +
  coord_flip()

II: Esquisse

Since we are learning how to make graphics for the first time, I suggest using this helpful tool!

#install.packages("esquisse")
library(esquisse)

Please load the data for the examples that I have pre-processed

mothers

## MOTHER EXAMPLE
# Text columns are kept as character, not factors
mothers <- read.csv(
  "https://raw.githubusercontent.com/adamribaudo/storytelling-with-data-ggplot/master/data/FIG0202-3.csv",
  header = TRUE,
  stringsAsFactors = FALSE
)

drive

## MILES DRIVEN
drive <- read.csv(
  "https://raw.githubusercontent.com/adamribaudo/storytelling-with-data-ggplot/master/data/FIG0206-7.csv",
  header = TRUE,
  stringsAsFactors = FALSE
)

# Strip "$" and "," from the text columns and store numeric copies
drive[["Cost.Per.Mile2"]] <- as.numeric(
  gsub("[\\$,]", "", drive[["Cost.Per.Mile"]])
)
drive[["Miles.Driven2"]] <- as.numeric(
  gsub("[\\$,]", "", drive[["Miles.Driven"]])
)

# Flag rows whose cost per mile is at or above the overall mean
above <- mutate(
  drive,
  above = Cost.Per.Mile2 >= mean(Cost.Per.Mile2)
)

lineDate

## PASSPORT WAIT
#install.packages("lubridate")
library(lubridate)

lineEx <- read.csv(
  "https://raw.githubusercontent.com/adamribaudo/storytelling-with-data-ggplot/master/data/FIG0209.csv",
  header = TRUE,
  stringsAsFactors = FALSE
)

# Combine Year and Month with day 1 and parse the result as a date
lineDate <- mutate(lineEx, date = ymd(paste(Year, Month, 1)))

slopeT

## EMPLOYEE FEEDBACK
slope <- read.csv(
  "https://raw.githubusercontent.com/adamribaudo/storytelling-with-data-ggplot/master/data/FIG0210-11.csv",
  header = TRUE,
  stringsAsFactors = FALSE
)

# needs some transformation
names(slope) <- c("Item", "2014", "2015")

# Stack the two year columns into year/percent pairs, one row per Item and year
slopeT <- gather(slope, "year", "percent", -Item)

# Drop the "%" sign and rescale to a proportion
slopeT$percent2 <- as.numeric(gsub("[\\%]", "", slopeT$percent)) / 100

III: Let’s Practice

### MEALS
meals <- read.delim("/Users/hsmalley/Downloads/meals.txt", header = TRUE)

# Inspect the column types
str(meals)

## 'data.frame':    10 obs. of  2 variables:
##  $ Campaign.Year: int  2010 2011 2012 2013 2014 2015 2016 2017 2018 2019
##  $ Meals.Served : int  40139 127020 168193 153115 202102 232897 277912 205350 233389 232797

# Single-column heatmapped table: one tile per campaign year, placed at x = 1
ggplot(data = meals, mapping = aes(y = Campaign.Year, x = 1)) + # x and y axes
  # background colours are mapped according to the Meals.Served column
  geom_tile(mapping = aes(fill = Meals.Served)) +
  geom_text(mapping = aes(label = round(Meals.Served, 2))) +
  scale_fill_gradient2(low = "white", high = "forestgreen") +
  theme_minimal()

# Bar heights are taken directly from Meals.Served
ggplot(data = meals, mapping = aes(x = Campaign.Year, y = Meals.Served)) +
  geom_col() +
  theme_minimal()

# Meals served per campaign year as a line
ggplot(data = meals, mapping = aes(x = Campaign.Year, y = Meals.Served)) +
  geom_line() +
  theme_minimal()