This HTML document contains all the code used in the SAIG ggplot short course. Please feel free to copy the code chunks and try them on your own computer. The answers to the exercises are given at the end.

Set up for ggplot2 package

Install the package:

install.packages('ggplot2', dependencies = TRUE)

Load ggplot2 package

library(ggplot2)

Dataset

In this short course, we will mainly work with two datasets: mtcars and diamonds. mtcars comprises fuel consumption and 10 aspects of automobile design and performance for 32 automobiles. It is included in the base R package datasets. diamonds is included in the ggplot2 package and contains information, such as color, carat and price, on almost 54,000 diamonds.

Get the dataset information:

# Open the help page describing each dataset
help("mtcars")
help("diamonds")

Look at the data:

# Preview the first six rows of mtcars (six is head()'s default)
head(mtcars, n = 6)
##                    mpg cyl disp  hp drat    wt  qsec vs am gear carb
## Mazda RX4         21.0   6  160 110 3.90 2.620 16.46  0  1    4    4
## Mazda RX4 Wag     21.0   6  160 110 3.90 2.875 17.02  0  1    4    4
## Datsun 710        22.8   4  108  93 3.85 2.320 18.61  1  1    4    1
## Hornet 4 Drive    21.4   6  258 110 3.08 3.215 19.44  1  0    3    1
## Hornet Sportabout 18.7   8  360 175 3.15 3.440 17.02  0  0    3    2
## Valiant           18.1   6  225 105 2.76 3.460 20.22  1  0    3    1
# Preview the first six rows of diamonds
head(diamonds, n = 6)
## # A tibble: 6 x 10
##   carat cut       color clarity depth table price     x     y     z
##   <dbl> <ord>     <ord> <ord>   <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1 0.23  Ideal     E     SI2      61.5    55   326  3.95  3.98  2.43
## 2 0.21  Premium   E     SI1      59.8    61   326  3.89  3.84  2.31
## 3 0.23  Good      E     VS1      56.9    65   327  4.05  4.07  2.31
## 4 0.290 Premium   I     VS2      62.4    58   334  4.2   4.23  2.63
## 5 0.31  Good      J     SI2      63.3    58   335  4.34  4.35  2.75
## 6 0.24  Very Good J     VVS2     62.8    57   336  3.94  3.96  2.48

Scatter plot

The data argument takes the name of our data frame. In aes(), specify the variables you want to plot on the x-axis and y-axis respectively. geom_point() then draws the scatter plot.

# Basic scatter plot: mpg on the x-axis, disp on the y-axis
ggplot(mtcars, aes(x = mpg, y = disp)) +
  geom_point()

To change size of the points:

# Scatter plot with customizations
# Example 1: draw every point at the same fixed size
ggplot(mtcars, aes(x = mpg, y = disp)) +
  geom_point(size = 3)

Change size based on the scale of another variable hp, gross horsepower.

# Map point size to hp, so size is driven by the data rather than a constant
ggplot(mtcars, aes(x = mpg, y = disp, size = hp)) +
  geom_point()

Scatter plot for regression

We can add regression line through geom_smooth(), specify linear model as the method: method = 'lm'

# Scatter plot with a linear-model ("lm") smoother layered on top
ggplot(mtcars, aes(x = mpg, y = disp)) +
  geom_point() +
  geom_smooth(method = "lm")

Change the color of confidence band:

## Same regression plot as on the slide, with the confidence band filled yellow
ggplot(mtcars, aes(x = mpg, y = disp)) +
  geom_point() +
  geom_smooth(method = "lm", fill = "yellow")

If you don’t want the confidence band:

## Regression line only: se = FALSE switches the confidence band off
ggplot(mtcars, aes(x = mpg, y = disp)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE)

Linegraph

Draw a scatter plot first (the first two lines of code are the same as before), then connect adjacent points, in order of the x variable, via geom_line().

# Points plus a line layer joining them
ggplot(mtcars, aes(x = mpg, y = disp)) +
  geom_point() +
  geom_line()

Compare with base R plotting functions:

In this case, if you want to connect the adjacent points, please sort the dataset before plotting. Here we use order function to do that.

## Base R comparison: two scatter plots side by side that differ only in the
## order in which the points are joined.
## par() returns the previous settings, so keep them to restore the layout
## afterwards; otherwise later base plots inherit the 1 x 2 layout.
op <- par(mfcol = c(1, 2))

## Left panel: connect points that are adjacent in the dataset
plot(mtcars$mpg, mtcars$disp, pch = 20, xlab = 'mpg',
     ylab = 'engine size (disp)', las = 1)
lines(mtcars$mpg, mtcars$disp)

## Right panel: connect points that are adjacent in the x-axis of the
## scatter plot, by visiting the rows in increasing order of mpg
plot(mtcars$mpg, mtcars$disp, pch = 20, xlab = 'mpg',
     ylab = 'engine size (disp)', las = 1)
od <- order(mtcars$mpg)
lines(mtcars$mpg[od], mtcars$disp[od])

## Restore the graphics layout that was active before this chunk
par(op)

Bargraph

Only x needs to be specified in aes(); geom_bar() draws bars for the stat that you specify (here, the count of observations). We want one bar per value of cyl, which is the number of cylinders. Since bar graphs are generally used for categorical variables, be sure to use the as.factor function to convert cyl to a categorical variable before plotting:

# Bar graph of cyl treated as categorical: one bar per cyl value,
# bar height given by the row count
ggplot(mtcars, aes(x = as.factor(cyl))) +
  geom_bar(stat = "count")

Customization via color:

If you want to use different colors to distinguish those three bars, you can change colors in aes() by adding color = as.factor(cyl).

The difference of color and fill argument is displayed below.

For bargraph: use fill

# Bar graphs: color vs. fill
# Version 1: cyl mapped to the color aesthetic
ggplot(mtcars, aes(x = as.factor(cyl), color = as.factor(cyl))) +
  geom_bar(stat = "count")

# Version 2: cyl mapped to the fill aesthetic
ggplot(mtcars, aes(x = as.factor(cyl), fill = as.factor(cyl))) +
  geom_bar(stat = "count")

For scatter plot: use color

# Scatter plots: color vs. fill
# Version 1: cyl mapped to the color aesthetic
ggplot(mtcars, aes(x = mpg, y = disp, color = as.factor(cyl))) +
  geom_point()

# Version 2: cyl mapped to the fill aesthetic, shown for contrast
ggplot(mtcars, aes(x = mpg, y = disp, fill = as.factor(cyl))) +
  geom_point()

If you don’t like the default color scheme, add the layer + scale_fill_brewer(palette = "name") to change.

# Example 3: replace the default fill colors with R Color Brewer palettes
# Palette "Set1"
ggplot(mtcars, aes(x = as.factor(cyl), fill = as.factor(cyl))) +
  geom_bar(stat = "count") +
  scale_fill_brewer(palette = "Set1")

# Palette "Pastel2"
ggplot(mtcars, aes(x = as.factor(cyl), fill = as.factor(cyl))) +
  geom_bar(stat = "count") +
  scale_fill_brewer(palette = "Pastel2")

Customize labels and legends

# Narrower bars, "Dark2" palette, and readable axis / legend titles
ggplot(mtcars, aes(x = as.factor(cyl), fill = as.factor(cyl))) +
  geom_bar(stat = "count", width = 0.75) +
  scale_fill_brewer(palette = "Dark2") +
  labs(x = "Cylinder", fill = "Cylinder")

# Same plot with the legend moved to the bottom
ggplot(mtcars, aes(x = as.factor(cyl), fill = as.factor(cyl))) +
  geom_bar(stat = "count", width = 0.75) +
  scale_fill_brewer(palette = "Dark2") +
  labs(x = "Cylinder", fill = "Cylinder") +
  theme(legend.position = "bottom")

Exercise with Diamond dataset

Can you create the plots below on your own?

Boxplot

Basic boxplot:

# Boxplot of mpg for each cyl value, with boxes filled by cyl
ggplot(mtcars, aes(x = as.factor(cyl), y = mpg, fill = as.factor(cyl))) +
  geom_boxplot()

Customize on colors and labels:

# Same boxplot with the "Set1" palette, explicit titles, and a bottom legend
ggplot(mtcars, aes(x = as.factor(cyl), y = mpg, fill = as.factor(cyl))) +
  geom_boxplot() +
  scale_fill_brewer(palette = "Set1") +
  labs(x = "Cylinder", y = "Miles per Gallon", fill = "Cylinder") +
  theme(legend.position = "bottom")

Histogram

Add the layer geom_histogram(). Commonly used knobs: binwidth and bins (number of bins). You can also supply a numeric vector giving the bin boundaries through breaks, which overrides binwidth and bins if used.

# Histogram of mpg with a bin width of 5, gray bars outlined in black
ggplot(mtcars, aes(x = mpg)) +
  geom_histogram(binwidth = 5, fill = "gray", color = "black")

# Base R counterpart, for comparison
hist(mtcars$mpg, nclass = 5)

Histogram with density overlay:

Add geom_density layer.

# Histogram drawn on the density scale with a density curve overlaid.
# after_stat(density) replaces the deprecated ..density.. notation; it maps
# the bar height to the density computed by the histogram stat.
ggplot(data = mtcars, aes(x = mpg)) +
  geom_histogram(aes(y = after_stat(density)), binwidth = 5,
                 fill = 'gray', color = 'black') +
  geom_density(alpha = 0.1, fill = "blue")

Empirical density curve

Use geom_density:

# Density curve of mpg with a light purple fill
ggplot(mtcars, aes(x = mpg)) +
  geom_density(fill = "purple", alpha = 0.1)

Add a vertical line:

# Density curve of mpg with a dashed blue vertical line at the mean of mpg
ggplot(mtcars, aes(x = mpg)) +
  geom_density(fill = "purple", alpha = 0.1) +
  geom_vline(xintercept = mean(mtcars$mpg), linetype = "dashed", color = "blue")

QQ plot

Let’s check whether mpg is normally distributed. It does not look normal, since the points depart from the QQ line.

# QQ plot of mpg: sample quantiles as points, plus the QQ reference line
ggplot(mtcars, aes(sample = mpg)) +
  stat_qq() +
  stat_qq_line() +
  labs(title = "Normal QQPlot")

Exercise Boxplot and Histogram

# Histogram of diamond prices on the density scale, with a density curve
# overlaid and a dashed line at the mean price.
# The data frame is diamonds: it is the dataset that has a price column, and
# no object named df has been created at this point in the document.
# after_stat(density) replaces the deprecated ..density.. notation.
ggplot(data = diamonds, aes(x = price)) +
  geom_histogram(aes(y = after_stat(density)), binwidth = 500,
                 fill = 'white', color = 'blue') +
  geom_density(alpha = 0.2, fill = "red") +
  geom_vline(xintercept = mean(diamonds$price), color = 'red',
             linetype = 'dashed')

Multiple panels in one plot

library(gridExtra)
#++++++++++++++++++++++++++++++++++++++++++++++++++
# GGPLOT GRAPHIC ON SLIDE
#++++++++++++++++++++++++++++++++++++++++++++++++++
# Build six plots as objects, then lay them out together in a single figure.

# g1: scatter plot
g1 <- ggplot(mtcars, aes(x = mpg, y = disp)) +
  geom_point()

# g2: scatter plot with the points joined by a line
g2 <- ggplot(mtcars, aes(x = mpg, y = disp)) +
  geom_point() +
  geom_line()

# g3: boxplot of mpg by cyl
g3 <- ggplot(mtcars, aes(x = as.factor(cyl), y = mpg, fill = as.factor(cyl))) +
  geom_boxplot() +
  scale_fill_brewer(palette = "Set1") +
  labs(x = "Cylinder", y = "Miles per Gallon", fill = "Cylinder") +
  theme(legend.position = "bottom")

# g4: bar graph of cyl counts
g4 <- ggplot(mtcars, aes(x = as.factor(cyl), fill = as.factor(cyl))) +
  geom_bar(stat = "count", width = 0.75) +
  scale_fill_brewer(palette = "Dark2") +
  labs(x = "Cylinder", fill = "Cylinder") +
  theme(legend.position = "bottom")

# g5: histogram of mpg
g5 <- ggplot(mtcars, aes(x = mpg)) +
  geom_histogram(binwidth = 5, fill = "gray", color = "black")

# g6: QQ plot of mpg (points only)
g6 <- ggplot(mtcars, aes(sample = mpg)) +
  stat_qq() +
  labs(title = "Normal QQPlot")

# Arrange the six plots in a grid of 2 rows by 3 columns
grid.arrange(g1, g2, g3, g4, g5, g6, nrow = 2, ncol = 3)

Show panels on different levels of one variable in one plot

# Multiple graphs: one panel row per cyl value, points colored by cyl
ggplot(mtcars, aes(x = mpg, y = disp, color = as.factor(cyl))) +
  geom_point() +
  facet_grid(cyl ~ .)

Change theme

# Themes in ggplot
# Base plot stored as g: size mapped to am, color mapped to cyl. Each theme
# below is added to g in turn. Mapping size to a factor triggers the warning
# echoed after every plot.
g <- ggplot(
  mtcars,
  aes(x = mpg, y = disp, size = as.factor(am), color = as.factor(cyl))
) +
  geom_point() +
  labs(y = "displacement", color = "cylinders", size = "trans type")
g
## Warning: Using size for a discrete variable is not advised.

g + theme_light()
## Warning: Using size for a discrete variable is not advised.

g + theme_dark()
## Warning: Using size for a discrete variable is not advised.

## Further themes to explore
g + theme_bw()
## Warning: Using size for a discrete variable is not advised.

g + theme_minimal()
## Warning: Using size for a discrete variable is not advised.

g + theme_classic()
## Warning: Using size for a discrete variable is not advised.

Multiple Lines in Linegraph and Data structure Example

To use the melt function, you need the reshape2 package.

library(reshape2)
# Step 1: Melt the dataframe to a long structure
df <- mtcars
# Row index used as the x-axis; derived from the data rather than hard-coded
df$rownum <- seq_len(nrow(df))
# Select the measured columns by name (the original used positions 1, 3, 4, 12)
# and melt them so each row holds one (rownum, variable, value) triple
df2 <- melt(df[, c("mpg", "disp", "hp", "rownum")], id.vars = 'rownum')

# Step 2: Make plot using new dataframe structure: one colored line per variable
ggplot(data = df2, aes(x = rownum, y = value, color = variable)) +
  geom_point() +
  geom_line() +
  labs(x = 'Car Index', y = 'Value', color = 'Variable')

To do the same with base R plotting functions:

matplot plots each column of a matrix/dataframe in one plot.

## matrix plot: one line per selected column of df, plotted against row index
## Columns are selected by name rather than by position.
matplot(df[, c("mpg", "disp", "hp")], type = "l",
        ylab = 'Value', xlab = 'Car Index')
## Three series are drawn, so the legend lists exactly three line types and
## colors (matplot's defaults start at lty 1 and col 1).
legend('topright', c('mpg', 'disp', 'hp'), lty = 1:3, col = 1:3)

Heatmap

# Heatmap Intro Example
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++
# 3 x 3 matrix of values, filled row by row (TRUE rather than the
# reassignable shorthand T)
mat <- matrix(c(1, 2, 5, 2, 2, 3, 5, 1, 1), 3, 3, byrow = TRUE)
x <- c('Row 1', 'Row 2', 'Row 3')
y <- c('Col 1', 'Col 2', 'Col 3')
# expand.grid() varies X fastest, which matches the column-major order of
# as.vector(mat), so every (X, Y) pair gets its own matrix cell as Z
data <- expand.grid(X = x, Y = y)
data$Z <- as.vector(mat)

# One tile per (Y, X) pair, filled on an orange-to-maroon gradient by Z
ggplot(data = data, aes(x = Y, y = X, fill = Z)) +
  geom_tile(color = 'white') +
  scale_fill_gradient(low = "orange", high = "maroon") +
  labs(x = '', y = '', fill = '')

# Heatmap of the mtcars data
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++
# Center and scale every column so the variables are comparable in the plot,
# then convert the result of scale() back to a data frame.
df <- as.data.frame(scale(mtcars, center = TRUE, scale = TRUE))
df$rowname <- rownames(mtcars)
# The plot needs the melted structure: one row per (rowname, variable) pair
df2 <- melt(df, id.vars = "rowname")

# One tile per variable / car, shaded from white to black by value
ggplot(df2, aes(x = variable, y = rowname)) +
  geom_tile(aes(fill = value), color = "white") +
  scale_fill_gradient(low = "white", high = "black") +
  labs(x = "", y = "", fill = "Amount") +
  theme_minimal()

Spatial map

This link gives a detailed tutorial on this: https://remiller1450.github.io/s230s19/Intro_maps.html

Word Cloud

# install.packages('wordcloud', dependencies=TRUE)
library(wordcloud)
## Loading required package: RColorBrewer
# Load the doc by term summary
#words <- read.csv('Words_Freq.csv')
#words <- read.delim('TEX.txt')
# Inline table of terms and their frequencies, used instead of the files above
words <- data.frame(
  words = c("CS", "STAT", "Math", 'Coding', "Debug", "ggplot"),
  freq = c(1, 2, 3, 6, 3, 4)
)
# Word cloud
# The term column is named `words`; the previous `words$word` only resolved
# through `$` partial matching. NOTE(review): if you switch to one of the files
# above, check its column names and adjust accordingly.
# min.freq = 2 leaves out terms with a frequency below 2 ("CS" here).
wordcloud(words = words$words, freq = words$freq, min.freq = 2,
          max.words = 200, random.order = FALSE, rot.per = 0,
          colors = brewer.pal(8, "Dark2"))

Export and Save Images

Setting the working directory is key here!!! Make sure you have the right folder setup because images will be saved there!

# PNG export: open a 3 x 2 inch, 300 dpi png device, draw the base R scatter
# plot into it, then close the device.
png("scatterplot22.png", width = 3, height = 2, units = "in", res = 300)
par(mar = c(4, 4, 1, 1))
plot(mtcars$mpg, mtcars$disp, pch = 20, las = 1, xlab = "mpg", ylab = "disp")
dev.off()

# PDF variant: the pdf() / dev.off() calls are left commented out, so as
# written the plot below is drawn on the current graphics device.
# pdf('scatterplot.pdf',width=3,height=2.5)
par(mar = c(4, 4, 1, 1))
plot(mtcars$mpg, mtcars$disp, pch = 20, las = 1, xlab = "mpg", ylab = "disp")
# dev.off()