This HTML document contains all the code used in the SAIG ggplot short course. Please feel free to copy the code chunks and try them on your own computer. Answers to the exercises are given at the end.
Install the package:
install.packages('ggplot2', dependencies = TRUE)
Load ggplot2 package
library(ggplot2)
In this short course, we will mainly work with two datasets: mtcars and diamonds. mtcars comprises fuel consumption and 10 aspects of automobile design and performance for 32 automobiles. It is included in the base package datasets. diamonds is included in the ggplot2 package and contains information, such as color, carat and price, on almost 54,000 diamonds.
Get the dataset information:
# Open the help pages describing each dataset's columns.
help("mtcars")
help("diamonds")
Look at the data:
# Print the first six rows of each dataset (echoed output shown below each call).
head(mtcars, n = 6)
## mpg cyl disp hp drat wt qsec vs am gear carb
## Mazda RX4 21.0 6 160 110 3.90 2.620 16.46 0 1 4 4
## Mazda RX4 Wag 21.0 6 160 110 3.90 2.875 17.02 0 1 4 4
## Datsun 710 22.8 4 108 93 3.85 2.320 18.61 1 1 4 1
## Hornet 4 Drive 21.4 6 258 110 3.08 3.215 19.44 1 0 3 1
## Hornet Sportabout 18.7 8 360 175 3.15 3.440 17.02 0 0 3 2
## Valiant 18.1 6 225 105 2.76 3.460 20.22 1 0 3 1
head(diamonds, n = 6)
## # A tibble: 6 x 10
## carat cut color clarity depth table price x y z
## <dbl> <ord> <ord> <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1 0.23 Ideal E SI2 61.5 55 326 3.95 3.98 2.43
## 2 0.21 Premium E SI1 59.8 61 326 3.89 3.84 2.31
## 3 0.23 Good E VS1 56.9 65 327 4.05 4.07 2.31
## 4 0.290 Premium I VS2 62.4 58 334 4.2 4.23 2.63
## 5 0.31 Good J SI2 63.3 58 335 4.34 4.35 2.75
## 6 0.24 Very Good J VVS2 62.8 57 336 3.94 3.96 2.48
data argument takes the name of our data frame; In aes(), specify the variables you want to plot on the x-axis and y-axis respectively; geom_point() will give our scatter plot.
# Basic scatter plot: disp on the y-axis against mpg on the x-axis.
ggplot(mtcars, aes(mpg, disp)) +
  geom_point()
To change size of the points:
# Scatterplot with customizations
# Example 1: a constant `size` set outside aes() applies to every point
ggplot(mtcars, aes(mpg, disp)) +
  geom_point(size = 3)
Change size based on the scale of another variable hp, gross horsepower.
# Mapping `size` inside aes() scales each point by that car's hp value.
ggplot(mtcars, aes(mpg, disp, size = hp)) +
  geom_point()
We can add regression line through geom_smooth(), specify linear model as the method: method = 'lm'
# Scatterplot with a fitted linear-model line (confidence band shown by default)
ggplot(mtcars, aes(mpg, disp)) +
  geom_point() +
  geom_smooth(method = "lm")
Change the color of confidence band:
## To match the slide exactly: `fill` colours the confidence band
ggplot(mtcars, aes(mpg, disp)) +
  geom_point() +
  geom_smooth(method = "lm", fill = "yellow")
If you don’t want the confidence band:
## Regression line only: se = FALSE suppresses the confidence band
ggplot(mtcars, aes(mpg, disp)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE)
Draw a scatter plot first (the first two lines of code are the same as before), then add geom_line() to connect the points in order of their x value.
# Points plus a line layer joining them.
ggplot(mtcars, aes(mpg, disp)) +
  geom_point() +
  geom_line()
Compare with the base R plotting functions:
Unlike geom_line(), lines() connects the points in the order they appear in the dataset. If you want to connect points that are adjacent along the x-axis, sort the data before plotting. Here we use the order function to do that.
## Two base-graphics panels side by side. par() returns the previous settings,
## which are kept so the layout can be restored at the end of the chunk.
old_par <- par(mfcol = c(1, 2))
## Left panel: connect points that are adjacent in the dataset
plot(mtcars$mpg, mtcars$disp, pch = 20, xlab = "mpg",
     ylab = "engine size (disp)", las = 1)
lines(mtcars$mpg, mtcars$disp)
## Right panel: connect points that are adjacent in the x-axis of the scatter plot
plot(mtcars$mpg, mtcars$disp, pch = 20, xlab = "mpg",
     ylab = "engine size (disp)", las = 1)
od <- order(mtcars$mpg)  # row indices that sort the cars by increasing mpg
lines(mtcars$mpg[od], mtcars$disp[od])
## Restore the previous layout so later base plots are not drawn half-width.
par(old_par)
Only x needs to be specified in aes(); geom_bar() draws bars using the stat that you specify. Here stat='count' makes the height of each bar the number of cars with each value of cyl, the number of cylinders. Since bar graphs are mainly used for categorical variables, be sure to use the as.factor function to convert cyl to a categorical variable before plotting:
# Bar graph of cylinder counts, with cyl treated as a categorical variable
ggplot(mtcars, aes(as.factor(cyl))) +
  geom_bar(stat = "count")
If you want to use different colors to distinguish those three bars, you can change colors in aes() by adding color = as.factor(cyl).
The difference of color and fill argument is displayed below.
For bargraph: use fill
# Bar graphs -- color vs. fill
# First plot maps `color`, second maps `fill`, to the same factor.
ggplot(mtcars, aes(as.factor(cyl), color = as.factor(cyl))) +
  geom_bar(stat = "count")
ggplot(mtcars, aes(as.factor(cyl), fill = as.factor(cyl))) +
  geom_bar(stat = "count")
For scatter plot: use color
# Scatter plots -- color vs. fill
# Same comparison as for the bars: `color` first, then `fill`.
ggplot(mtcars, aes(mpg, disp, color = as.factor(cyl))) +
  geom_point()
ggplot(mtcars, aes(mpg, disp, fill = as.factor(cyl))) +
  geom_point()
If you don’t like the default color scheme, add the layer + scale_fill_brewer(palette = "name") to change.
# Example 3: Customizations with color -- R Color Brewer palettes
# Palette "Set1"
ggplot(mtcars, aes(as.factor(cyl), fill = as.factor(cyl))) +
  geom_bar(stat = "count") +
  scale_fill_brewer(palette = "Set1")
# Palette "Pastel2"
ggplot(mtcars, aes(as.factor(cyl), fill = as.factor(cyl))) +
  geom_bar(stat = "count") +
  scale_fill_brewer(palette = "Pastel2")
# Palette "Dark2", narrower bars, and custom axis/legend titles
ggplot(mtcars, aes(as.factor(cyl), fill = as.factor(cyl))) +
  geom_bar(stat = "count", width = 0.75) +
  scale_fill_brewer(palette = "Dark2") +
  labs(x = "Cylinder", fill = "Cylinder")
# Same plot with the legend moved below the panel
ggplot(mtcars, aes(as.factor(cyl), fill = as.factor(cyl))) +
  geom_bar(stat = "count", width = 0.75) +
  scale_fill_brewer(palette = "Dark2") +
  labs(x = "Cylinder", fill = "Cylinder") +
  theme(legend.position = "bottom")
Can you create the plots below on your own?
Basic boxplot:
# One box of mpg per cylinder group, filled by the same grouping.
ggplot(mtcars, aes(as.factor(cyl), mpg, fill = as.factor(cyl))) +
  geom_boxplot()
Customize on colors and labels:
# Boxplot with a Brewer palette, readable titles, and the legend at the bottom.
ggplot(mtcars, aes(as.factor(cyl), mpg, fill = as.factor(cyl))) +
  geom_boxplot() +
  scale_fill_brewer(palette = "Set1") +
  labs(x = "Cylinder", y = "Miles per Gallon", fill = "Cylinder") +
  theme(legend.position = "bottom")
Add the layer geom_histogram(). Commonly used arguments are binwidth (the width of each bin) and bins (the number of bins). You can also supply a numeric vector giving the bin boundaries through breaks, which overrides binwidth and bins if used.
# Histogram of mpg with bins 5 units wide
ggplot(mtcars, aes(mpg)) +
  geom_histogram(binwidth = 5, fill = "gray", color = "black")
# Compare with the base R version
hist(mtcars$mpg, nclass = 5)
Histogram with density overlay:
Add geom_density layer.
# Histogram on the density scale with a kernel density curve on top.
# after_stat(density) replaces the deprecated `..density..` notation; it maps y
# to the density computed by the histogram stat instead of the raw count, so
# the bars and the density curve share the same vertical scale.
ggplot(data = mtcars, aes(x = mpg)) +
  geom_histogram(aes(y = after_stat(density)), binwidth = 5,
                 fill = "gray", color = "black") +
  geom_density(alpha = 0.1, fill = "blue")
#### Empirical density curve
Use geom_density:
# Kernel density estimate of mpg, lightly shaded.
ggplot(mtcars, aes(mpg)) +
  geom_density(fill = "purple", alpha = 0.1)
Add a vertical line at the mean of mpg:
# Density of mpg with a dashed vertical line marking the sample mean.
ggplot(mtcars, aes(mpg)) +
  geom_density(fill = "purple", alpha = 0.1) +
  geom_vline(xintercept = mean(mtcars$mpg), linetype = "dashed", color = "blue")
Let’s check whether mpg is normally distributed or not. Looks like not normal since points are off the QQ line.
# QQ plot of mpg: sample quantiles as points plus the reference line.
ggplot(mtcars, aes(sample = mpg)) +
  stat_qq() +
  stat_qq_line() +
  labs(title = "Normal QQPlot")
# Histogram of diamond prices on the density scale, with a density curve and a
# dashed line at the mean price.
# The data frame is referenced as `diamonds` directly: no `df` has been created
# at this point in the script, so `data = df` would pick up the function
# stats::df and fail. `price` is a column of diamonds.
ggplot(data = diamonds, aes(x = price)) +
  # after_stat(density) replaces the deprecated `..density..` notation
  geom_histogram(aes(y = after_stat(density)), binwidth = 500,
                 fill = "white", color = "blue") +
  geom_density(alpha = 0.2, fill = "red") +
  geom_vline(xintercept = mean(diamonds$price), color = "red",
             linetype = "dashed")
library(gridExtra)
#++++++++++++++++++++++++++++++++++++++++++++++++++
# GGPLOT GRAPHIC ON SLIDE
# Six plots stored as objects, then laid out on a 2 x 3 grid.
#++++++++++++++++++++++++++++++++++++++++++++++++++
# g1: scatter plot
g1 <- ggplot(mtcars, aes(mpg, disp)) +
  geom_point()
# g2: scatter plot with connecting line
g2 <- ggplot(mtcars, aes(mpg, disp)) +
  geom_point() +
  geom_line()
# g3: boxplot of mpg by cylinder
g3 <- ggplot(mtcars, aes(as.factor(cyl), mpg, fill = as.factor(cyl))) +
  geom_boxplot() +
  scale_fill_brewer(palette = "Set1") +
  labs(x = "Cylinder", y = "Miles per Gallon", fill = "Cylinder") +
  theme(legend.position = "bottom")
# g4: bar graph of cylinder counts
g4 <- ggplot(mtcars, aes(as.factor(cyl), fill = as.factor(cyl))) +
  geom_bar(stat = "count", width = 0.75) +
  scale_fill_brewer(palette = "Dark2") +
  labs(x = "Cylinder", fill = "Cylinder") +
  theme(legend.position = "bottom")
# g5: histogram of mpg
g5 <- ggplot(mtcars, aes(mpg)) +
  geom_histogram(binwidth = 5, fill = "gray", color = "black")
# g6: QQ plot of mpg (points only)
g6 <- ggplot(mtcars, aes(sample = mpg)) +
  stat_qq() +
  labs(title = "Normal QQPlot")
grid.arrange(g1, g2, g3, g4, g5, g6, nrow = 2, ncol = 3)
# Multiple graphs: one panel row per value of cyl
ggplot(mtcars, aes(mpg, disp, color = as.factor(cyl))) +
  geom_point() +
  facet_grid(cyl ~ .)
# Themes in ggplot
# Build one plot object, then redraw it under several built-in themes.
g <- ggplot(
  mtcars,
  aes(mpg, disp, size = as.factor(am), color = as.factor(cyl))
) +
  geom_point() +
  labs(y = "displacement", color = "cylinders", size = "trans type")
g
## Warning: Using size for a discrete variable is not advised.
g + theme_light()
## Warning: Using size for a discrete variable is not advised.
g + theme_dark()
## Warning: Using size for a discrete variable is not advised.
## Explore other themes
g + theme_bw()
## Warning: Using size for a discrete variable is not advised.
g + theme_minimal()
## Warning: Using size for a discrete variable is not advised.
g + theme_classic()
## Warning: Using size for a discrete variable is not advised.
To use the melt function, you need the reshape2 package.
library(reshape2)
# Step 1: Melt the data frame to long format (one row per car/variable pair)
df <- mtcars
# Car index for the x-axis; derived from the data rather than hard-coded as 1:32
df$rownum <- seq_len(nrow(df))
# Columns are selected by name instead of by position (1, 3, 4, 12), so the
# selection does not depend on the column order of df.
df2 <- melt(df[, c("mpg", "disp", "hp", "rownum")], id.vars = "rownum")
# Step 2: Make plot using new dataframe structure; melt() names the new
# columns `variable` and `value`.
ggplot(data = df2, aes(x = rownum, y = value, color = variable)) +
  geom_point() +
  geom_line() +
  labs(x = "Car Index", y = "Value", color = "Variable")
To run it with intrinsic R plot function:
matplot plots each column of a matrix/dataframe in one plot.
## matrix plot: one line per column, plotted against the row index.
## The columns are taken from mtcars by name so this chunk does not depend on
## a `df` object left over from an earlier chunk.
matplot(mtcars[, c("mpg", "disp", "hp")], type = "l",
        ylab = "Value", xlab = "Car Index")
## Three series, so three line types/colours (matplot's defaults start at 1).
legend("topright", legend = c("mpg", "disp", "hp"), lty = 1:3, col = 1:3)
# Heatmap Intro Example
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++
# 3 x 3 matrix filled row by row (TRUE spelled out: `T` can be reassigned)
mat <- matrix(c(1, 2, 5, 2, 2, 3, 5, 1, 1), 3, 3, byrow = TRUE)
x <- c("Row 1", "Row 2", "Row 3")
y <- c("Col 1", "Col 2", "Col 3")
# expand.grid() varies its first factor (X, the rows) fastest, which matches
# the column-major order of as.vector(mat), so each Z lines up with its cell.
data <- expand.grid(X = x, Y = y)
data$Z <- as.vector(mat)
ggplot(data = data, aes(x = Y, y = X, fill = Z)) +
  geom_tile(color = "white") +
  scale_fill_gradient(low = "orange", high = "maroon") +
  labs(x = "", y = "", fill = "")
# Heatmap of Mtcars Data
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++
# Center and scale every column so the variables share one colour scale
df <- as.data.frame(scale(mtcars, center = TRUE, scale = TRUE))
df$rowname <- rownames(mtcars)
# Long format is required: one row per (car, variable) pair
df2 <- melt(df, id.vars = "rowname")
ggplot(df2, aes(variable, rowname)) +
  geom_tile(aes(fill = value), colour = "white") +
  scale_fill_gradient(low = "white", high = "black") +
  labs(x = "", y = "", fill = "Amount") +
  theme_minimal()
The following link gives a detailed tutorial on drawing maps in R: https://remiller1450.github.io/s230s19/Intro_maps.html
# install.packages('wordcloud', dependencies=TRUE)
library(wordcloud)
## Loading required package: RColorBrewer
# Load the doc by term summary
#words <- read.csv('Words_Freq.csv')
#words <- read.delim('TEX.txt')
# Small inline term/frequency table used in place of the files above
words <- data.frame(
  words = c("CS", "STAT", "Math", "Coding", "Debug", "ggplot"),
  freq = c(1, 2, 3, 6, 3, 4)
)
# Word cloud
# The term column is named `words`; it is referenced by its full name because
# `words$word` only resolves through `$` partial matching on data frames.
wordcloud(words = words$words, freq = words$freq, min.freq = 2,
          max.words = 200, random.order = FALSE, rot.per = 0,
          colors = brewer.pal(8, "Dark2"))
Setting the working directory is key here! Make sure you have the right folder set up, because the images will be saved there.
# Write the scatter plot to a 3 x 2 inch PNG at 300 dpi; the file is created
# once the device is closed with dev.off().
png("scatterplot22.png", width = 3, height = 2, units = "in", res = 300)
par(mar = c(4, 4, 1, 1))
with(mtcars, plot(mpg, disp, pch = 20, las = 1, xlab = "mpg", ylab = "disp"))
dev.off()
# PDF variant: uncomment the pdf()/dev.off() pair to write to a file instead
# pdf('scatterplot.pdf',width=3,height=2.5)
par(mar = c(4, 4, 1, 1))
with(mtcars, plot(mpg, disp, pch = 20, las = 1, xlab = "mpg", ylab = "disp"))
# dev.off()