ggplot2 is Hadley Wickham’s R package for producing “elegant graphics for data analysis”.
The gg in ggplot2 stands for “Grammar of Graphics”.
The concept behind ggplot2 divides a plot into three fundamental parts: Plot = Data + Aesthetics + Geometry.
An aesthetic is a visual property of the objects in your plot. Aesthetics include things like the size, the shape, or the color of your points. You can display a point (like the one below) in different ways by changing the values of its aesthetic properties.
There are two major functions in ggplot2: qplot() and ggplot(). qplot() stands for quick plot and can be used to easily produce simple plots. ggplot() is more flexible and robust than qplot() because it builds a plot piece by piece; it will be our focus.
The seven ingredients of a ggplot
#ggplot(data = <DATA>) +
# <GEOM_FUNCTION>(
# mapping = aes(<MAPPINGS>),
# stat = <STAT>,
# position = <POSITION>
# ) +
# <COORDINATE_FUNCTION> +
# <FACET_FUNCTION>
# Install pacman on first use, then let it install (if needed) and attach the
# plotting packages. requireNamespace() only checks that pacman is available;
# it does not have to be attached because p_load() is called through the
# `pacman::` prefix.
if (!requireNamespace("pacman", quietly = TRUE)) {
  install.packages("pacman")
}
pacman::p_load(ggplot2, gridExtra)
Let’s clean the global environment before moving further
# Remove the objects listed by ls() from the current (global) environment.
# Attached packages are not objects in that environment, so they stay loaded.
# NOTE(review): when this file is source()d into an existing session this also
# deletes the caller's objects -- confirm that is intended.
rm(list = ls())

# "\014" is the form-feed control character, commonly used to clear the
# RStudio console. Only emit it in an interactive session so that batch or
# knitted output does not receive a stray control character.
if (interactive()) {
  cat("\014")
}
A major limitation of ggplot2 is that the data must be supplied as a data.frame.
Let’s load the dataset we’ve already worked with in the past
# Read the wine data from the Downloads folder of the current user
wine <- read.csv(file = "~/Downloads/WINE.csv")

# Store the QUALITY score as a factor so it is handled as a discrete variable
wine[["QUALITY"]] <- as.factor(wine[["QUALITY"]])

# Preview the first rows
head(wine)
## PH ALCOHOL QUALITY COUNTRY PRICE RATE YEAR
## 1 3.57 10.2 5 Italy 145.65 93 1982
## 2 3.20 9.8 5 Argentina 17.12 53 2000
## 3 3.42 11.0 6 Kazahstan 63.31 8 2003
## 4 3.52 11.2 6 Argentina 89.87 82 2002
## 5 3.45 10.5 5 Argentina 128.45 93 2007
## 6 3.51 9.4 5 Kazahstan 20.38 66 1973
# Basic barplot: PRICE values are stacked within each QUALITY level.
# The fill is mapped inside aes() so the colours follow the data rows even if
# `wine` is filtered or reordered; show.legend = FALSE keeps the panel
# legend-free.
p <- ggplot(data = wine, aes(x = QUALITY, y = PRICE)) +
  theme_minimal() +
  geom_bar(aes(fill = QUALITY), stat = "identity", show.legend = FALSE)

# Horizontal barplot
q <- p + coord_flip()

# Let's change the width of the bar
r <- ggplot(data = wine, aes(x = QUALITY, y = PRICE)) +
  theme_minimal() +
  geom_bar(
    aes(fill = QUALITY),
    stat = "identity",
    width = 0.4,
    show.legend = FALSE
  )

# Let's look at only the Quality 5 and 8. The x scale is discrete, so its
# limits are given as character level names rather than numbers.
s <- ggplot(data = wine, aes(x = QUALITY, y = PRICE)) +
  theme_minimal() +
  geom_bar(stat = "identity") +
  scale_x_discrete(limits = c("5", "8"))

# Show the four variants in a 2 x 2 layout
grid.arrange(p, q, r, s, nrow = 2)
It is also possible to manually change the barplot fill colors using the following functions:
scale_fill_manual(): to use custom colors; scale_fill_brewer(): to use color palettes from the RColorBrewer package; scale_fill_grey(): to use grey color palettes.
# Hand-picked fill colours, legend placed above the panel
a <- ggplot(
  data = wine,
  mapping = aes(x = QUALITY, y = PRICE, fill = QUALITY)
) +
  geom_bar(stat = "identity") +
  theme_minimal() +
  theme(legend.position = "top") +
  scale_fill_manual(
    values = c("red", "orange", "blue", "thistle", "cyan", "green")
  )

# ColorBrewer "Dark2" palette, legend placed below the panel
b <- ggplot(
  data = wine,
  mapping = aes(x = QUALITY, y = PRICE, fill = QUALITY)
) +
  geom_bar(stat = "identity") +
  theme_minimal() +
  theme(legend.position = "bottom") +
  scale_fill_brewer(palette = "Dark2")

# Grey-scale fill
c <- ggplot(
  data = wine,
  mapping = aes(x = QUALITY, y = PRICE, fill = QUALITY)
) +
  geom_bar(stat = "identity") +
  theme_minimal() +
  scale_fill_grey()

# Arrange the three plots on a grid with two rows
grid.arrange(a, b, c, nrow = 2)
A stacked barplot is created by default. You can use the function position_dodge() to place the bars side by side instead.
# COUNTRY is used as the fill variable, so store it as a factor
wine[["COUNTRY"]] <- as.factor(wine[["COUNTRY"]])

# Default position: bars of the different countries are stacked
a <- ggplot(
  data = wine,
  mapping = aes(x = QUALITY, y = PRICE, fill = COUNTRY)
) +
  geom_bar(stat = "identity") +
  theme_light()

# Dodged position with the ColorBrewer "Dark2" palette
b <- ggplot(
  data = wine,
  mapping = aes(x = QUALITY, y = PRICE, fill = COUNTRY)
) +
  geom_bar(stat = "identity", position = position_dodge()) +
  theme_minimal() +
  scale_fill_brewer(palette = "Dark2")

# One plot per row
grid.arrange(a, b, nrow = 2)
Box plots are drawn with the function geom_boxplot().
# Evaluating a layer call on its own prints the geom, stat and position it uses
geom_boxplot(
  outlier.colour = "black",
  outlier.shape = 16,
  outlier.size = 2,
  notch = FALSE
)
## geom_boxplot: outlier.colour = black, outlier.fill = NULL, outlier.shape = 16, outlier.size = 2, outlier.stroke = 0.5, outlier.alpha = NULL, notch = FALSE, notchwidth = 0.5, varwidth = FALSE, na.rm = FALSE, orientation = NA
## stat_boxplot: na.rm = FALSE, orientation = NA
## position_dodge2
Details
outlier.colour, outlier.shape, outlier.size: the color, shape and size of the outlying points. notch: logical value; if TRUE, a notched box plot is drawn. The notch displays a confidence interval around the median, which is normally based on the median +/- 1.58*IQR/sqrt(n). Notches are used to compare groups; if the notches of two boxes do not overlap, this is strong evidence that the medians differ.
Basic box plot
# PRICE distribution per QUALITY level with default settings
p <- ggplot(data = wine, mapping = aes(x = QUALITY, y = PRICE)) +
  geom_boxplot()

# Same plot with the axes swapped (horizontal boxes)
b <- p + coord_flip()

# Notched boxes
c <- ggplot(data = wine, mapping = aes(x = QUALITY, y = PRICE)) +
  geom_boxplot(notch = TRUE)

# Outliers drawn as large red stars (shape 8, size 4)
d <- ggplot(data = wine, mapping = aes(x = QUALITY, y = PRICE)) +
  geom_boxplot(
    outlier.colour = "red",
    outlier.shape = 8,
    outlier.size = 4
  )

# Show all four variants together
grid.arrange(p, b, c, d)
The function stat_summary() can be used to add mean points to a box plot:
# Boxes filled by QUALITY, with the group mean overlaid as a diamond
# (point shape 23, size 4)
a <- ggplot(
  data = wine,
  mapping = aes(x = QUALITY, y = PRICE, fill = QUALITY)
) +
  geom_boxplot() +
  stat_summary(fun = mean, geom = "point", shape = 23, size = 4)

# Display the plot
a
For histograms we will use the built-in mtcars dataset.
# Load the built-in mtcars data set into the workspace
data("mtcars")

# Work on a copy named `cars` so mtcars itself is left untouched
cars <- mtcars

# Preview the first rows
head(cars)
## mpg cyl disp hp drat wt qsec vs am gear carb
## Mazda RX4 21.0 6 160 110 3.90 2.620 16.46 0 1 4 4
## Mazda RX4 Wag 21.0 6 160 110 3.90 2.875 17.02 0 1 4 4
## Datsun 710 22.8 4 108 93 3.85 2.320 18.61 1 1 4 1
## Hornet 4 Drive 21.4 6 258 110 3.08 3.215 19.44 1 0 3 1
## Hornet Sportabout 18.7 8 360 175 3.15 3.440 17.02 0 0 3 2
## Valiant 18.1 6 225 105 2.76 3.460 20.22 1 0 3 1
We will use the geom_histogram() function.
# Evaluating the layer on its own prints the geom, stat and position it uses
geom_histogram(
  mapping = NULL,
  data = NULL,
  binwidth = 0.5
)
## geom_bar: na.rm = FALSE, orientation = NA
## stat_bin: binwidth = 0.5, bins = NULL, na.rm = FALSE, orientation = NA, pad = FALSE
## position_stack
# All three histograms put the density (not the count) on the y axis.
# after_stat(density) is the current spelling of the deprecated `..density..`
# notation and refers to the same variable computed by the binning stat.

# Narrow bins (0.5 mpg), no border
gp0 <- ggplot(data = cars, aes(x = mpg)) +
  geom_histogram(
    aes(y = after_stat(density)),
    fill = "thistle",
    binwidth = 0.5
  ) +
  theme_minimal()

# Wider bins (1 mpg) overlaid with a density curve.
# The value of alpha controls the level of transparency.
gp1 <- ggplot(data = cars, aes(x = mpg)) +
  geom_histogram(
    aes(y = after_stat(density)),
    colour = "black",  # color of the border
    fill = "thistle",  # color inside the bin
    binwidth = 1
  ) +
  geom_density(alpha = 0.2) +  # add density curve
  theme_minimal()

# Histogram and density curves filled by the number of cylinders
gp2 <- ggplot(data = cars, aes(x = mpg, fill = as.factor(cyl))) +
  geom_histogram(
    aes(y = after_stat(density)),
    binwidth = 1
  ) +
  geom_density(alpha = 0.2) +  # add density curve
  scale_fill_discrete(name = "No. of Cylinders") +
  theme_minimal()

# Stack the three plots vertically
grid.arrange(gp0, gp1, gp2)
Let’s add a mean line and a density plot to the histogram.
# Add mean line.
# after_stat(density) is the current spelling of the deprecated `..density..`
# notation and refers to the same variable computed by the binning stat.
a <- ggplot(data = cars, aes(x = mpg)) +
  geom_histogram(
    aes(y = after_stat(density)),
    colour = "black",
    fill = "thistle",
    binwidth = 0.5
  ) +
  theme_minimal() +
  geom_density(alpha = 0.2) +  # add density curve
  # Dashed red vertical line at the mean of mpg
  geom_vline(
    aes(xintercept = mean(mpg)),
    color = "red",
    linetype = "dashed",
    size = 1
  )

# Same plot, but the bin borders are drawn with a dashed line
b <- ggplot(data = cars, aes(x = mpg)) +
  geom_histogram(
    aes(y = after_stat(density)),
    colour = "black",
    fill = "thistle",
    linetype = "dashed",
    binwidth = 0.5
  ) +
  theme_minimal() +
  geom_density(alpha = 0.2) +  # add density curve
  geom_vline(
    aes(xintercept = mean(mpg)),
    color = "red",
    linetype = "dashed",
    size = 1
  )

# Stack both plots vertically
grid.arrange(a, b)
Let’s use facets to split the plot into multiple panels:
# Split the histogram into one row of panels per value of cyl
b <- b + facet_grid(rows = cyl ~ .)

# Draw the faceted plot
print(b)
Scatter plots are drawn with the function geom_point().
# Weight against fuel consumption, drawn as filled circles (shape 21)
gp1 <- ggplot(data = cars, mapping = aes(x = wt, y = mpg)) +
  geom_point(size = 2, shape = 21, fill = "thistle") +
  theme_minimal() +
  labs(
    title = "Weight of Cars V/S Miles Per Gallon",
    x = "Weight of Car",
    y = "Miles Per Gallon"
  )

# Display the plot
gp1
As expected, we can see that the miles per gallon decrease as the weight of the car increases.
Let’s explore a few of the shapes and point sizes available in the scatter plot.
# Four versions of the same scatter plot; each title names the point shape
# and size that the plot actually uses.

# Shape 21, size 2 (the title previously said "Shape 1")
gp2 <- ggplot(data = cars, aes(x = wt, y = mpg)) +
  geom_point(size = 2, shape = 21) +
  theme_minimal() +
  labs(
    title = "Shape 21, Size 2",
    x = "Weight of Car",
    y = "Miles Per Gallon"
  )

# Shape 18, size 3
gp3 <- ggplot(data = cars, aes(x = wt, y = mpg)) +
  geom_point(size = 3, shape = 18) +
  theme_minimal() +
  labs(
    title = "Shape 18, Size 3",
    x = "Weight of Car",
    y = "Miles Per Gallon"
  )

# Shape 13, size 1
gp4 <- ggplot(data = cars, aes(x = wt, y = mpg)) +
  geom_point(size = 1, shape = 13) +
  theme_minimal() +
  labs(
    title = "Shape 13, Size 1",
    x = "Weight of Car",
    y = "Miles Per Gallon"
  )

# Shape 1, size 4
gp5 <- ggplot(data = cars, aes(x = wt, y = mpg)) +
  geom_point(size = 4, shape = 1) +
  theme_minimal() +
  labs(
    title = "Shape 1, Size 4",
    x = "Weight of Car",
    y = "Miles Per Gallon"
  )

# 2 x 2 overview
grid.arrange(gp2, gp3, gp4, gp5, nrow = 2, ncol = 2)
Let’s color the points based on the number of cylinders in the car
# Convert the cylinder count to a factor
cars[["cyl"]] <- as.factor(cars[["cyl"]])

# Preview the first rows again
head(cars)
## mpg cyl disp hp drat wt qsec vs am gear carb
## Mazda RX4 21.0 6 160 110 3.90 2.620 16.46 0 1 4 4
## Mazda RX4 Wag 21.0 6 160 110 3.90 2.875 17.02 0 1 4 4
## Datsun 710 22.8 4 108 93 3.85 2.320 18.61 1 1 4 1
## Hornet 4 Drive 21.4 6 258 110 3.08 3.215 19.44 1 0 3 1
## Hornet Sportabout 18.7 8 360 175 3.15 3.440 17.02 0 0 3 2
## Valiant 18.1 6 225 105 2.76 3.460 20.22 1 0 3 1
# Scatter plot with the fill mapped to the number of cylinders and a label
# carrying the row name of `cars` for every point.
gp6 <- ggplot(data = cars, aes(x = wt, y = mpg, fill = cyl)) +
  geom_point(size = 2, shape = 21) +
  theme_minimal() +
  # Add text next to each point. The call no longer ends with an empty
  # argument ("hjust = 1,"): an empty argument is an error wherever the
  # extra arguments are collected with list(...).
  geom_label(
    label = rownames(cars),
    position = position_dodge(width = 0.5),
    hjust = 1
  ) +
  labs(
    title = "Weight of Cars V/S Miles Per Gallon",
    x = "Weight of Car",
    y = "Miles Per Gallon"
  )

# Display the plot
gp6
Regression lines can be added to the scatter plot with geom_smooth():
# Evaluating the layer on its own prints the geom, stat and position it uses
geom_smooth(
  method = "auto",
  se = TRUE,
  fullrange = FALSE,
  level = 0.95
)
## geom_smooth: na.rm = FALSE, orientation = NA, se = TRUE
## stat_smooth: na.rm = FALSE, orientation = NA, se = TRUE, fullrange = FALSE, level = 0.95, method = auto
## position_identity
# Single linear fit with its confidence band
gp7 <- ggplot(data = cars, mapping = aes(x = wt, y = mpg)) +
  geom_point(size = 1, shape = 22) +
  theme_minimal() +
  geom_smooth(method = lm) +
  labs(
    title = "Weight of Cars V/S Miles Per Gallon",
    x = "Weight of Car",
    y = "Miles Per Gallon"
  )

# Same fit drawn as a dashed line, without the confidence band
gp8 <- ggplot(data = cars, mapping = aes(x = wt, y = mpg)) +
  geom_point(size = 1, shape = 22) +
  theme_minimal() +
  geom_smooth(method = lm, se = FALSE, linetype = "dashed") +
  labs(
    title = "Weight of Cars V/S Miles Per Gallon",
    x = "Weight of Car",
    y = "Miles Per Gallon"
  )

# Colour mapped to cyl: one linear fit (with confidence band) per level
gp9 <- ggplot(
  data = cars,
  mapping = aes(x = wt, y = mpg, col = cyl, shape = cyl)
) +
  geom_point(size = 1, shape = 22) +
  theme_minimal() +
  geom_smooth(method = lm) +
  labs(
    title = "Weight of Cars V/S Miles Per Gallon",
    x = "Weight of Car",
    y = "Miles Per Gallon"
  )

# Per-level fits without the confidence bands
gp10 <- ggplot(
  data = cars,
  mapping = aes(x = wt, y = mpg, col = cyl, shape = cyl)
) +
  geom_point(size = 1, shape = 22) +
  theme_minimal() +
  geom_smooth(method = lm, se = FALSE) +
  labs(
    title = "Weight of Cars V/S Miles Per Gallon",
    x = "Weight of Car",
    y = "Miles Per Gallon"
  )

# 2 x 2 overview
grid.arrange(gp7, gp8, gp9, gp10, nrow = 2)
It is also possible to manually change the point and line colors using the following functions:
scale_color_brewer(): to use color palettes from the RColorBrewer package; scale_color_grey(): to use grey color palettes.
# Base plot: points and linear fits, with colour and shape mapped to cyl.
# fullrange = TRUE extends each fitted line across the whole x range.
p <- ggplot(
  data = cars,
  mapping = aes(x = wt, y = mpg, color = cyl, shape = cyl)
) +
  geom_point() +
  geom_smooth(method = lm, se = FALSE, fullrange = TRUE) +
  labs(x = "Weight of Car", y = "Miles Per Gallon") +
  theme_minimal()

# ColorBrewer "Dark2" palette
a <- p + scale_color_brewer(palette = "Dark2")

# Grey-scale palette
b <- p + scale_color_grey()

# Show both versions side by side
grid.arrange(a, b, nrow = 1)