Introduction

The ggplot2 package is based on the Grammar of Graphics. It has two main functions for data visualization: qplot and ggplot. The functionality of qplot is a subset of the functionality of ggplot. It is good to learn qplot, but sooner or later everyone moves completely to ggplot.

  • Data : The data (dataframe) that is being visualized
  • Aesthetic Mappings : Mappings between variables in the data and components of the chart and how data are mapped to color, size
  • Geometric Objects (geom) : The geometric objects that are used to display the data. For example, scatter plots use geom_point, bar plots use geom_bar, and line plots use geom_line

Above 3 are mandatory for a basic visualization. Further components are as below :

  • Scales : Scales control how variables are mapped to aesthetics. What scale an aesthetic map uses (e.g. male=red, female=green)
  • Coordinates : Coordinates describe how data is mapped to the plot e.g., use simple Cartesian coordinates with coord_cartesian, polar coordinates with coord_polar, or geographic projections with coord_map.
  • Statistical Transformations (stat) : Statistical transformations applied to the data to summarize the data e.g., boxplots use stat_boxplot, smoothed lines use stat_smooth, and histograms use stat_bin.
  • Facets : Describes how the data is partitioned into subsets and how these different subsets are plotted.
  • Positional adjustments : Provides fine-grained control of where data is plotted.

The ggplot2 package works on data frames.

kind : str ‘line’ : line plot (default) ‘bar’ : vertical bar plot ‘barh’ : horizontal bar plot ‘hist’ : histogram ‘box’ : boxplot ‘kde’ : Kernel Density Estimation plot ‘density’ : same as ‘kde’ ‘area’ : area plot ‘pie’ : pie plot ‘scatter’ : scatter plot ‘hexbin’ : hexbin plot

Install/Load package

#install.packages("ggplot2") #uncomment this if package not already installed
library(ggplot2)
library(dplyr)
library(gcookbook) #for datasets from the book ggplot2 cookbook

Theme

# Theme tweaks. `g` must already hold a ggplot object before these lines run.

# Black and white theme.
# theme_bw() is a complete theme: adding it replaces every theme setting made
# earlier, so it has to come before the individual theme() adjustments below.
g <- g + theme_bw()

# Horizontally centre the plot title
g <- g + theme(plot.title = element_text(hjust = 0.5))

# Remove legends
g <- g + theme(legend.position = "none")

Data Used

# Load every example dataset used below in a single call
# (mtcars and iris ship with R, diamonds with ggplot2, cabbage_exp with gcookbook)
data(mtcars, diamonds, iris, cabbage_exp)


#first 6 rows
head(mtcars)
##                    mpg cyl disp  hp drat    wt  qsec vs am gear carb
## Mazda RX4         21.0   6  160 110 3.90 2.620 16.46  0  1    4    4
## Mazda RX4 Wag     21.0   6  160 110 3.90 2.875 17.02  0  1    4    4
## Datsun 710        22.8   4  108  93 3.85 2.320 18.61  1  1    4    1
## Hornet 4 Drive    21.4   6  258 110 3.08 3.215 19.44  1  0    3    1
## Hornet Sportabout 18.7   8  360 175 3.15 3.440 17.02  0  0    3    2
## Valiant           18.1   6  225 105 2.76 3.460 20.22  1  0    3    1
head(diamonds)
## # A tibble: 6 x 10
##   carat cut       color clarity depth table price     x     y     z
##   <dbl> <ord>     <ord> <ord>   <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1 0.230 Ideal     E     SI2      61.5   55.   326  3.95  3.98  2.43
## 2 0.210 Premium   E     SI1      59.8   61.   326  3.89  3.84  2.31
## 3 0.230 Good      E     VS1      56.9   65.   327  4.05  4.07  2.31
## 4 0.290 Premium   I     VS2      62.4   58.   334  4.20  4.23  2.63
## 5 0.310 Good      J     SI2      63.3   58.   335  4.34  4.35  2.75
## 6 0.240 Very Good J     VVS2     62.8   57.   336  3.94  3.96  2.48
head(iris)
##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2  setosa
## 2          4.9         3.0          1.4         0.2  setosa
## 3          4.7         3.2          1.3         0.2  setosa
## 4          4.6         3.1          1.5         0.2  setosa
## 5          5.0         3.6          1.4         0.2  setosa
## 6          5.4         3.9          1.7         0.4  setosa
head(cabbage_exp)
##   Cultivar Date Weight        sd  n         se
## 1      c39  d16   3.18 0.9566144 10 0.30250803
## 2      c39  d20   2.80 0.2788867 10 0.08819171
## 3      c39  d21   2.74 0.9834181 10 0.31098410
## 4      c52  d16   2.26 0.4452215 10 0.14079141
## 5      c52  d20   3.11 0.7908505 10 0.25008887
## 6      c52  d21   1.47 0.2110819 10 0.06674995

Summary Statistics

str(mtcars)
## 'data.frame':    32 obs. of  11 variables:
##  $ mpg : num  21 21 22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 ...
##  $ cyl : num  6 6 4 6 8 6 8 4 4 6 ...
##  $ disp: num  160 160 108 258 360 ...
##  $ hp  : num  110 110 93 110 175 105 245 62 95 123 ...
##  $ drat: num  3.9 3.9 3.85 3.08 3.15 2.76 3.21 3.69 3.92 3.92 ...
##  $ wt  : num  2.62 2.88 2.32 3.21 3.44 ...
##  $ qsec: num  16.5 17 18.6 19.4 17 ...
##  $ vs  : num  0 0 1 1 0 1 0 1 1 1 ...
##  $ am  : num  1 1 1 0 0 0 0 0 0 0 ...
##  $ gear: num  4 4 4 3 3 3 3 4 4 4 ...
##  $ carb: num  4 4 1 1 2 1 4 2 2 4 ...

Scatter Plot

# Basic scatter plot: data + aesthetic mapping (aes) + geometric object (geom)
g <- ggplot(data = mtcars, mapping = aes(x = wt, y = mpg)) +
  geom_point()
g

#summary(g)


# Same scatter plot, with the points coloured by number of cylinders
g <- ggplot(data = mtcars, mapping = aes(x = wt, y = mpg)) +
  geom_point(aes(col = factor(cyl)))
g

# Scatter plot split into one panel (column) per cylinder count
g <- ggplot(data = mtcars, mapping = aes(x = wt, y = mpg)) +
  geom_point() +
  facet_grid(. ~ cyl)
g

# Faceted scatter plot with the points also coloured by cylinder count
g <- ggplot(data = mtcars, mapping = aes(x = wt, y = mpg)) +
  geom_point(aes(col = factor(cyl))) +
  facet_grid(. ~ cyl)
g

docs.ggplot2.org

Histogram

library(ggplot2)

# Histogram of 1000 standard-normal draws held in a one-column data frame
e <- data.frame(f = rnorm(1000))
g <- ggplot(data = e, aes(x = f)) +
  geom_histogram(bins = 30)
g

# Same data, with a black fill and red bar borders
g <- ggplot(data = e, aes(x = f)) +
  geom_histogram(bins = 30, fill = "black", col = "red")
g

# Histogram of a bare numeric vector: no data frame is supplied, so the
# variable named in aes() is taken from the calling environment
df <- rnorm(2000)
g <- ggplot(data = NULL, aes(x = df)) +
  geom_histogram(bins = 30, fill = "white", col = "red")
g

data(iris)
head(iris)
##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2  setosa
## 2          4.9         3.0          1.4         0.2  setosa
## 3          4.7         3.2          1.3         0.2  setosa
## 4          4.6         3.1          1.5         0.2  setosa
## 5          5.0         3.6          1.4         0.2  setosa
## 6          5.4         3.9          1.7         0.4  setosa
# Histogram of iris sepal length: blue bars with black borders
ggplot(iris, aes(x = Sepal.Length)) +
  geom_histogram(bins = 30, fill = "blue", col = "black")

Density Plot

#, eval=FALSE, include=FALSE
# Kernel density estimate of 1000 standard-normal draws
e <- data.frame(f = rnorm(1000))
# thehistogram <- qplot(x=f, data=e, geom="density")
# summary(thehistogram)

g <- ggplot(data = e, aes(x = f)) +
  geom_density()
g

Bar Plot

data(mtcars)
head(mtcars)
##                    mpg cyl disp  hp drat    wt  qsec vs am gear carb
## Mazda RX4         21.0   6  160 110 3.90 2.620 16.46  0  1    4    4
## Mazda RX4 Wag     21.0   6  160 110 3.90 2.875 17.02  0  1    4    4
## Datsun 710        22.8   4  108  93 3.85 2.320 18.61  1  1    4    1
## Hornet 4 Drive    21.4   6  258 110 3.08 3.215 19.44  1  0    3    1
## Hornet Sportabout 18.7   8  360 175 3.15 3.440 17.02  0  0    3    2
## Valiant           18.1   6  225 105 2.76 3.460 20.22  1  0    3    1
# Count the cars in each cylinder class. data.frame(table(...)) gives one row
# per class, with the class in column Var1 and the count in column Freq.
no_cyl <- data.frame(table(mtcars$cyl))

# Base-graphics bar plot of the same counts
barplot(table(mtcars$cyl))

# ggplot2 version: the counts are already computed, so use stat = "identity"
g <- ggplot(no_cyl, aes(x = Var1, y = Freq, fill = Var1))
g <- g + geom_bar(stat = "identity")
# labs() takes labels by aesthetic name, so the y-axis label is `y`, not `ylab`
g <- g + labs(x = "No. of Cylinders", y = "Frequency", title = "Bar Plot")
g

# Same cylinder-count bar plot, this time with a two-line, centred title.
# The axis label and title describe the data actually plotted (cars per
# cylinder class).
g <- ggplot(no_cyl, aes(x = Var1, y = Freq, fill = Var1))
g <- g + geom_bar(stat = "identity")
g <- g + labs(x = "No. of Cylinders")
g <- g + labs(y = "Frequency")
# "\n" splits the title over two lines
g <- g + ggtitle("Figure 2 \n Number of Cars by Cylinder Count")
# hjust = 0.5 centres the title horizontally
g <- g + theme(plot.title = element_text(hjust = 0.5))
g

prop.table(table(mtcars$cyl)) * 100
## 
##      4      6      8 
## 34.375 21.875 43.750
library(gcookbook)
cabbage_exp
##   Cultivar Date Weight        sd  n         se
## 1      c39  d16   3.18 0.9566144 10 0.30250803
## 2      c39  d20   2.80 0.2788867 10 0.08819171
## 3      c39  d21   2.74 0.9834181 10 0.31098410
## 4      c52  d16   2.26 0.4452215 10 0.14079141
## 5      c52  d20   3.11 0.7908505 10 0.25008887
## 6      c52  d21   1.47 0.2110819 10 0.06674995
# Stacked bars: cabbage weight per date, one stacked segment per cultivar
ggplot(cabbage_exp, aes(x = Date, y = Weight, fill = Cultivar)) +
  geom_bar(stat = "identity")

# Same data with the cultivars placed side by side instead of stacked
ggplot(cabbage_exp, aes(x = Date, y = Weight, fill = Cultivar)) +
  geom_bar(stat = "identity", position = "dodge")

# Kernel: https://www.kaggle.com/thie1e/rossmann-store-sales/exploratory-analysis-rossmann


# train <- read.csv("train.csv")
# train$Date <- as.Date(train$Date)
# train$Month <- month(train$Date)
# train$Day <- day(train$Date)
# train$Year <- year(train$Date)
# 
# #Day wise Sales
# day_sale <- aggregate(Sales~Day,data = train, sum)
# day_sale
# 
# library(ggplot2)
# d <- ggplot(day_sale, aes(factor(Day), Sales))
# d <- d + geom_bar(stat = "identity")
# d
# 
# d1 <- ggplot(train, aes(factor(Day))) + 
#   geom_bar() +
#   stat_summary_bin(aes(y = Sales), fun.y = "sum", geom = "bar")
# d1
# 
# library(gridExtra)
# grid.arrange(d,d1,ncol=2)

Vertical with Text

#Bar Plot
library(ggplot2)
head(diamonds)
## # A tibble: 6 x 10
##   carat cut       color clarity depth table price     x     y     z
##   <dbl> <ord>     <ord> <ord>   <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1 0.230 Ideal     E     SI2      61.5   55.   326  3.95  3.98  2.43
## 2 0.210 Premium   E     SI1      59.8   61.   326  3.89  3.84  2.31
## 3 0.230 Good      E     VS1      56.9   65.   327  4.05  4.07  2.31
## 4 0.290 Premium   I     VS2      62.4   58.   334  4.20  4.23  2.63
## 5 0.310 Good      J     SI2      63.3   58.   335  4.34  4.35  2.75
## 6 0.240 Very Good J     VVS2     62.8   57.   336  3.94  3.96  2.48
# Number of diamonds in each colour grade
color_count <- diamonds %>%
  group_by(color) %>%
  summarise(count = n())

# Vertical bar plot with the count printed inside the top of each bar
g <- ggplot(color_count, aes(x = color, y = count, fill = color)) +
  geom_bar(stat = "identity") +
  labs(
    x = "Diamond Color",
    y = "Diamond Count",
    title = "Bar Plot for Color Diamonds"
  ) +
  geom_text(aes(label = count), hjust = 0.5, vjust = 2)
g

Horizontal with Text - 1

# Horizontal bar plot of diamond counts per colour grade, with the count shown
# in parentheses at the base of each bar.
ggplot(color_count, aes(x = color, y = count)) +
  geom_bar(stat = "identity", colour = "white", aes(fill = color)) +
  # paste0() has no `sep` argument (an extra `sep = ""` would just be pasted on
  # as another empty string), so the label is built from the three pieces only.
  # y = 1 anchors the text at the start of each bar.
  geom_text(aes(x = color, y = 1, label = paste0("(", count, ")")),
            hjust = 0, vjust = .5, size = 4, colour = "black",
            fontface = "bold") +
  labs(x = "Color",
       y = "Count",
       title = "Count of Colours") +
  coord_flip() +                       # turn the bars horizontal
  theme_bw() +
  theme(legend.position = "none")      # the fill legend duplicates the axis

Horizontal with Text - 2

# Horizontal bar plot of diamond counts, with the count written at the base of each bar
g <- ggplot(color_count, aes(x = color, y = count, fill = color)) +
  geom_bar(stat = "identity") +
  labs(
    x = "Diamond Color",
    y = "Diamond Count",
    title = "Bar Plot for Color Diamonds"
  ) +
  geom_text(
    aes(x = color, y = 1, label = paste0(" ", count)),
    hjust = 0, vjust = 0, size = 4
  ) +
  coord_flip()
g

# Toy data set: four categories (A-D) observed in each of four financial years
Year <- rep(c("2006-07", "2007-08", "2008-09", "2009-10"), each = 4)
Category <- rep(LETTERS[1:4], times = 4)
Frequency <- c(
  168, 259, 226, 340, # 2006-07
  216, 431, 319, 368, # 2007-08
  423, 645, 234, 685, # 2008-09
  166, 467, 274, 251  # 2009-10
)
Data <- data.frame(Year, Category, Frequency)

# Stacked bars per year; each segment is labelled with its frequency, and
# position_stack(vjust = 0.5) puts the label in the middle of its segment
ggplot(
  Data,
  aes(x = Year, y = Frequency, fill = Category, label = Frequency)
) +
  geom_bar(stat = "identity") +
  geom_text(size = 3, position = position_stack(vjust = 0.5))

# g = ggplot(block_count, aes(blocks, busses, fill=blocks)) 
# g = g + geom_bar(stat = "identity")
# g = g + geom_text(aes(label = busses), hjust = 0.5, vjust = 2)
# g

Facet Grid, Facet Wrap

# facet_grid vs facet_wrap

# Unfaceted reference plot
ggplot(mtcars, aes(x = wt, y = mpg)) +
  geom_point()

# One panel per cylinder count, laid out as columns
ggplot(mtcars, aes(x = wt, y = mpg)) +
  geom_point() +
  facet_grid(. ~ cyl)

# One panel per cylinder count, wrapped into a single column
ggplot(mtcars, aes(x = wt, y = mpg)) +
  geom_point() +
  facet_wrap(~cyl, ncol = 1)

# The same two layouts on the mpg data, with semi-transparent points
g <- ggplot(mpg, aes(x = displ, y = hwy))
g +
  geom_point(alpha = 1 / 3) +
  facet_grid(. ~ class)

g +
  geom_point(alpha = 1 / 3) +
  facet_wrap(~class, ncol = 1)

Others

# Paired plot of the sleep data: one line per subject (ID) joining that
# subject's `extra` value in each group
data(sleep)
library(ggplot2)

g <- ggplot(sleep, aes(x = group, y = extra, group = factor(ID))) +
  # one coloured line per subject
  geom_line(size = 1, aes(colour = ID)) +
  # large, semi-transparent filled circles on each observation
  geom_point(size = 10, pch = 21, fill = "salmon", alpha = 0.5)
g