Data visualization is a technique used for the graphical representation of data. By using elements like scatter plots, charts, graphs, histograms, maps, etc., we make our data more understandable. Data visualization makes it easy to recognize patterns, trends, and exceptions in our data. It enables us to convey information and results in a quick and visual way. It is easier for a human brain to understand and retain information when it is represented in a pictorial form. Therefore, Data Visualization helps us interpret data quickly, examine different variables to see their effects on the patterns, and derive insights from our data.
A caption
R Pie Charts
The R programming language has several libraries for creating charts and graphs. A pie chart represents values as slices of a circle, each drawn in a different color. Slices are labeled with a description, and the number corresponding to each slice can also be shown in the chart. However, the R documentation advises against pie charts, and their features are limited. It recommends a bar chart or dot chart over a pie chart because people judge lengths more accurately than relative areas.
Pie charts are created with the pie() function, which takes a vector of non-negative numbers as input. Additional parameters control labels, colors, titles, etc.
The pie() function has the following syntax:
pie(x, labels, radius, main, col, clockwise)
# Values to plot and the country label that belongs to each slice.
x <- c(68, 52, 15, 50)
labels <- c("India", "America", "Sri Lanka", "Nepal")
# Basic pie chart: one labelled slice per value.
pie(x, labels = labels)
Adding title and changing colors of the plot
# Same chart with a title and one explicit fill colour per slice.
pie(
  x,
  labels = labels,
  main = "Literacy rate of countries",
  col = c("salmon", "violet", "seagreen", "lightblue")
)
Slice Percentage and Chart Legend
A pie chart can be extended in two further ways: slice percentages and a chart legend. We can show the data as percentages, and we can add a legend to a plot in R with the legend() function, which has the following syntax:
legend(x,y=NULL,legend,fill,col,bg)
# Share of each value in the total, as a percentage rounded to one decimal.
pie_percent <- round(100 * x / sum(x), 1)
pie_percent
## [1] 36.8 28.1 8.1 27.0
# One fill colour per slice; reused below so chart and legend agree.
cols <- c("salmon", "violet", "seagreen", "lightblue")
# Label the slices with their percentages instead of the country names.
pie(
  x,
  labels = pie_percent,
  main = "Literacy rate of countries",
  col = cols
)
# Legend in the top-right corner mapping each colour to its country.
legend(
  "topright",
  legend = c("India", "America", "Sri Lanka", "Nepal"),
  cex = 0.8,
  fill = cols
)
3 Dimensional Pie Chart
In R, we can also create a three-dimensional pie chart. For this purpose, the plotrix package provides the pie3D() function, which draws an attractive 3D pie chart. pie3D() accepts several of the same parameters as pie(), plus extras such as explode. Let’s see an example to understand how a 3D pie chart is created with the help of this function.
library(plotrix)
# Draw the same 3D pie three times, changing only `explode`
# (0.1, then 0, then 1) to compare the effect of that argument.
invisible(lapply(c(0.1, 0, 1), function(explode_by) {
  pie3D(
    x,
    labels = pie_percent,
    explode = explode_by,
    main = "3D pie-chart for literacy rate"
  )
}))
R Bar Charts
A bar chart is a pictorial representation in which numerical values of variables are represented by the length or height of lines or rectangles of equal width. A bar chart is used for summarizing a set of categorical data. In a bar chart, the data is shown as rectangular bars whose length is proportional to the value of the variable.
In R, we can create a bar chart to visualize the data in an efficient manner. For this purpose, R provides the barplot() function, which has the following syntax:
barplot(height, xlab, ylab, main, names.arg, col)
# Bar heights (revenue figures), first drawn with all defaults.
H <- c(12, 35, 54, 3, 41)
barplot(H)
# Month label for each bar, in the same order as H.
M <- c("Feb", "Mar", "Apr", "May", "Jun")
# Plotting the bar chart with axis labels, a title and coloured bars
barplot(
  H,
  names.arg = M,
  xlab = "Month",
  ylab = "Revenue",
  col = "salmon2",
  main = "Revenue Bar chart",
  border = "black"
)
library(RColorBrewer)
months <- c("Jan", "Feb", "Mar", "Apr", "May")
regions <- c("West", "North", "South")
# Creating the matrix of the values: one row per region, one column per month.
Values <- matrix(
  c(
    21, 32, 33, 14, 95,
    46, 67, 78, 39, 11,
    22, 23, 94, 15, 16
  ),
  nrow = 3,
  ncol = 5,
  byrow = TRUE
)
View(Values)
# A matrix input gives one bar per column, split into one segment per row.
barplot(
  Values,
  main = "Total Revenue",
  names.arg = months,
  xlab = "Month",
  ylab = "Revenue",
  col = c("cadetblue3", "deeppink2", "goldenrod1")
)
# Legend colours are listed in the same order as the matrix rows (regions).
legend(
  "topleft",
  legend = regions,
  cex = 1.3,
  fill = c("cadetblue3", "deeppink2", "goldenrod1")
)
Barplots with ggplot2
# Three dose levels and the length measured for each.
df <- data.frame(
  dose = c("D0.5", "D1", "D2"),
  len = c(4.2, 10, 29.5)
)
library(ggplot2)
# Basic barplot; geom_col() draws bar heights straight from the y values
p <- ggplot(df, aes(x = dose, y = len)) +
  geom_col()
p
# Horizontal bar plot
p + coord_flip()
# Change the width of bars
ggplot(df, aes(x = dose, y = len)) +
  geom_col(width = 0.5)
# Change colors
ggplot(df, aes(x = dose, y = len)) +
  geom_col(color = "lightblue", fill = "white")
# Minimal theme + blue fill color
p <- ggplot(df, aes(x = dose, y = len)) +
  geom_col(fill = "steelblue") +
  theme_minimal()
p
# Outside bars: value labels placed just above each bar
p + geom_text(aes(label = len), vjust = -0.3, size = 3.5)
# Inside bars: white value labels placed just below the top of each bar
p + geom_text(aes(label = len), vjust = 1.6, color = "white", size = 3.5)
# Change barplot line colors by groups (outline mapped to dose, white fill)
p <- ggplot(df, aes(x = dose, y = len, color = dose)) +
  geom_col(fill = "white")
p
# Use custom color palettes
p + scale_color_manual(values = c("salmon2", "seagreen", "violet"))
# Use brewer color palettes
p + scale_color_brewer(palette = "Dark2")
# Use grey scale
p + scale_color_grey() + theme_classic()
# Change barplot fill colors by groups (fill mapped to dose)
p <- ggplot(df, aes(x = dose, y = len, fill = dose)) +
  geom_col() +
  theme_minimal()
p
# Use custom color palettes
p + scale_fill_manual(values = c("salmon2", "seagreen", "violet"))
# Use brewer color palettes
p + scale_fill_brewer(palette = "Dark2")
# Use grey scale
p + scale_fill_grey()
# Use black outline
ggplot(df, aes(x = dose, y = len, fill = dose)) +
  geom_col(color = "black") +
  scale_fill_manual(values = c("salmon2", "seagreen", "violet")) +
  theme_minimal()
# Change bar fill colors to blues; p keeps this scale for the plots below
p <- p + scale_fill_brewer(palette = "Blues")
# Legend position
p + theme(legend.position = "top")
p + theme(legend.position = "bottom")
# Remove legend
p + theme(legend.position = "none")
# Larger text overall, x-axis tick labels rotated by 90 degrees
p + theme(
  text = element_text(size = 20),
  axis.text.x = element_text(angle = 90, hjust = 1)
)
# Change the order of the bars
p + scale_x_discrete(limits = c("D2", "D0.5", "D1"))
Barplots with matrices
View(mtcars)
# Don't map a variable to y: geom_bar() counts the rows in each x group
ggplot(mtcars, aes(x = factor(cyl))) +
  geom_bar(width = 0.7, fill = "steelblue") +
  theme_minimal()
# The same counts as a table, for comparison with the bar heights
as.data.frame(table(mtcars[["cyl"]]))
## Var1 Freq
## 1 4 11
## 2 6 7
## 3 8 14
Stacked barplots in ggplot2
# Two supplement groups (VC, OJ), each measured at the same three doses.
df2 <- data.frame(
  supp = rep(c("VC", "OJ"), each = 3),
  dose = rep(c("D0.5", "D1", "D2"), times = 2),
  len = c(6.8, 15, 33, 4.2, 10, 29.5)
)
View(df2)
# Stacked barplot with multiple groups
ggplot(df2, aes(x = dose, y = len, fill = supp)) +
  geom_col()
# Use position=position_dodge() to place the groups side by side
ggplot(df2, aes(x = dose, y = len, fill = supp)) +
  geom_col(position = position_dodge())
# Change the colors manually
p <- ggplot(df2, aes(x = dose, y = len, fill = supp)) +
  geom_col(color = "black", position = position_dodge()) +
  theme_minimal()
# Use custom colors
p + scale_fill_manual(values = c("#999999", "#E69F00"))
# Use brewer color palettes
p + scale_fill_brewer(palette = "Blues")
# Dodged bars with white value labels; the labels are dodged by 0.9 as well
ggplot(df2, aes(x = dose, y = len, fill = supp)) +
  geom_col(position = position_dodge()) +
  geom_text(
    aes(label = len),
    vjust = 1.6,
    color = "white",
    position = position_dodge(0.9),
    size = 3.5
  ) +
  scale_fill_brewer(palette = "Paired") +
  theme_minimal()
Barplot with error bars
The helper function below calculates the mean and the standard deviation of the variable of interest in each group:
#+++++++++++++++++++++++++
# Function to calculate the mean and the standard deviation
# for each group
#+++++++++++++++++++++++++
# data : a data frame
# varname : the name of a column containing the variable
#           to be summarized
# groupnames : vector of column names to be used as
#              grouping variables
# Returns a data frame with one row per group holding the grouping
# columns, the group mean (stored under the name given in `varname`)
# and the group standard deviation (`sd`). NA values are ignored.
data_summary <- function(data, varname, groupnames) {
  # require() only returns FALSE when a package is missing, which surfaces
  # later as "could not find function"; check up front and fail clearly.
  if (!requireNamespace("plyr", quietly = TRUE)) {
    stop("Package 'plyr' is required by data_summary().", call. = FALSE)
  }
  # A missing column would otherwise be summarised as NA instead of failing.
  if (!is.character(varname) || length(varname) != 1L ||
      !varname %in% names(data)) {
    stop("`varname` must name a single column of `data`.", call. = FALSE)
  }
  # Mean and standard deviation of column `col` within one group's rows.
  summary_func <- function(x, col) {
    c(
      mean = mean(x[[col]], na.rm = TRUE),
      sd = sd(x[[col]], na.rm = TRUE)
    )
  }
  # Calls are namespace-qualified so that same-named functions from other
  # attached packages (e.g. dplyr::rename) cannot be picked up by mistake.
  # `varname` is forwarded through `...` as the `col` argument of summary_func.
  data_sum <- plyr::ddply(data, groupnames, .fun = summary_func, varname)
  # Store the mean under the original variable name.
  plyr::rename(data_sum, c("mean" = varname))
}
# Mean and standard deviation of tooth length per supplement and dose.
df3 <- data_summary(
  ToothGrowth,
  varname = "len",
  groupnames = c("supp", "dose")
)
## Loading required package: plyr
# Convert dose to a factor variable
df3$dose <- as.factor(df3$dose)
View(df3)
# Standard deviation of the mean as error bar: mean - sd up to mean + sd
p <- ggplot(df3, aes(x = dose, y = len, fill = supp)) +
  geom_col(position = position_dodge()) +
  geom_errorbar(
    aes(ymin = len - sd, ymax = len + sd),
    width = 0.2,
    position = position_dodge(0.9)
  )
p + scale_fill_brewer(palette = "Paired") + theme_minimal()
Boxplots in ggplot2
# Toy data: 70 samples in three grades (24, 31 and 15 samples), values 1 to 70.
mat <- data.frame(
  Grade = rep(c("Grade 2", "Grade 3", "Grade 4"), times = c(24, 31, 15)),
  BIRC5 = 1:70
)
View(mat)
library(ggpubr)
##
## Attaching package: 'ggpubr'
## The following object is masked from 'package:plyr':
##
## mutate
# basic boxplot
ggplot(mat, aes(x = Grade, y = BIRC5)) +
  geom_boxplot()
# Color the boxes
ggplot(mat, aes(x = Grade, y = BIRC5, fill = Grade)) +
  geom_boxplot()
# Make the background white
ggplot(mat, aes(x = Grade, y = BIRC5, fill = Grade)) +
  geom_boxplot() +
  theme_classic()
# Add pvalues using anova
ggplot(mat, aes(x = Grade, y = BIRC5, fill = Grade)) +
  geom_boxplot() +
  theme_classic() +
  stat_compare_means(method = "anova")
# Every pair of grades to be compared against each other.
my_comparisons <- list(
  c("Grade 2", "Grade 3"),
  c("Grade 2", "Grade 4"),
  c("Grade 3", "Grade 4")
)
# Add multiple pvalues
ggplot(mat, aes(x = Grade, y = BIRC5, fill = Grade)) +
  geom_boxplot() +
  theme_classic() +
  stat_compare_means(comparisons = my_comparisons)
# Add labels: first the title, then the axis labels as well
ggplot(mat, aes(x = Grade, y = BIRC5, fill = Grade)) +
  geom_boxplot() +
  theme_classic() +
  stat_compare_means(comparisons = my_comparisons) +
  ggtitle("BIRC5 gene expression")
ggplot(mat, aes(x = Grade, y = BIRC5, fill = Grade)) +
  geom_boxplot() +
  theme_classic() +
  stat_compare_means(comparisons = my_comparisons) +
  ggtitle("BIRC5 gene expression") +
  xlab("Glioma grades") +
  ylab("Log2 CPM expression")
# Remove legend
p <- ggplot(mat, aes(x = Grade, y = BIRC5, fill = Grade)) +
  geom_boxplot() +
  theme_classic() +
  stat_compare_means(comparisons = my_comparisons) +
  ggtitle("BIRC5 gene expression") +
  xlab("Glioma grades") +
  ylab("Log2 CPM expression") +
  theme(legend.position = "none")
p
# Overlay the individual points, jittered by 0.2 and then by 0.1
p + geom_jitter(shape = 16, position = position_jitter(0.2))
p + geom_jitter(shape = 16, position = position_jitter(0.1))
# Store the images
png("BIRC5.png")
# Print explicitly: a ggplot object is only drawn when it is printed, and
# automatic printing happens only at the interactive top level. Without
# print() nothing is drawn to the device when this code runs via source(),
# inside a loop or inside a function.
print(
  ggplot(mat, aes(x = Grade, y = BIRC5, fill = Grade)) +
    geom_boxplot() +
    theme_classic() +
    stat_compare_means(comparisons = my_comparisons) +
    ggtitle("BIRC5 gene expression") +
    xlab("Glioma grades") +
    ylab("Log2 CPM expression") +
    theme(legend.position = "none")
)
# Close the png device so the file is written out.
dev.off()
## png
## 2
Violin plots in ggplot2
Violin plots are similar to box plots, except that they also show the kernel probability density of the data at different values. Typically, violin plots will include a marker for the median of the data and a box indicating the interquartile range, as in standard box plots.
# Basic violin plot of BIRC5 per grade.
p <- ggplot(mat, aes(x = Grade, y = BIRC5)) +
  geom_violin()
p
# Mark the median of each group with a red point.
p + stat_summary(fun = median, geom = "point", size = 2, color = "red")
# Same plot with trimming switched off. FALSE is spelled out because `F` is
# an ordinary variable that can be reassigned.
p <- ggplot(mat, aes(x = Grade, y = BIRC5)) +
  geom_violin(trim = FALSE)
p
# add boxplots within the violin plot
p + geom_boxplot(width = 0.1)
Scatterplots in R
Basic scatterplot
a <- 1:10
b <- 11:20
# Decreasing values -1 to -10, used as the y variable below. Named `weight`
# (matching the final axis label) rather than `c`, so the base function c()
# is not shadowed; the default y-axis label now reads "weight".
weight <- -1:-10
# Default scatterplot, then the same data with point symbol 20.
plot(a, b)
plot(a, b, pch = 20)
# Symbols for all the shapes in R
ggpubr::show_point_shapes()
## Scale for 'y' is already present. Adding another scale for 'y', which will
## replace the existing scale.
# Red points with symbol 18, then enlarged (cex = 5), then with axis labels.
plot(a, weight, col = "red", pch = 18)
plot(a, weight, col = "red", pch = 18, cex = 5)
plot(a, weight, col = "red", pch = 18, cex = 5,
     xlab = "Exercise hours", ylab = "Weight")
ggplot2 scatterplots
# Keep three mtcars columns and treat the cylinder count as a category.
df <- mtcars[c("mpg", "cyl", "wt")]
df$cyl <- as.factor(df$cyl)
head(df)
## mpg cyl wt
## Mazda RX4 21.0 6 2.620
## Mazda RX4 Wag 21.0 6 2.875
## Datsun 710 22.8 4 2.320
## Hornet 4 Drive 21.4 6 3.215
## Hornet Sportabout 18.7 8 3.440
## Valiant 18.1 6 3.460
# Basic scatter plot of mpg against weight
ggplot(df, aes(x = wt, y = mpg)) +
  geom_point()
# Change the point shape
ggplot(df, aes(x = wt, y = mpg)) +
  geom_point(shape = 18)
# change shape, color, fill, size
ggplot(df, aes(x = wt, y = mpg)) +
  geom_point(
    shape = 23,
    fill = "blue",
    color = "darkred",
    size = 3
  )
# Add the Pearson correlation to the plot with ggpubr's stat_cor()
ggplot(df, aes(x = wt, y = mpg)) +
  geom_point(shape = 18) +
  stat_cor(method = "pearson")
Heatmaps in R
A single heatmap is the most commonly used approach for visualizing the data. Although “the shining point” of the ComplexHeatmap package is that it can visualize a list of heatmaps in parallel, the single heatmap is the basic unit of the heatmap list, so it is still very important to have it well configured.
library(ComplexHeatmap)
## Loading required package: grid
## ========================================
## ComplexHeatmap version 2.8.0
## Bioconductor page: http://bioconductor.org/packages/ComplexHeatmap/
## Github page: https://github.com/jokergoo/ComplexHeatmap
## Documentation: http://jokergoo.github.io/ComplexHeatmap-reference
##
## If you use it in published research, please cite:
## Gu, Z. Complex heatmaps reveal patterns and correlations in multidimensional
## genomic data. Bioinformatics 2016.
##
## The new InteractiveComplexHeatmap package can directly export static
## complex heatmaps into an interactive Shiny app with zero effort. Have a try!
##
## This message can be suppressed by:
## suppressPackageStartupMessages(library(ComplexHeatmap))
## ========================================
library(circlize)
## ========================================
## circlize version 0.4.13
## CRAN page: https://cran.r-project.org/package=circlize
## Github page: https://github.com/jokergoo/circlize
## Documentation: https://jokergoo.github.io/circlize_book/book/
##
## If you use it in published research, please cite:
## Gu, Z. circlize implements and enhances circular visualization
## in R. Bioinformatics 2014.
##
## This message can be suppressed by:
## suppressPackageStartupMessages(library(circlize))
## ========================================
# Fixed seed so the simulated matrix is reproducible.
set.seed(123)
# Row counts of the three row groups and their total.
nr1 <- 4
nr2 <- 8
nr3 <- 6
nr <- nr1 + nr2 + nr3
# Column counts of the three column groups and their total.
nc1 <- 6
nc2 <- 8
nc3 <- 10
nc <- nc1 + nc2 + nc3
# Build the matrix from 3 x 3 blocks of normal noise (sd = 0.5). Each rbind()
# is one band of columns; the block means (1, 0 or 0.5) differ per block.
# `nrow` is written out in full: the original `nr = ` only worked through
# partial argument matching and is easy to confuse with the variable `nr`.
mat <- cbind(
  rbind(
    matrix(rnorm(nr1 * nc1, mean = 1, sd = 0.5), nrow = nr1),
    matrix(rnorm(nr2 * nc1, mean = 0, sd = 0.5), nrow = nr2),
    matrix(rnorm(nr3 * nc1, mean = 0, sd = 0.5), nrow = nr3)
  ),
  rbind(
    matrix(rnorm(nr1 * nc2, mean = 0, sd = 0.5), nrow = nr1),
    matrix(rnorm(nr2 * nc2, mean = 1, sd = 0.5), nrow = nr2),
    matrix(rnorm(nr3 * nc2, mean = 0, sd = 0.5), nrow = nr3)
  ),
  rbind(
    matrix(rnorm(nr1 * nc3, mean = 0.5, sd = 0.5), nrow = nr1),
    matrix(rnorm(nr2 * nc3, mean = 0.5, sd = 0.5), nrow = nr2),
    matrix(rnorm(nr3 * nc3, mean = 1, sd = 0.5), nrow = nr3)
  )
)
mat <- mat[sample(nr, nr), sample(nc, nc)] # random shuffle rows and columns
rownames(mat) <- paste0("row", seq_len(nr))
colnames(mat) <- paste0("column", seq_len(nc))
View(mat)
# Heatmap of the simulated matrix with all defaults.
Heatmap(mat)
# change color: mapping anchored at -2 (green), 0 (white) and 2 (red)
col_fun <- colorRamp2(c(-2, 0, 2), c("green", "white", "red"))
Heatmap(mat, col = col_fun)
Heatmap(mat, col = col_fun, name = "Sample heatmap")
# Deal with missing values in Heatmap
mat_with_na <- mat
# Flag each cell as missing with probability 1/10 (weights 1 : 9).
na_index <- sample(
  c(TRUE, FALSE),
  nrow(mat) * ncol(mat),
  replace = TRUE,
  prob = c(1, 9)
)
mat_with_na[na_index] <- NA
Heatmap(
  mat_with_na,
  name = "mat",
  na_col = "black",
  column_title = "a matrix with NA values"
)
# Titles
Heatmap(
  mat,
  name = "mat",
  column_title = "I am a column title",
  row_title = "I am a row title"
)
Heatmap(
  mat,
  name = "mat",
  column_title = "I am a big column title",
  column_title_gp = gpar(fontsize = 20, fontface = "bold")
)
# Clustering in heatmap
Heatmap(mat, name = "mat", cluster_rows = FALSE) # turn off row clustering
Heatmap(mat, name = "mat", show_column_dend = FALSE) # hide column dendrogram
Heatmap(
  mat,
  name = "mat",
  row_dend_side = "right",
  column_dend_side = "bottom"
)
Heatmap(
  mat,
  name = "mat",
  column_dend_height = unit(4, "cm"),
  row_dend_width = unit(4, "cm")
)
# Dimension names
Heatmap(
  mat,
  name = "mat",
  row_names_side = "left",
  row_dend_side = "right",
  column_names_side = "top",
  column_dend_side = "bottom"
)
Heatmap(mat, name = "mat", show_row_names = FALSE)
Heatmap(mat, name = "mat", row_names_gp = gpar(fontsize = 20))
Heatmap(
  mat,
  name = "mat",
  row_names_centered = TRUE,
  column_names_centered = TRUE
)
Heatmap(mat, name = "mat", column_names_rot = 45)
Heatmap(
  mat,
  name = "mat",
  column_names_rot = 45,
  column_names_side = "top",
  column_dend_side = "bottom"
)
# Split by k-means clustering
Heatmap(mat, name = "mat", row_km = 2)
Heatmap(mat, name = "mat", column_km = 3)
# Size of the heatmap
Heatmap(mat, name = "mat", width = unit(8, "cm"), height = unit(8, "cm"))
Heatmap(
  mat,
  name = "mat",
  heatmap_width = unit(8, "cm"),
  heatmap_height = unit(8, "cm")
)
# Heatmap Annotations
set.seed(123)
# Fresh 10 x 10 matrix of standard-normal values with short dimension names.
mat <- matrix(rnorm(100), nrow = 10)
rownames(mat) <- paste0("R", 1:10)
colnames(mat) <- paste0("C", 1:10)
# Column annotation: a simple value annotation plus a barplot annotation.
column_ha <- HeatmapAnnotation(
  foo1 = runif(10),
  bar1 = anno_barplot(runif(10))
)
# Row annotation built the same way.
row_ha <- rowAnnotation(
  foo2 = runif(10),
  bar2 = anno_barplot(runif(10))
)
# Attach the annotations on the top/right, then on the bottom/left.
Heatmap(
  mat,
  name = "mat",
  top_annotation = column_ha,
  right_annotation = row_ha
)
Heatmap(
  mat,
  name = "mat",
  bottom_annotation = column_ha,
  left_annotation = row_ha
)