This is a introductory tutorial to get you started with Visualization data and Exploring Data with R. There are some popular books and many online materials i will Provide the links and references at the end of the tutorial.
library(ggplot2)
library(gcookbook)
plot(cars$dist~cars$speed, # y~x
main="Relationship between car distance & speed", #Plot Title
xlab="Speed (miles per hour)", #X axis title
ylab="Distance travelled (miles)", #Y axis title
xlim=c(0,30), #Set x axis limits from 0 to 30 ylim=c(0,140), #Set y axis limits from 0 to 30140 xaxs="i", #Set x axis style as internal
yaxs="i", #Set y axis style as internal
col="red", #Set the colour of plotting symbol to red
pch=19) #Set the plotting symbol to filled dots
Let's draw vertical error bars with 5% errors on our cars scatterplot using arrows function
plot(mpg ~ disp, data = mtcars)
arrows(x0 = mtcars$disp, y0 = mtcars$mpg * 0.95, x1 = mtcars$disp, y1 = mtcars$mpg *
1.05, angle = 90, code = 3, length = 0.04, lwd = 0.4)
How to draw histograms in the top and right margins of a bivariate scatter plot
layout(matrix(c(2,0,1,3),2,2,byrow=TRUE), widths=c(3,1), heights=c(1,3), TRUE)
par(mar=c(5.1,4.1,0.1,0))
plot(cars$dist~cars$speed, # y~x
xlab="Speed (miles per hour)", #X axis title
ylab="Distance travelled (miles)", #Y axis title
xlim=c(0,30), #Set x axis limits from 0 to 30 ylim=c(0,140), #Set y axis limits from 0 to 30140 xaxs="i", #Set x axis style as internal
yaxs="i", #Set y axis style as internal
col="red", #Set the colour of plotting symbol to red
pch=19) #Set the plotting symbol to filled dots
par(mar=c(0,4.1,3,0))
hist(cars$speed,ann=FALSE,axes=FALSE,col="black",border="white")
yhist <- hist(cars$dist,plot=FALSE)
par(mar=c(5.1,0,0.1,1))
barplot(yhist$density,
horiz=TRUE,space=0,axes=FALSE,
col="black",border="white")
Using ggplot library
ggplot(mtcars, aes(x = wt, y = mpg)) + geom_point()
## Multiple lines in a plot
plot(pressure$temperature, pressure$pressure, type = "l")
points(pressure$temperature, pressure$pressure)
lines(pressure$temperature, pressure$pressure/2, col = "red")
points(pressure$temperature, pressure$pressure/2, col = "red")
ggplot(pressure, aes(x = temperature, y = pressure)) + geom_line()
# Lines and points together
ggplot(pressure, aes(x = temperature, y = pressure)) + geom_line() + geom_point()
# Showing Lines Along the Axes
ggplot(pressure, aes(x = temperature, y = pressure)) + geom_line() + geom_point() +
theme(axis.line = element_line(colour = "black"))
# Logarithmic axis
ggplot(pressure, aes(x = temperature, y = pressure)) + geom_line() + geom_point() +
theme(axis.line = element_line(colour = "black")) + scale_x_log10() + scale_y_log10()
From library(gcookbook) I am using heightweight dataset to group data points by variables, The grouping variable must be categorical—in other words, a factor or character vector.
ggplot(heightweight, aes(x = ageYear, y = heightIn, shape = sex, colour = sex)) +
geom_point()
# Other shapes and color can be used by scale_shape_manual()
# scale_colour_manual()
# Change shape of points
ggplot(heightweight, aes(x = ageYear, y = heightIn)) + geom_point(shape = 3)
# Change point size sex is categorical
ggplot(heightweight, aes(x = ageYear, y = heightIn, shape = sex)) + geom_point(size = 3) +
scale_shape_manual(values = c(1, 4))
# Represent a third continuous variable using color or size.
ggplot(heightweight, aes(x = weightLb, y = heightIn, fill = ageYear)) + geom_point(shape = 21,
size = 2.5) + scale_fill_gradient(low = "black", high = "white", breaks = 12:17,
guide = guide_legend())
Adding Fitted Regression Model Lines
sp <- ggplot(heightweight, aes(x = ageYear, y = heightIn))
sp + geom_point() + stat_smooth(method = lm)
# Adding annotations to regression plot
model <- lm(heightIn ~ ageYear, heightweight)
summary(model)
# First generate prediction data Given a model, predict values of yvar from
# xvar This supports one predictor and one predicted variable xrange: If
# NULL, determine the x range from the model object. If a vector with two
# numbers, use those as the min and max of the prediction range. samples:
# Number of samples across the x range. ...: Further arguments to be passed
# to predict()
predictvals <- function(model, xvar, yvar, xrange = NULL, samples = 100, ...) {
# If xrange isn't passed in, determine xrange from the models. Different
# ways of extracting the x range, depending on model type
if (is.null(xrange)) {
if (any(class(model) %in% c("lm", "glm")))
xrange <- range(model$model[[xvar]]) else if (any(class(model) %in% "loess"))
xrange <- range(model$x)
}
newdata <- data.frame(x = seq(xrange[1], xrange[2], length.out = samples))
names(newdata) <- xvar
newdata[[yvar]] <- predict(model, newdata = newdata, ...)
newdata
}
pred <- predictvals(model, "ageYear", "heightIn")
sp <- ggplot(heightweight, aes(x = ageYear, y = heightIn)) + geom_point() +
geom_line(data = pred)
sp + annotate("text", label = "r^2 == 0.42", x = 16.5, y = 52, parse = TRUE)
Scatter plot matrix and correlation matrix using mtcars dataset and first five variables
library(corrplot)
pairs(mtcars[, 1:5])
# Scatter plot with correlations in the upper triangle, smoothing lines in
# the lower triangle, and histograms on the diagonal
panel.cor <- function(x, y, digits = 2, prefix = "", cex.cor, ...) {
usr <- par("usr")
on.exit(par(usr))
par(usr = c(0, 1, 0, 1))
r <- abs(cor(x, y, use = "complete.obs"))
txt <- format(c(r, 0.123456789), digits = digits)[1]
txt <- paste(prefix, txt, sep = "")
if (missing(cex.cor))
cex.cor <- 0.8/strwidth(txt)
text(0.5, 0.5, txt, cex = cex.cor * (1 + r)/2)
}
panel.hist <- function(x, ...) {
usr <- par("usr")
on.exit(par(usr))
par(usr = c(usr[1:2], 0, 1.5))
h <- hist(x, plot = FALSE)
breaks <- h$breaks
nB <- length(breaks)
y <- h$counts
y <- y/max(y)
rect(breaks[-nB], 0, breaks[-1], y, col = "white", ...)
}
pairs(mtcars[, 1:5], upper.panel = panel.cor, diag.panel = panel.hist, lower.panel = panel.smooth)
mcor <- cor(mtcars)
corrplot(mcor)
# Correlation matrix with colored squares and black, rotated labels
corrplot(mcor, method = "shade", shade.col = NA, tl.col = "black", tl.srt = 45)
# create a three-dimensional (3D) scatter plot.
library(rgl)
plot3d(mtcars$wt, mtcars$disp, mtcars$mpg, type = "s", size = 0.75, lit = FALSE)
# add vertical segments to help give a sense of the spatial positions of the
# points
interleave <- function(v1, v2) as.vector(rbind(v1, v2))
# Plot the points
plot3d(mtcars$wt, mtcars$disp, mtcars$mpg, xlab = "Weight", ylab = "Displacement",
zlab = "MPG", size = 0.75, type = "s", lit = FALSE)
# Add the segments
segments3d(interleave(mtcars$wt, mtcars$wt), interleave(mtcars$disp, mtcars$disp),
interleave(mtcars$mpg, min(mtcars$mpg)), alpha = 0.4, col = "blue")
Scattter plot with jitter rugs,spikes and density
x <- rnorm(1000, 50, 30)
y <- 3 * x + rnorm(1000, 0, 20)
require(Hmisc)
plot(x, y)
# scat1d adds tick marks (bar codes. rug plot) on any of the four sides of
# an existing plot, corresponding with non-missing values of a vector x.
scat1d(x, col = "red") # density bars on top of graph
scat1d(y, 4, col = "blue") # density bars at right
plot(x, y, pch = 19)
histSpike(x, add = TRUE, col = "green4", lwd = 2)
histSpike(y, 4, add = TRUE, col = "blue", lwd = 2)
histSpike(x, type = "density", col = "red", add = TRUE) # smooth density at bottom
histSpike(y, 4, type = "density", col = "red", add = TRUE)
barplot(BOD$demand, names.arg = BOD$Time)
# Using the table function
barplot(table(mtcars$cyl))
qplot(BOD$Time, BOD$demand, geom = "bar", stat = "identity")
# Conisdering facotr
qplot(factor(BOD$Time), BOD$demand, geom = "bar", stat = "identity")
# cyl is continuous here
qplot(mtcars$cyl)
# Treat cyl as discrete
qplot(factor(mtcars$cyl))
# Bar graph of values. This uses the BOD data frame, with the 'Time' column
# for x values and the 'demand' column for y values.
ggplot(BOD, aes(x = Time, y = demand)) + geom_bar(stat = "identity")
ggplot(mtcars, aes(x = factor(cyl))) + geom_bar(fill = "white", color = "black")
# Specify approximate number of bins with breaks
ggplot(mtcars, aes(x = mpg)) + geom_histogram(binwidth = 4, fill = "white",
colour = "black")
# Change the x axis origin using origin parameter
ggplot(mtcars, aes(x = mpg)) + geom_histogram(binwidth = 4, fill = "white",
colour = "black", origin = 20)
Histograms of multiple groups of data
library(MASS)
ggplot(heightweight, aes(x = heightIn)) + geom_histogram(fill = "white", colour = "black") +
facet_grid(sex ~ .)
hw <- heightweight
# Using plyr and revalue() to change the names on sex variable
library(plyr)
hw$sex <- revalue(hw$sex, c(f = "Female", m = "Male"))
# Using facetting
ggplot(hw, aes(x = heightIn)) + geom_histogram(fill = "white", colour = "black") +
facet_grid(sex ~ .)
ggplot(hw, aes(x = heightIn, y = ..density.., fill = sex)) + geom_histogram(position = "identity",
alpha = 0.4) + theme_bw() + geom_density(alpha = 0.3)
Negative and Positive Bar plot
csub <- subset(climate, Source == "Berkeley" & Year >= 1900)
head(csub)
csub$pos <- csub$Anomaly10y >= 0
ggplot(csub, aes(x = Year, y = Anomaly10y, fill = pos)) + geom_bar(stat = "identity",
color = "black", position = "identity")
Error Bar plot in ggplot2
myd <- data.frame(X = c(1:12, 1:12), Y = c(8, 12, 13, 18, 22, 16, 24, 29, 34,
15, 8, 6, 9, 10, 12, 18, 26, 28, 28, 30, 20, 10, 9, 9), group = rep(c("X-Group",
"Y-group"), each = 12), error = rep(c(2.5, 3), each = 12))
plt = ggplot(data = myd, aes(x = X, y = Y, fill = group, width = 0.8)) + geom_errorbar(aes(ymin = Y,
ymax = Y + error, width = 0.2), position = position_dodge(width = 0.8)) +
geom_bar(stat = "identity", position = position_dodge(width = 0.8)) + geom_bar(stat = "identity",
position = position_dodge(width = 0.8), colour = "black", legend = FALSE) +
scale_fill_manual(values = c("grey70", "white")) + scale_x_discrete("X",
limits = c(1:12)) + scale_y_continuous("Y (units)", expand = c(0, 0), limits = c(0,
40), breaks = seq(0, 40, by = 5)) + ggtitle("My nice plot") + theme_bw() +
theme(plot.title = element_text(face = "bold", size = 14), axis.title.x = element_text(face = "bold",
size = 12), axis.title.y = element_text(face = "bold", size = 12, angle = 90),
panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
axis.text.y = element_text(angle = 90, hjust = 0.5), legend.title = element_blank(),
legend.position = c(0.85, 0.85), legend.key.size = unit(1.5, "lines"),
legend.key = element_rect())
plt
# Using the ToothGrowth dataset Formula syntax
boxplot(len ~ supp, data = ToothGrowth)
# Put interaction of two variables on x-axis
boxplot(len ~ supp + dose, data = ToothGrowth)
ggplot(ToothGrowth, aes(x = supp, y = len)) + geom_boxplot()
# Adding notches
ggplot(ToothGrowth, aes(x = supp, y = len)) + geom_boxplot(notch = TRUE)
# Adding mean
ggplot(ToothGrowth, aes(x = supp, y = len)) + geom_boxplot() + stat_summary(fun.y = "mean",
geom = "point", shape = 24, size = 4, fill = "white")
# Using three separate vectors
ggplot(ToothGrowth, aes(x = interaction(supp, dose), y = len)) + geom_boxplot()
Violin plots are a way of comparing multiple data distributions
# Use the heightweight datasets
p <- ggplot(heightweight, aes(x = sex, y = heightIn))
p + geom_violin(trim = FALSE, adjuts = 2) + geom_boxplot(width = 0.1, fill = "Grey",
outlier.colour = NA) + theme_bw() + stat_summary(fun.y = "mean", geom = "point",
shape = 24, size = 4, fill = "white")
curve(x^3 - 5 * x, from = -4, to = 4)
# Plot a user-defined function
myfun <- function(xvar) {
1/(1 + exp(-xvar + 10))
}
curve(myfun(x), from = 0, to = 20)
# Add a line:
curve(1 - myfun(x), add = TRUE, col = "red")
# This sets the x range from 0 to 20
ggplot(data.frame(x = c(0, 20)), aes(x = x)) + stat_function(fun = myfun, geom = "line")
Making Density Plot of Two-Dimensional Data
p <- ggplot(faithful, aes(x = eruptions, y = waiting))
p + geom_point() + stat_density2d()
p + stat_density2d(aes(colour = ..level..))
p + stat_density2d(aes(fill = ..density..), geom = "raster", contour = FALSE)
# With points, and map density estimate to alpha
p + geom_point() + stat_density2d(aes(alpha = ..density..), geom = "tile", contour = FALSE)
Plotting Pie Charts
library(RColorBrewer)
slices <- c(10, 12, 4, 16, 8)
lbls <- c("IN", "AK", "ID", "MA", "MO")
pie(slices, labels = lbls, main = "Pie Chart of Countries", col = brewer.pal(7,
"Set1"))
Pie Chart with Percentages
slices <- c(10, 12, 4, 16, 8)
lbls <- c("IN", "AK", "ID", "MA", "MO")
pct <- round(slices/sum(slices) * 100)
lbls <- paste(lbls, pct) # add percents to labels
lbls <- paste(lbls, "%", sep = "") # ad % to labels
pie(slices, labels = lbls, col = rainbow(length(lbls)), main = "Pie Chart of US States")
3D Pie chart
library(plotrix)
slices <- c(10, 12, 4, 16, 8)
lbls <- c("IN", "AK", "ID", "MA", "MO")
pie3D(slices, labels = lbls, explode = 0.1, main = "Pie Chart of Countries ",
col = brewer.pal(7, "Set1"))
A dendrogram is the fancy word that we use to name a tree diagram to display the groups formed by hierarchical clustering.
library(corrgram)
R <- cor(mtcars)
# default corrgram
corrgram(R)
# corrgram with pie charts
corrgram(R, order = TRUE, lower.panel = panel.shade, upper.panel = panel.pie,
text.panel = panel.txt, main = "mtcars Data")
The package ellipse provides the function plotcorr() that helps us to visualize correlations. plotcorr() uses ellipse-shaped glyphs for each entry of the correlation matrix. Here's the default plot using our matrix of R:
# default corrgram
library(ellipse)
plotcorr(R)
# colored corrgram
plotcorr(R, col = colorRampPalette(c("firebrick3", "white", "navy"))(10))
Another colored corrgram
plotcorr(R, col = colorRampPalette(c("#E08214", "white", "#8073AC"))(10), type = "lower")
# prepare hierarchical cluster
hc = hclust(dist(mtcars))
plot(hc, hang = -1) ## labels at the same level
An alternative way to produce dendrograms is to specifically convert hclust objects into dendrograms objects.
# using dendrogram objects
hcd = as.dendrogram(hc)
# alternative way to get a dendrogram
plot(hcd)
Having an object of class dendrogram, we can also plot the branches in a triangular form.
# using dendrogram objects
plot(hcd, type = "triangle")
library(ape)
# plot basic tree
plot(as.phylo(hc), cex = 0.9, label.offset = 1)
# fan
plot(as.phylo(hc), type = "fan")
# add colors randomly
plot(as.phylo(hc), type = "fan", tip.color = hsv(runif(15, 0.65, 0.95), 1, 1,
0.7), edge.color = hsv(runif(10, 0.65, 0.75), 1, 1, 0.7), edge.width = runif(20,
0.5, 3), use.edge.length = TRUE, col = "gray80")
Triple heat map plot
library(reshape2)
library(grid)
library(ggplot2)
# X axis quantitaive ggplot data
datfx <- data.frame(indv = factor(paste("ID", 1:20, sep = ""), levels = rev(paste("ID",
1:20, sep = ""))), matrix(sample(LETTERS[1:7], 80, T), ncol = 4))
# converting data to long form for ggplot2 use
datf1x <- melt(datfx, id.var = "indv")
plotx <- ggplot(datf1x, aes(indv, variable)) + geom_tile(aes(fill = value),
colour = "white") + scale_fill_manual(values = terrain.colors(7)) + scale_x_discrete(expand = c(0,
0))
px <- plotx
# Y axis quantitaive ggplot data
datfy <- data.frame(indv = factor(paste("ID", 21:40, sep = ""), levels = rev(paste("ID",
21:40, sep = ""))), matrix(sample(LETTERS[7:10], 100, T), ncol = 5))
# converting data to long form for ggplot2 use
datf1y <- melt(datfy, id.var = "indv")
ploty <- ggplot(datf1y, aes(variable, indv)) + geom_tile(aes(fill = value),
colour = "white") + scale_fill_manual(values = c("cyan4", "midnightblue",
"green2", "lightgreen")) + scale_x_discrete(expand = c(0, 0))
py <- ploty + theme(legend.position = "left", axis.title = element_blank())
# plot XY quantative fill
datfxy <- data.frame(indv = factor(paste("ID", 1:20, sep = ""), levels = rev(paste("ID",
1:20, sep = ""))), matrix(rnorm(400, 50, 10), ncol = 20))
names(datfxy) <- c("indv", paste("ID", 21:40, sep = ""))
datfxy <- melt(datfxy, id.var = "indv")
levels(datfxy$variable) <- rev(paste("ID", 21:40, sep = ""))
pxy <- plotxy <- ggplot(datfxy, aes(indv, variable)) + geom_tile(aes(fill = value),
colour = "white") + scale_fill_gradient(low = "red", high = "yellow") +
theme(axis.title = element_blank())
# Define layout for the plots (2 rows, 2 columns)
layt <- grid.layout(nrow = 2, ncol = 2, heights = c(6/8, 2/8), widths = c(2/8,
6/8), default.units = c("null", "null"))
# View the layout of plots
grid.show.layout(layt)
# Draw plots one by one in their positions
grid.newpage()
pushViewport(viewport(layout = layt))
print(py, vp = viewport(layout.pos.row = 1, layout.pos.col = 1))
print(pxy, vp = viewport(layout.pos.row = 1, layout.pos.col = 2))
print(px, vp = viewport(layout.pos.row = 2, layout.pos.col = 2))
Mosaic plot for categorical data
myd <- data.frame(fact1 = sample(c("A", "B", "C", "D"), 200, replace = TRUE),
fact2 = sample(c("HL", "PS", "DS"), 200, replace = TRUE), fact3 = sample(c("Male",
"Female"), 200, replace = TRUE))
# plot vcd package is for visualization of categorical data
require(vcd)
mytable <- table(myd)
mosaic(mytable, shade = TRUE, legend = TRUE)