Exploring Data with R

This is a introductory tutorial to get you started with Visualization data and Exploring Data with R. There are some popular books and many online materials i will Provide the links and references at the end of the tutorial.

library(ggplot2)
library(gcookbook)

1. Scatter Plots and line plots

plot(cars$dist~cars$speed, # y~x
     main="Relationship between car distance & speed", #Plot Title
     xlab="Speed (miles per hour)", #X axis title
     ylab="Distance travelled (miles)", #Y axis title
     xlim=c(0,30), #Set x axis limits from 0 to 30 ylim=c(0,140), #Set y axis limits from 0 to  30140  xaxs="i", #Set x axis style as internal 
     yaxs="i", #Set y axis style as internal  
     col="red", #Set the colour of plotting symbol to red 
     pch=19) #Set the plotting symbol to filled dots

plot of chunk unnamed-chunk-2

Let's draw vertical error bars with 5% errors on our cars scatterplot using arrows function


plot(mpg ~ disp, data = mtcars)
arrows(x0 = mtcars$disp, y0 = mtcars$mpg * 0.95, x1 = mtcars$disp, y1 = mtcars$mpg * 
    1.05, angle = 90, code = 3, length = 0.04, lwd = 0.4)

plot of chunk unnamed-chunk-3

How to draw histograms in the top and right margins of a bivariate scatter plot


 layout(matrix(c(2,0,1,3),2,2,byrow=TRUE), widths=c(3,1), heights=c(1,3), TRUE)
 par(mar=c(5.1,4.1,0.1,0))
 plot(cars$dist~cars$speed, # y~x
      xlab="Speed (miles per hour)", #X axis title
      ylab="Distance travelled (miles)", #Y axis title
      xlim=c(0,30), #Set x axis limits from 0 to 30 ylim=c(0,140), #Set y axis limits from 0 to  30140  xaxs="i", #Set x axis style as internal 
      yaxs="i", #Set y axis style as internal  
      col="red", #Set the colour of plotting symbol to red 
      pch=19) #Set the plotting symbol to filled dots

 par(mar=c(0,4.1,3,0))
 hist(cars$speed,ann=FALSE,axes=FALSE,col="black",border="white")

 yhist <- hist(cars$dist,plot=FALSE)
 par(mar=c(5.1,0,0.1,1))
 barplot(yhist$density,
         horiz=TRUE,space=0,axes=FALSE,
         col="black",border="white")

plot of chunk unnamed-chunk-4


Using ggplot library


ggplot(mtcars, aes(x = wt, y = mpg)) + geom_point()

plot of chunk unnamed-chunk-5


## Multiple lines in a plot
plot(pressure$temperature, pressure$pressure, type = "l")
points(pressure$temperature, pressure$pressure)

lines(pressure$temperature, pressure$pressure/2, col = "red")
points(pressure$temperature, pressure$pressure/2, col = "red")

plot of chunk unnamed-chunk-6


ggplot(pressure, aes(x = temperature, y = pressure)) + geom_line()

plot of chunk unnamed-chunk-7


# Lines and points together
ggplot(pressure, aes(x = temperature, y = pressure)) + geom_line() + geom_point()

plot of chunk unnamed-chunk-7


# Showing Lines Along the Axes
ggplot(pressure, aes(x = temperature, y = pressure)) + geom_line() + geom_point() + 
    theme(axis.line = element_line(colour = "black"))

plot of chunk unnamed-chunk-7

# Logarithmic axis
ggplot(pressure, aes(x = temperature, y = pressure)) + geom_line() + geom_point() + 
    theme(axis.line = element_line(colour = "black")) + scale_x_log10() + scale_y_log10()

plot of chunk unnamed-chunk-7

From library(gcookbook) I am using heightweight dataset to group data points by variables, The grouping variable must be categorical—in other words, a factor or character vector.


ggplot(heightweight, aes(x = ageYear, y = heightIn, shape = sex, colour = sex)) + 
    geom_point()

plot of chunk unnamed-chunk-8

# Other shapes and color can be used by scale_shape_manual()
# scale_colour_manual()

# Change shape of points
ggplot(heightweight, aes(x = ageYear, y = heightIn)) + geom_point(shape = 3)

plot of chunk unnamed-chunk-8


# Change point size sex is categorical
ggplot(heightweight, aes(x = ageYear, y = heightIn, shape = sex)) + geom_point(size = 3) + 
    scale_shape_manual(values = c(1, 4))

plot of chunk unnamed-chunk-8


# Represent a third continuous variable using color or size.

ggplot(heightweight, aes(x = weightLb, y = heightIn, fill = ageYear)) + geom_point(shape = 21, 
    size = 2.5) + scale_fill_gradient(low = "black", high = "white", breaks = 12:17, 
    guide = guide_legend())

plot of chunk unnamed-chunk-8

Adding Fitted Regression Model Lines

sp <- ggplot(heightweight, aes(x = ageYear, y = heightIn))
sp + geom_point() + stat_smooth(method = lm)

plot of chunk unnamed-chunk-9


# Adding annotations to regression plot
model <- lm(heightIn ~ ageYear, heightweight)
summary(model)
# First generate prediction data Given a model, predict values of yvar from
# xvar This supports one predictor and one predicted variable xrange: If
# NULL, determine the x range from the model object. If a vector with two
# numbers, use those as the min and max of the prediction range.  samples:
# Number of samples across the x range.  ...: Further arguments to be passed
# to predict()
predictvals <- function(model, xvar, yvar, xrange = NULL, samples = 100, ...) {

    # If xrange isn't passed in, determine xrange from the models.  Different
    # ways of extracting the x range, depending on model type
    if (is.null(xrange)) {
        if (any(class(model) %in% c("lm", "glm"))) 
            xrange <- range(model$model[[xvar]]) else if (any(class(model) %in% "loess")) 
            xrange <- range(model$x)
    }

    newdata <- data.frame(x = seq(xrange[1], xrange[2], length.out = samples))
    names(newdata) <- xvar
    newdata[[yvar]] <- predict(model, newdata = newdata, ...)
    newdata
}


pred <- predictvals(model, "ageYear", "heightIn")
sp <- ggplot(heightweight, aes(x = ageYear, y = heightIn)) + geom_point() + 
    geom_line(data = pred)
sp + annotate("text", label = "r^2 == 0.42", x = 16.5, y = 52, parse = TRUE)

plot of chunk unnamed-chunk-9

Scatter plot matrix and correlation matrix using mtcars dataset and first five variables


library(corrplot)
pairs(mtcars[, 1:5])

plot of chunk unnamed-chunk-10


# Scatter plot with correlations in the upper triangle, smoothing lines in
# the lower triangle, and histograms on the diagonal
panel.cor <- function(x, y, digits = 2, prefix = "", cex.cor, ...) {
    usr <- par("usr")
    on.exit(par(usr))
    par(usr = c(0, 1, 0, 1))
    r <- abs(cor(x, y, use = "complete.obs"))
    txt <- format(c(r, 0.123456789), digits = digits)[1]
    txt <- paste(prefix, txt, sep = "")
    if (missing(cex.cor)) 
        cex.cor <- 0.8/strwidth(txt)
    text(0.5, 0.5, txt, cex = cex.cor * (1 + r)/2)
}
panel.hist <- function(x, ...) {
    usr <- par("usr")
    on.exit(par(usr))
    par(usr = c(usr[1:2], 0, 1.5))
    h <- hist(x, plot = FALSE)
    breaks <- h$breaks
    nB <- length(breaks)
    y <- h$counts
    y <- y/max(y)
    rect(breaks[-nB], 0, breaks[-1], y, col = "white", ...)
}

pairs(mtcars[, 1:5], upper.panel = panel.cor, diag.panel = panel.hist, lower.panel = panel.smooth)

plot of chunk unnamed-chunk-10


mcor <- cor(mtcars)
corrplot(mcor)

plot of chunk unnamed-chunk-10


# Correlation matrix with colored squares and black, rotated labels
corrplot(mcor, method = "shade", shade.col = NA, tl.col = "black", tl.srt = 45)

plot of chunk unnamed-chunk-10


# create a three-dimensional (3D) scatter plot.
library(rgl)
plot3d(mtcars$wt, mtcars$disp, mtcars$mpg, type = "s", size = 0.75, lit = FALSE)

# add vertical segments to help give a sense of the spatial positions of the
# points

interleave <- function(v1, v2) as.vector(rbind(v1, v2))
# Plot the points
plot3d(mtcars$wt, mtcars$disp, mtcars$mpg, xlab = "Weight", ylab = "Displacement", 
    zlab = "MPG", size = 0.75, type = "s", lit = FALSE)
# Add the segments
segments3d(interleave(mtcars$wt, mtcars$wt), interleave(mtcars$disp, mtcars$disp), 
    interleave(mtcars$mpg, min(mtcars$mpg)), alpha = 0.4, col = "blue")

Scattter plot with jitter rugs,spikes and density


x <- rnorm(1000, 50, 30)
y <- 3 * x + rnorm(1000, 0, 20)
require(Hmisc)
plot(x, y)
# scat1d adds tick marks (bar codes. rug plot) on any of the four sides of
# an existing plot, corresponding with non-missing values of a vector x.
scat1d(x, col = "red")  # density bars on top of graph
scat1d(y, 4, col = "blue")  # density bars at right

plot of chunk unnamed-chunk-11


plot(x, y, pch = 19)
histSpike(x, add = TRUE, col = "green4", lwd = 2)
histSpike(y, 4, add = TRUE, col = "blue", lwd = 2)
histSpike(x, type = "density", col = "red", add = TRUE)  # smooth density at bottom
histSpike(y, 4, type = "density", col = "red", add = TRUE)

plot of chunk unnamed-chunk-11

2. Bar graphs and Histograms


barplot(BOD$demand, names.arg = BOD$Time)

plot of chunk unnamed-chunk-12

# Using the table function
barplot(table(mtcars$cyl))

plot of chunk unnamed-chunk-12


qplot(BOD$Time, BOD$demand, geom = "bar", stat = "identity")

plot of chunk unnamed-chunk-12

# Conisdering facotr
qplot(factor(BOD$Time), BOD$demand, geom = "bar", stat = "identity")

plot of chunk unnamed-chunk-12


# cyl is continuous here
qplot(mtcars$cyl)

plot of chunk unnamed-chunk-12


# Treat cyl as discrete
qplot(factor(mtcars$cyl))

plot of chunk unnamed-chunk-12


# Bar graph of values. This uses the BOD data frame, with the 'Time' column
# for x values and the 'demand' column for y values.
ggplot(BOD, aes(x = Time, y = demand)) + geom_bar(stat = "identity")

plot of chunk unnamed-chunk-12

ggplot(mtcars, aes(x = factor(cyl))) + geom_bar(fill = "white", color = "black")

plot of chunk unnamed-chunk-12


# Specify approximate number of bins with breaks
ggplot(mtcars, aes(x = mpg)) + geom_histogram(binwidth = 4, fill = "white", 
    colour = "black")

plot of chunk unnamed-chunk-12


# Change the x axis origin using origin parameter
ggplot(mtcars, aes(x = mpg)) + geom_histogram(binwidth = 4, fill = "white", 
    colour = "black", origin = 20)

plot of chunk unnamed-chunk-12

Histograms of multiple groups of data


library(MASS)
ggplot(heightweight, aes(x = heightIn)) + geom_histogram(fill = "white", colour = "black") + 
    facet_grid(sex ~ .)

plot of chunk unnamed-chunk-13

hw <- heightweight

# Using plyr and revalue() to change the names on sex variable
library(plyr)
hw$sex <- revalue(hw$sex, c(f = "Female", m = "Male"))

# Using facetting
ggplot(hw, aes(x = heightIn)) + geom_histogram(fill = "white", colour = "black") + 
    facet_grid(sex ~ .)

plot of chunk unnamed-chunk-13


ggplot(hw, aes(x = heightIn, y = ..density.., fill = sex)) + geom_histogram(position = "identity", 
    alpha = 0.4) + theme_bw() + geom_density(alpha = 0.3)

plot of chunk unnamed-chunk-13

Negative and Positive Bar plot

csub <- subset(climate, Source == "Berkeley" & Year >= 1900)
head(csub)
csub$pos <- csub$Anomaly10y >= 0
ggplot(csub, aes(x = Year, y = Anomaly10y, fill = pos)) + geom_bar(stat = "identity", 
    color = "black", position = "identity")

plot of chunk unnamed-chunk-14

Error Bar plot in ggplot2


myd <- data.frame(X = c(1:12, 1:12), Y = c(8, 12, 13, 18, 22, 16, 24, 29, 34, 
    15, 8, 6, 9, 10, 12, 18, 26, 28, 28, 30, 20, 10, 9, 9), group = rep(c("X-Group", 
    "Y-group"), each = 12), error = rep(c(2.5, 3), each = 12))

plt = ggplot(data = myd, aes(x = X, y = Y, fill = group, width = 0.8)) + geom_errorbar(aes(ymin = Y, 
    ymax = Y + error, width = 0.2), position = position_dodge(width = 0.8)) + 
    geom_bar(stat = "identity", position = position_dodge(width = 0.8)) + geom_bar(stat = "identity", 
    position = position_dodge(width = 0.8), colour = "black", legend = FALSE) + 
    scale_fill_manual(values = c("grey70", "white")) + scale_x_discrete("X", 
    limits = c(1:12)) + scale_y_continuous("Y (units)", expand = c(0, 0), limits = c(0, 
    40), breaks = seq(0, 40, by = 5)) + ggtitle("My nice plot") + theme_bw() + 
    theme(plot.title = element_text(face = "bold", size = 14), axis.title.x = element_text(face = "bold", 
        size = 12), axis.title.y = element_text(face = "bold", size = 12, angle = 90), 
        panel.grid.major = element_blank(), panel.grid.minor = element_blank(), 
        axis.text.y = element_text(angle = 90, hjust = 0.5), legend.title = element_blank(), 
        legend.position = c(0.85, 0.85), legend.key.size = unit(1.5, "lines"), 
        legend.key = element_rect())

plt

plot of chunk unnamed-chunk-15

3. Box plots

# Using the ToothGrowth dataset Formula syntax
boxplot(len ~ supp, data = ToothGrowth)

plot of chunk unnamed-chunk-16


# Put interaction of two variables on x-axis
boxplot(len ~ supp + dose, data = ToothGrowth)

plot of chunk unnamed-chunk-16

ggplot(ToothGrowth, aes(x = supp, y = len)) + geom_boxplot()

plot of chunk unnamed-chunk-16


# Adding notches
ggplot(ToothGrowth, aes(x = supp, y = len)) + geom_boxplot(notch = TRUE)

plot of chunk unnamed-chunk-16


# Adding mean
ggplot(ToothGrowth, aes(x = supp, y = len)) + geom_boxplot() + stat_summary(fun.y = "mean", 
    geom = "point", shape = 24, size = 4, fill = "white")

plot of chunk unnamed-chunk-16


# Using three separate vectors
ggplot(ToothGrowth, aes(x = interaction(supp, dose), y = len)) + geom_boxplot()

plot of chunk unnamed-chunk-16

Violin plots are a way of comparing multiple data distributions


# Use the heightweight datasets
p <- ggplot(heightweight, aes(x = sex, y = heightIn))
p + geom_violin(trim = FALSE, adjuts = 2) + geom_boxplot(width = 0.1, fill = "Grey", 
    outlier.colour = NA) + theme_bw() + stat_summary(fun.y = "mean", geom = "point", 
    shape = 24, size = 4, fill = "white")

plot of chunk unnamed-chunk-17

4. Plotting curves


curve(x^3 - 5 * x, from = -4, to = 4)

plot of chunk unnamed-chunk-18


# Plot a user-defined function
myfun <- function(xvar) {
    1/(1 + exp(-xvar + 10))
}
curve(myfun(x), from = 0, to = 20)
# Add a line:
curve(1 - myfun(x), add = TRUE, col = "red")

plot of chunk unnamed-chunk-18


# This sets the x range from 0 to 20
ggplot(data.frame(x = c(0, 20)), aes(x = x)) + stat_function(fun = myfun, geom = "line")

plot of chunk unnamed-chunk-18

5. Miscellaneous plots

Making Density Plot of Two-Dimensional Data


p <- ggplot(faithful, aes(x = eruptions, y = waiting))
p + geom_point() + stat_density2d()

plot of chunk unnamed-chunk-19

p + stat_density2d(aes(colour = ..level..))

plot of chunk unnamed-chunk-19


p + stat_density2d(aes(fill = ..density..), geom = "raster", contour = FALSE)

plot of chunk unnamed-chunk-19


# With points, and map density estimate to alpha
p + geom_point() + stat_density2d(aes(alpha = ..density..), geom = "tile", contour = FALSE)

plot of chunk unnamed-chunk-19

Plotting Pie Charts


library(RColorBrewer)
slices <- c(10, 12, 4, 16, 8)
lbls <- c("IN", "AK", "ID", "MA", "MO")
pie(slices, labels = lbls, main = "Pie Chart of Countries", col = brewer.pal(7, 
    "Set1"))

plot of chunk unnamed-chunk-20

Pie Chart with Percentages

slices <- c(10, 12, 4, 16, 8)
lbls <- c("IN", "AK", "ID", "MA", "MO")
pct <- round(slices/sum(slices) * 100)
lbls <- paste(lbls, pct)  # add percents to labels 
lbls <- paste(lbls, "%", sep = "")  # ad % to labels 
pie(slices, labels = lbls, col = rainbow(length(lbls)), main = "Pie Chart of US States")

plot of chunk unnamed-chunk-21

3D Pie chart


library(plotrix)
slices <- c(10, 12, 4, 16, 8)
lbls <- c("IN", "AK", "ID", "MA", "MO")
pie3D(slices, labels = lbls, explode = 0.1, main = "Pie Chart of Countries ", 
    col = brewer.pal(7, "Set1"))

plot of chunk unnamed-chunk-22

A dendrogram is the fancy word that we use to name a tree diagram to display the groups formed by hierarchical clustering.

Using Corrgrams package


library(corrgram)
R <- cor(mtcars)
# default corrgram
corrgram(R)

plot of chunk unnamed-chunk-23

# corrgram with pie charts
corrgram(R, order = TRUE, lower.panel = panel.shade, upper.panel = panel.pie, 
    text.panel = panel.txt, main = "mtcars Data")

plot of chunk unnamed-chunk-23

The package ellipse provides the function plotcorr() that helps us to visualize correlations. plotcorr() uses ellipse-shaped glyphs for each entry of the correlation matrix. Here's the default plot using our matrix of R:


# default corrgram
library(ellipse)
plotcorr(R)

plot of chunk unnamed-chunk-24

# colored corrgram
plotcorr(R, col = colorRampPalette(c("firebrick3", "white", "navy"))(10))

plot of chunk unnamed-chunk-24

Another colored corrgram

plotcorr(R, col = colorRampPalette(c("#E08214", "white", "#8073AC"))(10), type = "lower")

plot of chunk unnamed-chunk-25

Visualizing Dendrograms


# prepare hierarchical cluster
hc = hclust(dist(mtcars))
plot(hc, hang = -1)  ## labels at the same level

plot of chunk unnamed-chunk-26

An alternative way to produce dendrograms is to specifically convert hclust objects into dendrograms objects.


# using dendrogram objects
hcd = as.dendrogram(hc)
# alternative way to get a dendrogram
plot(hcd)

plot of chunk unnamed-chunk-27

Having an object of class dendrogram, we can also plot the branches in a triangular form.

# using dendrogram objects
plot(hcd, type = "triangle")

plot of chunk unnamed-chunk-28

Phylogenetic trees


library(ape)
# plot basic tree
plot(as.phylo(hc), cex = 0.9, label.offset = 1)

plot of chunk unnamed-chunk-29


# fan
plot(as.phylo(hc), type = "fan")

plot of chunk unnamed-chunk-29


# add colors randomly
plot(as.phylo(hc), type = "fan", tip.color = hsv(runif(15, 0.65, 0.95), 1, 1, 
    0.7), edge.color = hsv(runif(10, 0.65, 0.75), 1, 1, 0.7), edge.width = runif(20, 
    0.5, 3), use.edge.length = TRUE, col = "gray80")

plot of chunk unnamed-chunk-29

Triple heat map plot

library(reshape2)
library(grid)
library(ggplot2)

# X axis quantitaive ggplot data
datfx <- data.frame(indv = factor(paste("ID", 1:20, sep = ""), levels = rev(paste("ID", 
    1:20, sep = ""))), matrix(sample(LETTERS[1:7], 80, T), ncol = 4))
# converting data to long form for ggplot2 use
datf1x <- melt(datfx, id.var = "indv")
plotx <- ggplot(datf1x, aes(indv, variable)) + geom_tile(aes(fill = value), 
    colour = "white") + scale_fill_manual(values = terrain.colors(7)) + scale_x_discrete(expand = c(0, 
    0))
px <- plotx

# Y axis quantitaive ggplot data
datfy <- data.frame(indv = factor(paste("ID", 21:40, sep = ""), levels = rev(paste("ID", 
    21:40, sep = ""))), matrix(sample(LETTERS[7:10], 100, T), ncol = 5))
# converting data to long form for ggplot2 use
datf1y <- melt(datfy, id.var = "indv")
ploty <- ggplot(datf1y, aes(variable, indv)) + geom_tile(aes(fill = value), 
    colour = "white") + scale_fill_manual(values = c("cyan4", "midnightblue", 
    "green2", "lightgreen")) + scale_x_discrete(expand = c(0, 0))
py <- ploty + theme(legend.position = "left", axis.title = element_blank())

# plot XY quantative fill
datfxy <- data.frame(indv = factor(paste("ID", 1:20, sep = ""), levels = rev(paste("ID", 
    1:20, sep = ""))), matrix(rnorm(400, 50, 10), ncol = 20))
names(datfxy) <- c("indv", paste("ID", 21:40, sep = ""))
datfxy <- melt(datfxy, id.var = "indv")
levels(datfxy$variable) <- rev(paste("ID", 21:40, sep = ""))
pxy <- plotxy <- ggplot(datfxy, aes(indv, variable)) + geom_tile(aes(fill = value), 
    colour = "white") + scale_fill_gradient(low = "red", high = "yellow") + 
    theme(axis.title = element_blank())



# Define layout for the plots (2 rows, 2 columns)
layt <- grid.layout(nrow = 2, ncol = 2, heights = c(6/8, 2/8), widths = c(2/8, 
    6/8), default.units = c("null", "null"))
# View the layout of plots
grid.show.layout(layt)

plot of chunk unnamed-chunk-30


# Draw plots one by one in their positions
grid.newpage()
pushViewport(viewport(layout = layt))
print(py, vp = viewport(layout.pos.row = 1, layout.pos.col = 1))
print(pxy, vp = viewport(layout.pos.row = 1, layout.pos.col = 2))
print(px, vp = viewport(layout.pos.row = 2, layout.pos.col = 2))

plot of chunk unnamed-chunk-30

Mosaic plot for categorical data


myd <- data.frame(fact1 = sample(c("A", "B", "C", "D"), 200, replace = TRUE), 
    fact2 = sample(c("HL", "PS", "DS"), 200, replace = TRUE), fact3 = sample(c("Male", 
        "Female"), 200, replace = TRUE))


# plot vcd package is for visualization of categorical data
require(vcd)
mytable <- table(myd)
mosaic(mytable, shade = TRUE, legend = TRUE)

plot of chunk unnamed-chunk-31

References

1.R Graphics Cookbook

2.ggplot2 book by Hadley Wickham

3.R graphs examples

4.R Graph cookbook

5.xkcd style graphs