Visualizing Data with R

Mark Sonnabaum


R


Basics - Syntax

# <- is used for assignment
a <- 10

# Evaluating an expression at the top level auto-prints its value
# (similarly, a function implicitly returns its last expression)
a

## [1] 10


# . is used to delimit words as opposed to _ (it's not a method call):
# as.factor is simply one function name
as.factor(c(1, 3, 4))

## [1] 1 3 4
## Levels: 1 3 4

R has many multi-value data types


Vectors

# Create an integer vector holding 1 through 20
a <- 1:20


# Arithmetic is vectorized: the division is applied to every element
a/2

##  [1]  0.5  1.0  1.5  2.0  2.5  3.0  3.5  4.0  4.5  5.0  5.5  6.0  6.5  7.0
## [15]  7.5  8.0  8.5  9.0  9.5 10.0

Lists

# A list with two named elements, each a numeric vector
mylist <- list(blah = c(1, 2, 3), wtf = c(7, 8, 9))
# Named elements are extracted with '$'
mylist$blah

## [1] 1 2 3


# head() returns the leading elements; this list has only two, so all of it
head(mylist)

## $blah
## [1] 1 2 3
## 
## $wtf
## [1] 7 8 9

Data frames

A data frame holds a list of vectors or factors, all of the same length.

# A data frame with a numeric column and a character column
# (NOTE: before R 4.0 the strings were converted to a factor by default)
df <- data.frame(first = c(1, 2, 3), second = c("foo", "bar", "baz"))

# Show the leading rows (only three exist, so the whole data frame)
head(df)

##   first second
## 1     1    foo
## 2     2    bar
## 3     3    baz

Data frames

# Columns are accessed with '$'; the result is an ordinary vector
df$first

## [1] 1 2 3


# Elements of that vector use the typical [] notation (indices start at 1)…
df$first[1]

## [1] 1

# …but you can do really interesting things with it

# only return items 1 and 3 (index with a vector of positions)
df$first[c(1, 3)]

## [1] 1 3


# return all items except 1 and 3 (negative positions exclude)
df$first[-c(1, 3)]

## [1] 2

Data frames

# return all items where the value is greater than two
# (df$first > 2 is a logical vector; only the TRUE positions are kept)
df$first[df$first > 2]

## [1] 3


# [] can be used on data frames directly like so: [row, column]
df[3, 1]

## [1] 3


# Leaving the column index empty selects every column of row 2
df[2, ]

##   first second
## 2     2    bar


# Get all rows from the data frame where the value of column 'first' is
# greater than 2
df[df$first > 2, ]

##   first second
## 3     3    baz

Iterating

# You could do something like this to build up a vector…
myvector <- vector()
for (var in df$first) {
    myvector <- c(myvector, var + 5)
}
myvector

## [1] 6 7 8


# …but growing a vector with c() copies it on every iteration. The more
# idiomatic way is to apply a function over the elements; vapply() also
# declares the expected result (one numeric value per element), so the
# return type cannot silently change:
myvector <- vapply(df$first, function(var) var + 5, numeric(1))
myvector

## [1] 6 7 8


# …and for plain arithmetic no iteration is needed at all, because
# operators are vectorized:
myvector <- df$first + 5
myvector

## [1] 6 7 8

Plotting

# Boring: base graphics
# plot() on a whole data frame plots its columns against each other
plot(df)
# One bar per value of the 'first' column
barplot(df$first)

Plotting

# Pretty
library(ggplot2)

# Scatter plot; I() marks size as a literal value rather than a mapping
qplot(df$first, df$second, size = I(5))
# Bar chart of the values themselves: "col" takes the bar heights from y.
# (geom = "bar" counts rows per x and rejects a y aesthetic in current
# ggplot2; "col" needs ggplot2 >= 2.2.0.)
qplot(df$second, df$first, fill = df$second, geom = "col")

Plotting

# qplot is simple, but it's worth learning the long form because you'll
# use it more.

# Mappings from data to aesthetics go inside aes(); fixed values such as
# alpha and size are set outside it. (Inside aes(), alpha = 0.5 would be
# treated as a data value: it gets scaled and adds a pointless legend.)
ggplot(df, aes(first, second, colour = second)) +
    geom_point(alpha = 0.5, size = 5)

One-dimensional data


# 200 random values drawn from a standard normal distribution, indexed by x
# (no seed is set, so the plots differ from run to run)
df <- data.frame(x = 1:200, y = rnorm(200))

# Scatter plot of the raw values
ggplot(df, aes(x, y)) + geom_point()

# geom_bar will produce a typical bar chart when given a factor; because
# the bar heights come straight from y, the stat must be "identity"
# (the default stat counts rows and does not accept a y aesthetic)…
ggplot(df, aes(as.factor(x), y)) + geom_bar(stat = "identity")

# …but you usually want a histogram, which bins a continuous variable and
# counts the observations per bin. The bars are filled by their count.
# (ggplot2 >= 3.3 spells the computed variable after_stat(count).)
ggplot(df, aes(y)) + geom_histogram(aes(fill = ..count..))

# If you did want to see the values represented horizontally, the line or
# area geoms are better suited to the task (although still not that
# useful).
ggplot(df, aes(x, y)) + geom_line()
ggplot(df, aes(x, y)) + geom_area()

Two-dimensional data


# Create some fake performance data: two series of 200 normally
# distributed values (mean 1 / sd 0.2, and mean 3 with rnorm()'s default sd).
# No seed is set, so the values printed below differ from run to run.
response.times.apc <- rnorm(200, mean = 1, sd = 0.2)
response.times.noapc <- rnorm(200, mean = 3)
time <- 1:200

# Column names are taken from the variable names
df <- data.frame(time, response.times.apc, response.times.noapc)

head(df)

##   time response.times.apc response.times.noapc
## 1    1             0.8917                3.002
## 2    2             1.1290                3.513
## 3    3             1.0420                3.334
## 4    4             1.1288                3.691
## 5    5             0.5662                3.124
## 6    6             1.2843                2.412


# Reshape from wide to long: "time" is kept as the identifier, and every
# other column becomes rows with its name in 'variable' and its number in
# 'value' (see the output below)
library(reshape2)
df.new <- melt(df, id.vars = "time")
head(df.new)

##   time           variable  value
## 1    1 response.times.apc 0.8917
## 2    2 response.times.apc 1.1290
## 3    3 response.times.apc 1.0420
## 4    4 response.times.apc 1.1288
## 5    5 response.times.apc 0.5662
## 6    6 response.times.apc 1.2843

# You could look at the data as time series…
library(gridExtra)

# One line per variable, distinguished by colour
p1 <- ggplot(df.new, aes(time, value, colour = variable)) + geom_line()
# position = "identity" overlays the two areas instead of stacking them;
# alpha is set outside aes() so the fill really is 50% transparent and no
# alpha legend is drawn
p2 <- ggplot(df.new, aes(time, value, fill = variable)) +
    geom_area(position = "identity", alpha = 0.5)

# Stack the two plots vertically
grid.arrange(p1, p2, nrow = 2)

# …but if the time isn't significant, it can be easier to visualize
# without: a histogram of the values, with the two variables side by side.
# geom_histogram() bins the continuous values; geom_bar() would count each
# distinct value separately.
ggplot(df.new, aes(value, fill = variable)) +
    geom_histogram(position = "dodge")

# BOXPLOTS
# One box per variable; alpha is a fixed setting, so it goes outside aes()
# (inside aes() it would be scaled as data and add a legend)
ggplot(df.new, aes(variable, value, fill = variable)) +
    geom_boxplot(alpha = 0.1)