if (!require(mlba)) {
  library(devtools)
  install_github("gedeck/mlba/mlba", force=TRUE)
}

Data Examples

Example 1: Boston Housing Data

housing.df <- mlba::BostonHousing
head(housing.df, 9)
#>      CRIM   ZN INDUS CHAS   NOX    RM   AGE    DIS RAD TAX PTRATIO LSTAT MEDV
#> 1 0.00632 18.0  2.31    0 0.538 6.575  65.2 4.0900   1 296    15.3  4.98 24.0
#> 2 0.02731  0.0  7.07    0 0.469 6.421  78.9 4.9671   2 242    17.8  9.14 21.6
#> 3 0.02729  0.0  7.07    0 0.469 7.185  61.1 4.9671   2 242    17.8  4.03 34.7
#> 4 0.03237  0.0  2.18    0 0.458 6.998  45.8 6.0622   3 222    18.7  2.94 33.4
#> 5 0.06905  0.0  2.18    0 0.458 7.147  54.2 6.0622   3 222    18.7  5.33 36.2
#> 6 0.02985  0.0  2.18    0 0.458 6.430  58.7 6.0622   3 222    18.7  5.21 28.7
#> 7 0.08829 12.5  7.87    0 0.524 6.012  66.6 5.5605   5 311    15.2 12.43 22.9
#> 8 0.14455 12.5  7.87    0 0.524 6.172  96.1 5.9505   5 311    15.2 19.15 27.1
#> 9 0.21124 12.5  7.87    0 0.524 5.631 100.0 6.0821   5 311    15.2 29.93 16.5
#>   CAT.MEDV
#> 1        0
#> 2        0
#> 3        1
#> 4        1
#> 5        1
#> 6        0
#> 7        0
#> 8        0
#> 9        0

Basic Charts: Bar Charts, Line Charts, and Scatter Plots

Example 2: Ridership on Amtrak Trains

# Amtrak data
library(forecast)
Amtrak.df <- mlba::Amtrak
ridership.ts <- ts(Amtrak.df$Ridership, start=c(1991, 1), end=c(2004, 3), freq=12)

# Boston housing data
housing.df <- mlba::BostonHousing
# compute mean MEDV per CHAS = (0, 1)
MEDV.per.CHAS  <- aggregate(housing.df$MEDV, by=list(housing.df$CHAS), FUN=mean)
names(MEDV.per.CHAS) <- c("CHAS", "MeanMEDV")
MEDV.per.CHAS$CHAS <- factor(MEDV.per.CHAS$CHAS)
# compute % mean CAT.MEDV
CATMEDV.per.CHAS <- aggregate(housing.df$CAT.MEDV, by=list(housing.df$CHAS), FUN=mean)
names(CATMEDV.per.CHAS) <- c("CHAS", "MeanCATMEDV")
CATMEDV.per.CHAS$CHAS <- factor(CATMEDV.per.CHAS$CHAS)
## line chart for the Amtrak data
plot(ridership.ts, xlab="Year", ylab="Ridership (in 000s)", ylim=c(1300, 2300))

## scatter plot with axes names for Boston housing data
plot(housing.df$MEDV ~ housing.df$LSTAT, xlab="LSTAT", ylab="MEDV")

## barchart of CHAS vs. mean MEDV
barplot(MEDV.per.CHAS$MeanMEDV,  names.arg=MEDV.per.CHAS$CHAS,
        xlab="CHAS", ylab="Avg. MEDV")

## barchart of CHAS vs. % CAT.MEDV
barplot(CATMEDV.per.CHAS$MeanCATMEDV * 100,  names.arg=CATMEDV.per.CHAS$CHAS,
        xlab="CHAS", ylab="% of CAT.MEDV")

library(ggplot2)
library(gridExtra)

g1 <- autoplot(ridership.ts) +
  xlab("Year") + ylab("Ridership (in 000)")

## scatter plot with axes names
g2 <- ggplot(housing.df) +
  geom_point(aes(x=LSTAT, y=MEDV), colour="navy", alpha=0.5)

g3 <- ggplot(MEDV.per.CHAS) +
  geom_bar(aes(x=CHAS, y=MeanMEDV, fill=CHAS), stat="identity")

g4 <- ggplot(CATMEDV.per.CHAS) +
  geom_bar(aes(x=CHAS, y=MeanCATMEDV, fill=CHAS), stat="identity") +
  ylab("% of CAT.MEDV")

grid.arrange(g1, g2, g3, g4, ncol=2, nrow=2)

# replace g1 graph with a real ggplot version
g1 <- ggplot(ridership.ts, aes(x=time(ridership.ts), y=ridership.ts)) +
  geom_line() + scale_x_continuous() + scale_y_continuous() +
  xlab("Year") + ylab("Ridership (in 000s)")

g <- arrangeGrob(g1 + theme_bw(),
                 g2 + theme_bw(),
                 g3 + theme_bw() + theme(legend.position="none"),
                 g4 + theme_bw() + theme(legend.position="none"),
                 ncol=2, nrow=2)
ggsave(file=file.path(getwd(), "figures", "chapter_03", "basicPlots.pdf"), g)

Distribution Plots: Boxplots and Histograms

## histogram of MEDV
hist(housing.df$MEDV, xlab="MEDV")

## boxplot of MEDV for different values of CHAS
boxplot(housing.df$MEDV ~ housing.df$CHAS, xlab="CHAS", ylab="MEDV")

ggplot(housing.df) + geom_histogram(aes(x=MEDV), bins=9) + ylab("Count")

ggplot(housing.df) + geom_boxplot(aes(x=as.factor(CHAS), y=MEDV)) + xlab("CHAS")

ggplot(housing.df) + geom_histogram(aes(x=MEDV), bins=9) + ylab("Count")

ggsave(file=file.path(getwd(), "figures", "chapter_03", "Viz-Histogram.pdf"),
       last_plot() + theme_bw(), width=2.5, height=2, units="in")

ggplot(housing.df) + geom_boxplot(aes(x = as.factor(CHAS), y=MEDV)) + xlab("CHAS")

ggsave(file=file.path(getwd(), "figures", "chapter_03", "Viz-Boxplots.pdf"),
       last_plot() + theme_bw(), width=2.5, height=2, units="in")
## side-by-side boxplots
# use par() to split the plots into panels.
par(mfcol=c(1, 4))
boxplot(housing.df$NOX ~ housing.df$CAT.MEDV, xlab="CAT.MEDV", ylab="NOX")
boxplot(housing.df$LSTAT ~ housing.df$CAT.MEDV, xlab="CAT.MEDV", ylab="LSTAT")
boxplot(housing.df$PTRATIO ~ housing.df$CAT.MEDV, xlab="CAT.MEDV", ylab="PTRATIO")
boxplot(housing.df$INDUS ~ housing.df$CAT.MEDV, xlab="CAT.MEDV", ylab="INDUS")

g1 <- ggplot(housing.df, aes(x=factor(CAT.MEDV), y=NOX)) + geom_boxplot()
g2 <- ggplot(housing.df, aes(x=factor(CAT.MEDV), y=LSTAT)) + geom_boxplot()
g3 <- ggplot(housing.df, aes(x=factor(CAT.MEDV), y=PTRATIO)) + geom_boxplot()
g4 <- ggplot(housing.df, aes(x=factor(CAT.MEDV), y=INDUS)) + geom_boxplot()
grid.arrange(g1, g2, g3, g4, ncol=4, nrow=1)

g <- arrangeGrob(g1 + theme_bw() + xlab("CAT.MEDV"),
                 g2 + theme_bw() + xlab("CAT.MEDV"),
                 g3 + theme_bw() + xlab("CAT.MEDV"),
                 g4 + theme_bw() + xlab("CAT.MEDV"),
                 ncol=4, nrow=1)
ggsave(file=file.path(getwd(), "figures", "chapter_03", "Viz-BoxplotPanel.pdf"), g,
       width=10, height=2.5, units="in")

Heatmaps: Visualizing Correlations and Missing Values

## simple heatmap of correlations (without values)
heatmap(cor(housing.df), Rowv=NA, Colv=NA)

## heatmap with values
library(gplots)
heatmap.2(cor(housing.df), Rowv=FALSE, Colv=FALSE, dendrogram="none",
          cellnote=round(cor(housing.df),2),
          notecol="black", key=FALSE, trace="none", margins=c(10,10))

library(reshape) # to generate input for the plot
cor.mat <- round(cor(housing.df), 2) # rounded correlation matrix
melted.cor.mat <- melt(cor.mat)
ggplot(melted.cor.mat, aes(x=X1, y=X2, fill=value)) +
  geom_tile() + xlab("") + ylab("") +
  scale_fill_distiller(palette="RdBu", limits=c(-1, 1))

g <- last_plot() + theme_bw() +
  theme(axis.text.x=element_text(angle=90, vjust=0.5, hjust=1))
ggsave(file=file.path(getwd(), "figures", "chapter_03", "Viz-HeatmapCorr.pdf"),
       g, width=5.5, height=4.5, units="in")
# replace dataFrame with your data.
dataFrame = mlba::NYPDMotorVehicleCollisions

# we treat empty strings as missing values
# is.na() returns a Boolean (TRUE/FALSE) output indicating the location of missing
# values.
# multiplying the Boolean value by 1 converts the output into binary (0/1).
missing = dataFrame
missing[missing == ""] = NA
missing = 1 * is.na(missing)
heatmap(missing, Rowv=NA, Colv=NA)

library(ggplot2)
library(reshape) # to generate input for the plot
melted.missing <- melt(missing)
ggplot(melted.missing, aes(x=X1, y=X2, fill=value)) +
  geom_tile(show.legend=FALSE) +
  scale_fill_gradient(low="white", high="#666666") +
  scale_x_continuous(expand=c(0, 0))

g <- last_plot() + theme_bw() + xlab("") + ylab("") +
  theme(plot.margin=margin(r=10, t=5))
ggsave(file=file.path(getwd(), "figures", "chapter_03", "Viz-HeatmapMissing.pdf"),
       g, width=8, height=4.2, units="in")

Multidimensional Visualization

Adding Variables: Color, Size, Shape, Multiple Panels, and Animation

## color scatter plot
par(xpd=TRUE) # allow legend to be displayed outside of plot area
plot(housing.df$NOX ~ housing.df$LSTAT, ylab="NOX", xlab="LSTAT",
     col=ifelse(housing.df$CAT.MEDV == 1, "blue", "red"))
legend("topleft", inset=c(0, -0.2), legend=c("CAT.MEDV = 1", "CAT.MEDV = 0"),
       col=c("blue", "red"), pch=1, cex=0.5)

## panel plots
# compute mean MEDV per RAD and CHAS
# In aggregate() use argument drop = FALSE to include all combinations
# (exiting and missing) of RAD X CHAS.
data.for.plot <- aggregate(housing.df$MEDV, by=list(housing.df$RAD, housing.df$CHAS),
                           FUN=mean, drop=FALSE)
names(data.for.plot) <- c("RAD", "CHAS", "meanMEDV")
# plot the data
par(mfcol=c(2,1))
barplot(height=data.for.plot$meanMEDV[data.for.plot$CHAS == 0],
        names.arg=data.for.plot$RAD[data.for.plot$CHAS == 0],
        xlab="RAD", ylab="Avg. MEDV", main="CHAS = 0")
barplot(height=data.for.plot$meanMEDV[data.for.plot$CHAS == 1],
        names.arg=data.for.plot$RAD[data.for.plot$CHAS == 1],
        xlab="RAD", ylab="Avg. MEDV", main="CHAS = 1")

g1 <- ggplot(housing.df, aes(x=LSTAT, y=NOX, color=factor(CAT.MEDV))) +
  labs(color="CAT.MEDV") + geom_point(alpha=0.6)

g2 <- ggplot(data.for.plot) +
  geom_bar(aes(x=factor(RAD), y=meanMEDV), stat="identity") +
  labs(x="RAD", y="Avg. MEDV") +
  facet_grid(CHAS ~ .)

grid.arrange(g1, g2, ncol=2)

g <- arrangeGrob(g1 + theme_bw(),
                 g2 + theme_bw() + theme(legend.position="none"),
                 ncol=2, nrow=1)
ggsave(file=file.path(getwd(), "figures", "chapter_03", "Viz-ColorPanel.pdf"), g,
       width=8, height=3, units="in")
## simple plot
# use plot() to generate a matrix of 4X4 panels with variable name on the diagonal,
# and scatter plots in the remaining panels.
plot(housing.df[, c(1, 3, 12, 13)])

library(GGally)
ggpairs(housing.df[, c(1, 3, 12, 13)],
  lower=list(continuous=wrap("points", alpha=0.25, size=0.3)))

ggsave(file=file.path(getwd(), "figures", "chapter_03", "Viz-MatrixScatterplot.pdf"),
       last_plot(), width=4, height=4, units="in")

Manipulations: Rescaling, Aggregation and Hierarchies, Zooming, Filtering

options(scipen=999) # avoid scientific notation

## scatter plot: regular and log scale
plot(housing.df$MEDV ~ housing.df$CRIM, xlab="CRIM", ylab="MEDV")

# to use logarithmic scale set argument log = to either 'x', 'y', or 'xy'.
plot(housing.df$MEDV ~ housing.df$CRIM,
    xlab="CRIM", ylab="MEDV", log="xy")

## boxplot: regular and log scale
boxplot(housing.df$CRIM ~ housing.df$CAT.MEDV,
    xlab="CAT.MEDV", ylab="CRIM")

boxplot(housing.df$CRIM ~ housing.df$CAT.MEDV,
    xlab="CAT.MEDV", ylab="CRIM", log="y")

## scatter plot: regular and log scale
# alternative log-scale plot with ggplot
g1 <- ggplot(housing.df) + geom_point(aes(x=CRIM, y=MEDV))
g2 <- g1 +
  scale_x_log10(breaks=10^(-2:2),
    labels=format(10^(-2:2), scientific=FALSE, drop0trailing=TRUE))
  scale_y_log10(breaks=c(5, 10, 20, 40))
#> <ScaleContinuousPosition>
#>  Range:  
#>  Limits:    0 --    1
## boxplot: regular and log scale
g3 <- ggplot(housing.df, aes(x=factor(CAT.MEDV), y=CRIM)) + geom_boxplot()
g4 <- g3 + scale_y_log10()

grid.arrange(g1, g2, g3, g4, ncol=2)

g <- arrangeGrob(g1+theme_bw(), g2+theme_bw(),
                 g3+theme_bw(), g4+theme_bw(),
                 ncol=2, nrow=2)
ggsave(file=file.path(getwd(), "figures", "chapter_03", "Viz-LogScale.pdf"), g,
       width=5, height=4, units="in")

Aggregation and Hierarchies

library(forecast)
Amtrak.df <- mlba::Amtrak
ridership.ts <- ts(Amtrak.df$Ridership, start=c(1991, 1), end=c(2004, 3), freq=12)
## fit curve
ridership.lm <- tslm(ridership.ts ~ trend + I(trend^2))

plot(ridership.ts, xlab="Year", ylab="Ridership (in 000s)", ylim=c(1300, 2300))
lines(ridership.lm$fitted, lwd=2)

## zoom in, monthly, and annual plots
ridership.2yrs <- window(ridership.ts, start=c(1991,1), end=c(1992,12))
plot(ridership.2yrs, xlab="Year", ylab="Ridership (in 000s)", ylim=c(1300, 2300))

monthly.ridership.ts <- tapply(ridership.ts, cycle(ridership.ts), mean)
plot(monthly.ridership.ts, xlab="Month", ylab="Average ridership",
    ylim=c(1300, 2300), type="l", xaxt="n")
## set x labels
axis(1, at=c(1:12), labels=c("Jan","Feb","Mar", "Apr","May","Jun",
                             "Jul","Aug","Sep",  "Oct","Nov","Dec"))

annual.ridership.ts <- aggregate(ridership.ts, FUN=mean)
plot(annual.ridership.ts, xlab="Year", ylab="Average ridership",
    ylim=c(1300, 2300))

g1 <- autoplot(ridership.ts) + ylim(1300, 2300) +
  labs(x="Year", y="Ridership (in 000s)") +
  geom_smooth(formula=y ~ poly(x, 2), method="lm",
              colour="navy", se=FALSE, na.rm=TRUE) +
  scale_x_continuous(n.breaks=10)

## zoom in, monthly, and annual plots
g2 <- autoplot(window(ridership.ts, start=c(1991,1), end=c(1992,12))) +
  ylim(1300, 2300) +
  labs(x="Year", y="Ridership (in 000s)")

months=c("Jan", "Feb", "Mar", "Apr", "May", "Jun",
         "Jul", "Aug", "Sep", "Oct", "Nov", "Dec")
df <- data.frame(
  monthly.ridership=tapply(ridership.ts, cycle(ridership.ts), mean),
  month=factor(months, levels=months))
g3 <- ggplot(df, aes(x=month, y=monthly.ridership)) + ylim(1300, 2300) +
  geom_line(group=1) +
  labs(x="Month", y="Average ridership")

annual.ridership.ts <- aggregate(ridership.ts, FUN=mean)
g4 <- autoplot(annual.ridership.ts) + ylim(1300, 2300) +
  labs(x="Year", y="Average ridership")
g <- arrangeGrob(g1+theme_bw() + theme(plot.margin=margin(t=5, r=10)),
                 g3+theme_bw() + theme(plot.margin=margin(t=23, r=10)),
                 g2+theme_bw() + theme(plot.margin=margin(t=5, r=10)),
                 g4+theme_bw() + theme(plot.margin=margin(t=5, r=10)),
                 ncol=2, nrow=2)
ggsave(file=file.path(getwd(), "figures", "chapter_03", "Viz-AmtrakTimePlotTS.pdf"), g,
       width=8, height=5, units="in")

Filtering

utilities.df <- mlba::Utilities

plot(utilities.df$Fuel_Cost ~ utilities.df$Sales,
     xlab = "Sales", ylab = "Fuel cost", xlim = c(2000, 20000))
text(x = utilities.df$Sales, y = utilities.df$Fuel_Cost,
     labels = utilities.df$Company, pos = 4, cex = 0.8, srt = 20, offset = 0.2)

library(ggrepel)
ggplot(utilities.df, aes(y = Fuel_Cost, x = Sales)) +
  geom_point() +
  geom_text_repel(aes(label=Company)) +
  labs(x="Sales", y="Fuel cost")

ggsave(file=file.path(getwd(), "figures", "chapter_03", "Viz-LabeledScatter.pdf"),
       last_plot() + theme_bw(), width=5, height=5, units="in")

Reference: Trend Lines and Labels

# use function alpha() in library scales to add transparent colors
library(scales)
universal.df <- mlba::UniversalBank
no_personal_loan <- subset(universal.df, Personal.Loan == 0)
personal_loan <- subset(universal.df, Personal.Loan == 1)
plot(jitter(no_personal_loan$CCAvg, 1) ~ jitter(no_personal_loan$Income, 1),
     col=alpha("lightblue", 0.5), pch=20, xlab="Income", ylab="CCAvg")
points(jitter(personal_loan$CCAvg, 1) ~ jitter(personal_loan$Income, 1),
       col="steelblue", pch=20, xlab="Income", ylab="CCAvg")

ggplot(universal.df, aes(x=Income, y=CCAvg)) +
    geom_jitter(data=no_personal_loan, width=5, height=0.2, alpha=0.5,
                color="lightblue") +
    geom_jitter(data=personal_loan, width=5, height=0.2, color="steelblue") +
    labs(colour="Personal\nLoan")

g = last_plot() + theme_bw() +
  scale_color_brewer(palette="Set1",
    guide=guide_legend(override.aes=list(size=3, alpha=1)))
ggsave(file=file.path(getwd(), "figures", "chapter_03", "Viz-LargeData.pdf"),
       g, width=7, height=5, units="in")

Multivariate Plot: Parallel Coordinates Plot

library(MASS)
par(mfcol = c(2,1))
parcoord(housing.df[housing.df$CAT.MEDV == 0, -14], main = "CAT.MEDV = 0")
parcoord(housing.df[housing.df$CAT.MEDV == 1, -14], main = "CAT.MEDV = 1")

library(GGally)

ggparcoord(subset(housing.df, select=-CAT.MEDV), scale="uniminmax", alpha=0.3) +
  facet_wrap(vars(housing.df$CAT.MEDV), nrow=2) + 
  labs(x="Variable", y="Value")

g = last_plot() + theme_bw() +
  facet_wrap(vars(housing.df$CAT.MEDV), nrow=2,
             labeller=as_labeller(c(`0` = "CAT.MEDV: 0", `1` = "CAT.MEDV: 1")))
ggsave(file=file.path(getwd(), "figures", "chapter_03", "Viz-Parallel.pdf"),
       g, width=7, height=5, units="in")

Interactive Visualization

library(plotly)
g <- ggplot(housing.df) + geom_point(aes(x=CRIM, y=MEDV))
ggplotly(g)

Specialized Visualizations

Visualizing Networked Data

library(igraph)
ebay.df <- mlba::eBayNetwork

# transform node ids to factors
ebay.df[,1] <- as.factor(ebay.df[,1])
ebay.df[,2] <- as.factor(ebay.df[,2])

graph.edges <- as.matrix(ebay.df[,1:2])
g <- graph.edgelist(graph.edges, directed=FALSE)
isBuyer <- sapply(V(g)$name, function(node){ node %in% ebay.df[,2] })

plot(g, vertex.label=NA, vertex.color=ifelse(isBuyer,  "lightblue", "steelblue"),
     vertex.size=ifelse(isBuyer, 7, 10))

set.seed(1)
library(ggnetwork)
g = set_vertex_attr(g, "isBuyer", value=ifelse(isBuyer, "buyer", "seller"))
ggplot(g, aes(x=x, y=y, xend=xend, yend=yend)) +
    geom_edges(color="grey50") +
    geom_nodes(aes(color=isBuyer, size=isBuyer)) +
    scale_color_manual(values = c("buyer"="lightblue", "seller"="steelblue")) +
    scale_size_manual(values = c("buyer"=2, "seller"=5)) +
    theme_blank(legend.position="none")

ggsave(file=file.path(getwd(), "figures", "chapter_03", "Viz-Network.pdf"),
       last_plot(), width=4, height=4, units="in")

Visualizing Hierarchical Data: Treemaps

library(treemapify)
# load data and add column for negative feedback
tree.df <- mlba::eBayTreemap
tree.df$negative.feedback <- 1* (tree.df$Seller.Feedback < 0)
data <- aggregate(. ~ Brand + Category + Sub.Category, data=tree.df, mean)

ggplot(data, aes(area=High.Bid, subgroup=Category, fill=-negative.feedback,
                 label=Brand)) +
    geom_treemap() + geom_treemap_subgroup_border() +
    geom_treemap_subgroup_text(place="topleft", colour="black",
                               min.size=5, size=12) +
    geom_treemap_text(fontface="italic", colour="black", place="centre",
                      min.size=5, size=8) +
    scale_fill_gradient(low="#0396CC", high="#ABDCFF")

g <- last_plot() + theme_blank(legend.position="none")
ggsave(file=file.path(getwd(), "figures", "chapter_03", "Viz-Treemap-eBay.pdf"),
       g, width=7, height=4, units="in")

Visualizing Geographical Data: Map Charts

library(ggmap)
us <- c(left=-125, bottom=24, right=-67, top=50)
SCstudents <- mlba::SCstudents
map <- get_stamenmap(us, zoom=5)
ggmap(map) +
  geom_point(aes(x=longitude, y=latitude), data=SCstudents,
    alpha=0.4, colour="red", size=0.75) +
  labs(x="Longitude", y="Latitude")

g <- last_plot()
ggsave(file=file.path(getwd(), "figures", "chapter_03", "Viz-GoogleMapScom.pdf"),
       g, width=7, height=4, units="in")
library(mosaic)
library(mapproj)

gdp.df <- mlba::GDP
names(gdp.df)[5] <- "GDP2015"
happiness.df <- mlba::Veerhoven

# gdp map
g1 <- mWorldMap(gdp.df, key="Country.Name", fill="GDP2015") + coord_map()

# well-being map
g2 <- mWorldMap(happiness.df, key="Nation", fill="Score") + coord_map() +
  scale_fill_continuous(name="Happiness")
grid.arrange(g1, g2, nrow=2)

g1 <- g1 + scale_fill_continuous(breaks=c(5e12, 10e12, 15e12),
                                 labels=c("5 trillion", "10 trillion", "15 trillion"))
ggsave(file=file.path(getwd(), "figures", "chapter_03", "Viz-Worldmap.pdf"),
       arrangeGrob(g1, g2, nrow=2))