Visualizing and Analyzing Proportions

Summer 2020

Outline

Color, legends, and labeling
Basic principles of part-to-whole
Examples of basic methods to visualize proportions
Additional methods to visualize proportions
Interactive visualizations

Color, Legends, and Labeling

The Scale Function

We can use the scale_? functions to change things like:
- Legend labeling
- Color palettes
- Scaling properties of the variable
These functions generally have three parts to their name:
- ‘scale’
- The visual attribute being modified (x, y, color, fill, shape, size, etc.)
- The type of modification (continuous, discrete, manual, gradient, etc.)

scale Function Examples

For example:
- scale_x_continuous – modifies the visual element that encodes continuous values by positioning along the x-axis
- scale_color_discrete – modifies the visual element that encodes discrete numeric values by assigning them a border color
- scale_fill_manual – modifies the visual element that encodes categorical values by assigning them a fill color
- scale_fill_brewer – modifies fill color using the RColorBrewer library

Cookbook Reference Websites

For information about changing legends, see: http://www.cookbook-r.com/Graphs/Legends_(ggplot2)
For general ggplot2 color information, see: http://www.cookbook-r.com/Graphs/Colors_(ggplot2)
For more information about RColorBrewer, see: http://earlglynn.github.io/RNotes/package/RColorBrewer/index.html

Ugly Plant Growth Legends Example

Ugly Plant Growth Legends Source

library(ggplot2)

# Box plots of plant weight per group; the fill legend is left at its defaults
ggplot(PlantGrowth, aes(x = group, y = weight, fill = group)) +
  geom_boxplot() +
  theme(text = element_text(family = "Times", size = 18))

Better Plant Growth Legends Example

Better Plant Growth Legends Source

library(ggplot2)

# Same box plots, now with a hand-picked palette, a legend title, readable
# legend labels, and cleaned-up axis titles
ggplot(PlantGrowth, aes(x = group, y = weight, fill = group)) +
  geom_boxplot() +
  scale_fill_manual(
    name   = "Experimental\nCondition",
    values = c("#999999", "#E69F00", "#56B4E9"),
    breaks = c("ctrl", "trt1", "trt2"),
    labels = c("Control", "Treatment 1", "Treatment 2")
  ) +
  labs(x = "", y = "Weight (g)") +
  theme(text = element_text(family = "Times", size = 18))

Three Ways to Reverse Legend Labels

# NOTE(review): `myMlotObj` looks like a typo for `myPlotObj`; confirm where the
# plot object is defined before renaming it.

# 1. Reverse the legend through guides()
myMlotObj + guides(fill = guide_legend(reverse = TRUE))

# 2. Reverse the legend through the guide argument of the fill scale
myMlotObj + scale_fill_discrete(guide = guide_legend(reverse = TRUE))

# 3. Supply the breaks of the fill scale in reversed level order
myMlotObj + scale_fill_discrete(breaks = rev(levels(PlantGrowth$group)))

Line Plot Example

Line Plot Source

library(ggplot2)

# Read the cookbook example data; the first line of the file holds column names
df2 <- read.table(
  "http://eecs.ucf.edu/~wiegand/idc6700/datasets/color-cookbook-eg.txt",
  header = TRUE
)

# One line and one set of points per cond2 level, both colored by cond2
ggplot(df2, aes(x = cond1, y = yval)) +
  geom_line(aes(color = cond2, group = cond2)) +
  geom_point(aes(color = cond2), size = 3) +
  theme(text = element_text(family = "Times", size = 18))

Line Plot with Manual Color Palette

## List of 1
##  $ text:List of 11
##   ..$ family       : chr "Times"
##   ..$ face         : NULL
##   ..$ colour       : NULL
##   ..$ size         : num 18
##   ..$ hjust        : NULL
##   ..$ vjust        : NULL
##   ..$ angle        : NULL
##   ..$ lineheight   : NULL
##   ..$ margin       : NULL
##   ..$ debug        : NULL
##   ..$ inherit.blank: logi FALSE
##   ..- attr(*, "class")= chr [1:2] "element_text" "element"
##  - attr(*, "class")= chr [1:2] "theme" "gg"
##  - attr(*, "complete")= logi FALSE
##  - attr(*, "validate")= logi TRUE

Line Plot with Manual Color Palette Source

library(ggplot2)

# Read the cookbook example data; the first line of the file holds column names
df2 = read.table('http://eecs.ucf.edu/~wiegand/idc6700/datasets/color-cookbook-eg.txt', header=TRUE)

# Eight-color manual palette handed to scale_color_manual() below
cbPalette <- c("#999999", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7")

# Lines and points colored by cond2, with a manual palette and legend title
ggplot(df2, aes(x=cond1, y=yval)) + 
    geom_line(aes(color=cond2, group=cond2), size=1.15) +
    geom_point(aes(color=cond2), size=4)  +
    scale_color_manual(values=cbPalette,
                       breaks=c("J","I","K","L"),
                       name="Condition 2") +
    # The trailing `+` on the next line is required: without it theme() is
    # evaluated as a separate statement and printed instead of being applied
    xlab("Condition 1") + ylab("y-Value") +
    theme(text=element_text(size=18,family="Times"))

Line Plot Using RColorBrewer

## List of 1
##  $ text:List of 11
##   ..$ family       : chr "Times"
##   ..$ face         : NULL
##   ..$ colour       : NULL
##   ..$ size         : num 18
##   ..$ hjust        : NULL
##   ..$ vjust        : NULL
##   ..$ angle        : NULL
##   ..$ lineheight   : NULL
##   ..$ margin       : NULL
##   ..$ debug        : NULL
##   ..$ inherit.blank: logi FALSE
##   ..- attr(*, "class")= chr [1:2] "element_text" "element"
##  - attr(*, "class")= chr [1:2] "theme" "gg"
##  - attr(*, "complete")= logi FALSE
##  - attr(*, "validate")= logi TRUE

Line Plot Using RColorBrewer

library(ggplot2)
library(RColorBrewer)

# Read the cookbook example data; the first line of the file holds column names
df2 = read.table('http://eecs.ucf.edu/~wiegand/idc6700/datasets/color-cookbook-eg.txt', header=TRUE)

# Lines and points colored by cond2, using the RColorBrewer "Set1" palette
ggplot(df2, aes(x=cond1, y=yval)) + 
    geom_line(aes(color=cond2, group=cond2), size=1.15) +
    geom_point(aes(color=cond2), size=4)  +
    scale_color_brewer(palette="Set1",
                       breaks=c("J","I","K","L"),
                       name="Condition 2") +
    # The trailing `+` on the next line is required: without it theme() is
    # evaluated as a separate statement and printed instead of being applied
    xlab("Condition 1") + ylab("y-Value") +
    theme(text=element_text(size=18,family="Times"))

Making Your Own Gradient

Making Your Own Gradient Source

library(ggplot2)

# Twenty rows; each column is its own runif(20) draw
myScatterData <- data.frame(
  Meeples  = runif(20),
  Furples  = runif(20),
  Troddles = runif(20)
)

# Point fill encodes Troddles on a yellow-to-red gradient
ggplot(myScatterData, aes(x = Meeples, y = Furples, fill = Troddles)) +
  geom_point(shape = 21, size = 4) +
  scale_fill_gradient(low = "yellow", high = "red") +
  theme(text = element_text(family = "Times", size = 18))

Changing the Size Scale

Changing the Size Scale Source

library(ggplot2)

# Twenty rows; each column is its own runif(20) draw
myScatterData <- data.frame(
  Meeples  = runif(20),
  Furples  = runif(20),
  Troddles = runif(20)
)

# Point size encodes Troddles; the size range is widened to 1-20
ggplot(myScatterData, aes(x = Meeples, y = Furples, size = Troddles)) +
  geom_point() +
  scale_size_continuous(range = c(1, 20)) +
  theme(text = element_text(family = "Times", size = 18))

Basic Principles of Part-to-Whole

Part-to-Whole Patterns

There are a variety of typical part-to-whole patterns:

All values are roughly the same
Differences from one value to the next change by roughly the same amount
Differences from one value to the next decrease
Differences from one value to the next increase
Differences from one value to the next vary significantly
Differences from one value to the next start small, become larger, then small again
One value is exceptionally larger (or smaller) than the others

Part-to-Whole and Ranking Displays

Pie charts and donut charts
Bar graphs
Dot plots
Pareto Charts

Part-to-Whole and Ranking Techniques

Consider grouping categorical items in different ways
Use percentile scales for Pareto charts
Re-express variables if there are scaling issues (e.g., use log scale)
Use line graphs to view ranking changes over time

Examples of Basic Methods

Tour of Examples

Consider the mpg dataset
In particular: proportions for categorical data for car class and drive type
Tour methods we’ve used before to show this same data in different ways

Bar Chart, Stacked Bar Chart
Pie Chart, Donut Chart
Multivariate Fixed-Area Stacked Bar Chart
Multivariate, Trellised Bar Chart
Multivariate, Trellised Pie Chart

Bar Chart

Bar Chart Source

library(ggplot2)

# Count of vehicles in each class; geom_bar() does the counting
ggplot(mpg, aes(x = class)) +
  geom_bar() +
  labs(
    x     = "Vehicle Class",
    y     = "Count",
    title = "Vehicles of Different Classes, 1999-2008"
  ) +
  theme(text = element_text(family = "Times", size = 18))

Stacked Bar Chart

Stacked Bar Chart Source

library(ggplot2)
library(RColorBrewer)

# Interpolate five "Blues" shades into one color per vehicle class
numLevels <- nlevels(factor(mpg$class))
classPalette <- colorRampPalette(brewer.pal(5, "Blues"))(numLevels)

# A single bar (constant x) whose segments are the per-class counts
ggplot(mpg, aes(x = factor(1), fill = class)) +
  geom_bar() +
  scale_fill_manual(values = classPalette) +
  labs(
    x     = "Vehicle Class",
    y     = "Count",
    title = "Vehicles of Different Classes, 1999-2008"
  ) +
  theme(text = element_text(family = "Times", size = 18))

Ordered Bar Chart

Ordered Bar Chart Source

library(ggplot2)
library(dplyr)

# One row per class with the number of vehicles in it
mpg.ordered <- mpg %>%
  group_by(class) %>%
  summarise(Count = n())

# Bars sorted from the most to the least common class
ggplot(mpg.ordered, aes(x = reorder(class, -Count), y = Count)) +
  geom_bar(stat = "identity", fill = "darkblue") +
  labs(
    x     = "Vehicle Class",
    y     = "Count",
    title = "Vehicles of Different Classes, 1999-2008"
  ) +
  theme(text = element_text(family = "Times", size = 18))

Pareto Chart

Pareto Chart Source

library(ggplot2)
library(dplyr)

# Per-class counts, sorted from largest to smallest
totalCount <- nrow(mpg)
mpg.ordered <- mpg %>%
  group_by(class) %>%
  summarise(Count = n()) %>%
  arrange(-Count)

# Percentage is each class's share; Remainder holds the running (cumulative)
# share over the sorted classes
mpg.ordered <- mutate(
  mpg.ordered,
  Percentage = 100 * Count / totalCount,
  Remainder  = 100 * cumsum(Count) / totalCount
)

# Bars show the individual shares; the line and points show the running total
ggplot(mpg.ordered, aes(x = reorder(class, -Count), y = Percentage)) +
  geom_bar(stat = "identity", fill = "darkgray") +
  geom_path(aes(y = Remainder, group = 1), color = "firebrick", size = 1.25) +
  geom_point(aes(y = Remainder), size = 4) +
  labs(
    x     = "Vehicle Class",
    y     = "Percentage",
    title = "Vehicles of Different Classes, 1999-2008"
  ) +
  theme(text = element_text(family = "Times", size = 18))

Pie Chart

Pie Chart Source

library(ggplot2)
library(RColorBrewer)

# Interpolate five "Blues" shades into one color per vehicle class
numLevels <- nlevels(factor(mpg$class))
classPalette <- colorRampPalette(brewer.pal(5, "Blues"))(numLevels)

# A single stacked bar wrapped into a circle by mapping y to the polar angle
ggplot(mpg, aes(x = "", fill = class)) +
  geom_bar() +
  coord_polar(theta = "y") +
  scale_fill_manual(values = classPalette) +
  labs(x = "", y = "", title = "Vehicles of Different Classes, 1999-2008") +
  theme(
    text         = element_text(family = "Times", size = 18),
    axis.ticks.y = element_blank(),
    axis.text.y  = element_blank()
  )

Donut Chart

Donut Chart Source

library(ggplot2)
library(RColorBrewer)
library(reshape2)
library(dplyr)   # for mutate(), used below

# Create the color ramp: one interpolated "Blues" shade per vehicle class
numLevels = length(levels(factor(mpg$class)))
classPalette = colorRampPalette(brewer.pal(5,"Blues"))(numLevels)

# Create the totals for the rectangle y positions: each class occupies the
# interval from the previous cumulative count (ymin) to its own (ymax)
classCounts = mutate(melt(table(mpg$class), value.name="Count"),
                     ymax=cumsum(Count))
classCounts$ymin = c(0, head(classCounts$ymax, -1))

# Plot this thing: rectangles spanning x in [3, 4] become a ring once y is
# mapped to the polar angle; xlim starting at 0 leaves the hole in the middle
ggplot(classCounts, aes(fill=Var1, ymin=ymin, ymax=ymax, xmin=3, xmax=4)) + 
  geom_rect() +  
  coord_polar(theta="y") +
  scale_fill_manual(values=classPalette, name="Car  Class") +
  xlim(c(0,4)) +
  xlab("") +
  ylab("") +
  ggtitle("Vehicles of Different Classes, 1999-2008") +
  theme(text=element_text(size=18,family="Times"),
        axis.ticks.y = element_blank(),
        axis.text.y = element_blank())

Multivariate Fixed-Area Stacked Bar Chart

Multivariate Fixed-Area Stacked Bar Chart Source

library(ggplot2)
library(RColorBrewer)

# Interpolate five "Blues" shades into one color per vehicle class
numLevels <- nlevels(factor(mpg$class))
classPalette <- colorRampPalette(brewer.pal(5, "Blues"))(numLevels)

# position = "fill" stretches every drive-type bar to the same height, so the
# segments show class proportions rather than counts
ggplot(mpg, aes(x = drv, fill = class)) +
  geom_bar(position = "fill") +
  scale_fill_manual(values = classPalette) +
  labs(
    x     = "Drive",
    y     = "Ratio",
    title = "Vehicles of Different Classes & Drives, 1999-2008"
  ) +
  theme(text = element_text(family = "Times", size = 18))

Multivariate, Trellised Bar Chart

Multivariate, Trellised Bar Chart Source

library(ggplot2)

# Human-readable facet titles keyed by the drv codes
labels = c("4"="Four Wheel Drive", "f"="Front Wheel Drive", "r"="Rear Wheel Drive")

# One horizontal bar chart of class counts per drive type, stacked in rows.
# The facet formula is passed positionally because the `facets` argument name
# is deprecated in current ggplot2.
ggplot(mpg, aes(x=class)) + 
  geom_bar() +  coord_flip() +
  facet_grid(drv ~ ., labeller=labeller(drv=labels)) +
  xlab("Vehicle Class") +
  ylab("Count") +
  ggtitle("Vehicles of Different Classes & Drives, 1999-2008") +
  theme(text=element_text(size=18,family="Times"))

Multivariate, Trellised Pie Chart

Multivariate, Trellised Pie Chart Source

library(ggplot2)
library(RColorBrewer)

# Create the color ramp: one interpolated "Blues" shade per vehicle class
numLevels = length(levels(factor(mpg$class)))
classPalette = colorRampPalette(brewer.pal(5,"Blues"))(numLevels)

# Human-readable facet titles keyed by the drv codes
labels = c("4"="Four Wheel Drive", "f"="Front Wheel Drive", "r"="Rear Wheel Drive")

# One pie of class proportions per drive type, laid out in columns.
# The facet formula is passed positionally because the `facets` argument name
# is deprecated in current ggplot2.
ggplot(mpg, aes(x=factor(1),fill=class)) + 
  geom_bar(position="fill") +  coord_polar(theta="y") +
  facet_grid(. ~ drv, labeller=labeller(drv=labels)) + 
  scale_fill_manual(values=classPalette) +
  xlab("Drive") +
  ylab("") +
  ggtitle("Vehicles of Different Classes, 1999-2008") +
  theme(text=element_text(size=18,family="Times"),
        axis.ticks.y = element_blank(),
        axis.text.y = element_blank())

Additional Methods

Fixed-Area, M.V. Stacked Area Chart of %’s

Fixed-Area, Multivariate Stacked Area Chart of Percentages Source

library(ggplot2)
library(RColorBrewer)
library(gcookbook) # for uspopage

# position = "fill" normalizes each year to 1, so the bands show proportions;
# the legend breaks are listed in reversed factor-level order
ggplot(uspopage, aes(x = Year, y = Thousands, fill = AgeGroup)) +
  geom_area(position = "fill", color = "black", size = 0.2, alpha = 0.4) +
  scale_fill_brewer(
    palette = "Blues",
    breaks  = rev(levels(uspopage$AgeGroup))
  ) +
  labs(
    y     = "Proportion of People in Age Group",
    title = "Proportion of People of Different Ages in US, 1900-2000"
  ) +
  theme(text = element_text(family = "Times", size = 18))

Stacked Bar Plots for Likert Data

Stacked Bar Plots for Likert Data Source, 1

library(ggplot2)
library(dplyr)
library(reshape2)
library(RColorBrewer)

# Variables to setup the filter
semester="fall"
year=2017
courseID="DIG5876"
instructorName="Rudolf Wiegand"

# Get the data.  The semester ID advances by 30 per year from a base of 960 in
# 1996, plus a per-term offset; it selects which data file to read.
offset =c("spring"=0, "summer"=10, "fall"=20)
semesterID = 30*(year-1996)+960 + offset[semester]
dataFile = paste0('Data/ucfspi', semesterID, '.data')
spi <- read.delim(dataFile, sep='#', header=FALSE)

# The file is read without a header row, so the columns must be named before
# filter() can refer to InstructorName and CourseID
colnames(spi) <- c("College", "Department", "InstructorEmail", "InstructorName", 
                   "CourseName", "CourseID", "CourseSuffix", "SemesterID", 
                   "n",  "Organization", "Expectations", "Communicating",
                   "Respect", "Interest", "LearningEnv", "PerformanceFeedback",
                   "AchieveObjectives", "OverallEffectiveness", "NotUsed")

# Keep only the rows for the chosen instructor and course
rpwSPI = filter(spi, InstructorName == instructorName & CourseID == courseID)

Stacked Bar Plots for Likert Data Source, 2

# Name the columns of the raw survey table
spiColumnNames <- c(
  "College", "Department", "InstructorEmail", "InstructorName",
  "CourseName", "CourseID", "CourseSuffix", "SemesterID",
  "n", "Organization", "Expectations", "Communicating",
  "Respect", "Interest", "LearningEnv", "PerformanceFeedback",
  "AchieveObjectives", "OverallEffectiveness", "NotUsed"
)
colnames(spi) <- spiColumnNames

# Long form: the first nine columns identify a row, the rest become
# variable/value pairs
rpwSPI.melt <- melt(rpwSPI, id.vars = 1:9)

# Drop the unused column and treat the responses as ordinal data
# (ordered factor with levels 5, 4, 3, 2, 1) rather than numbers
rpwSPI.corrected <- rpwSPI.melt %>%
  filter(variable != "NotUsed") %>%
  mutate(Responses = factor(value, levels = as.character(5:1), ordered = TRUE))

# Number of 1's, 2's, 3's, 4's, and 5's for each question
rpwSPI.counts <- rpwSPI.corrected %>%
  group_by(Responses, variable) %>%
  summarise(Count = n())

# Number of responses per question; the largest is the percentage denominator
rpwSPI.totals <- rpwSPI.corrected %>%
  group_by(variable) %>%
  summarise(Count = n())
overallNN <- max(rpwSPI.totals$Count)

Stacked Bar Plots for Likert Data Source, 3

# Axis labels for the nine survey questions ("\n" forces a line break)
questions <- c(
  "Effectiveness organizing the course",
  "Effectiveness explaining course requirements,\n grading criteria, and expectations",
  "Effectiveness communicating ideas\n and/or information",
  "Effectiveness showing respect and\n concern for student",
  "Effectiveness stimulating interest\n in the course",
  "Effectiveness creating an environment\n that helps students learn",
  "Effectiveness giving useful feedback\n on course performance",
  "Effectiveness helping students\n achieve course objectives",
  "Overall effectiveness of the instructor"
)

# Legend texts, listed from response 5 down to response 1
responseText <- c(
  "Strongly Agree",
  "Agree",
  "No Opinion",
  "Disagree",
  "Strongly Disagree"
)

# Fill colors in the same 5-to-1 order as responseText
likertPalette <- c(
  "#336633",  # Strongly Agree, 5
  "#99CC99",  # Agree, 4
  "#999999",  # No Opinion, 3
  "#FF6666",  # Disagree, 2
  "#FF3333"   # Strongly Disagree, 1
)

Stacked Bar Plots for Likert Data Source, 4

# The plot: one horizontal stacked bar per question, segment lengths given as
# a percentage of overallNN, with white reference lines every 25 points
ggplot(rpwSPI.counts,
       aes(x = variable, y = 100 * Count / overallNN, fill = Responses)) +
  geom_bar(stat = "identity") +
  geom_hline(yintercept = c(0, 25, 50, 75, 100), color = "white", size = 1) +
  scale_fill_manual(
    name   = "",
    values = likertPalette,
    breaks = 5:1,
    labels = responseText
  ) +
  scale_x_discrete(labels = questions) +
  coord_flip() +
  labs(x = "", y = "Percentage") +
  guides(fill = guide_legend(reverse = TRUE)) +
  # theme_bw() comes first so the tweaks in theme() below are layered on top
  theme_bw() +
  theme(
    text               = element_text(family = "Times", size = 18),
    axis.text.y        = element_text(size = 16, hjust = 1),
    legend.position    = "bottom",
    panel.grid.major.x = element_blank(),
    panel.grid.minor.x = element_blank(),
    panel.border       = element_blank(),
    panel.background   = element_blank()
  )

Treemap

Treemap Source

library(portfolio)
# Read the post data, then draw the treemap from its id, views, category
# and comments columns
posts <- read.csv("http://datasets.flowingdata.com/post-data.txt")
with(
  posts,
  map.market(
    id    = id,
    area  = views,
    group = category,
    color = comments,
    main  = "FlowingData Map"
  )
)

Word Cloud

Word Cloud Source

library(tm)
library(wordcloud)

# Build a corpus from the text at the URL
rpwDissertationURL = "http://cs.ucf.edu/~wiegand/ids6938/datasets/rpw-diss.txt"
dissertation = Corpus(URISource(rpwDissertationURL))

# Strip extra white space and change everything to lower case.
# tolower() is a plain character function, so it is wrapped in
# content_transformer() to keep each corpus element a text document.
dissertation = tm_map(dissertation, stripWhitespace)
dissertation = tm_map(dissertation, content_transformer(tolower))

# Remove unnecessary and redundant words with slight suffix variations
dissertation = tm_map(dissertation, removeWords, stopwords("english"))
dissertation = tm_map(dissertation, stemDocument)

# Make sure it is really a text document
dissertation = tm_map(dissertation, PlainTextDocument) 

# Draw at most 150 words, most frequent first, with 35% of them rotated
wordcloud(dissertation,
          scale=c(5,0.5), 
          max.words=150, 
          random.order=FALSE, 
          rot.per=0.35)

Choropleth, v1

Choropleth Source, v1

library(ggplot2)
library(dplyr)
library(mapproj)

# Extract reference data: state boundary polygons
mapstates <- map_data("state")

# Get the electoral data; `region` is the lower-cased State column so it can
# be matched against the map data
elec2008URL = "http://www.electoral-vote.com/evp2008/Pres/Final-2008.csv"
elec2008 = mutate(read.csv(elec2008URL),
                  region=tolower(as.character(State)))

# Merge the data for the choropleth.  merge() does not promise to keep the
# polygon vertices in drawing order, so restore it with the `order` column
# that map_data() supplies.
plotableCountyData = merge(mapstates, elec2008)
plotableCountyData = plotableCountyData[order(plotableCountyData$order), ]

# Fill each state by Obama.Pct on a fixed 0-100 color scale
ggplot(plotableCountyData, aes(long,lat,group=group)) + 
  geom_polygon(aes(fill=Obama.Pct)) + 
  scale_fill_gradient(low="white",high="darkblue", limits=c(0,100)) +
  coord_map(projection="globular") +
  geom_path(data = mapstates, colour = "white", size = .75, alpha = .1) +
  ggtitle("Who Voted for Obama in 2008?")

Choropleth, v2

Choropleth Source, v2

library(ggplot2)
library(dplyr)
library(mapproj)

# Extract reference data: state boundary polygons
mapstates <- map_data("state")

# Get the electoral data; `region` is the lower-cased State column so it can
# be matched against the map data
elec2008URL = "http://www.electoral-vote.com/evp2008/Pres/Final-2008.csv"
elec2008 = mutate(read.csv(elec2008URL),
                  region=tolower(as.character(State)))

# Merge the data for the choropleth.  merge() does not promise to keep the
# polygon vertices in drawing order, so restore it with the `order` column
# that map_data() supplies.
plotableCountyData = merge(mapstates, elec2008)
plotableCountyData = plotableCountyData[order(plotableCountyData$order), ]

# Fill each state by Obama.Pct; the color scale limits are the observed range
# of Obama.Pct rather than a fixed 0-100
ggplot(plotableCountyData, aes(long,lat,group=group)) + 
  geom_polygon(aes(fill=Obama.Pct)) + 
  scale_fill_gradient(low="white",high="darkblue", limits=range(plotableCountyData$Obama.Pct)) +
  coord_map(projection="globular") +
  geom_path(data = mapstates, colour = "white", size = .75, alpha = .1) +
  ggtitle("Who Voted for Obama in 2008?")

Choropleths Are Not Heatmaps

A choropleth is a geographic map in which a numeric value is encoded as the color of each region, showing how a variable relates to geography
A heat map is a tiled data visualization to help show relationships between variables
There are many visualizations that use geometric shapes and colors over regions; most of them are not heatmaps
We’ll learn more about heatmaps later