Â
geom_
and stat_
functions
stat_bin
is the default stat for the geom_histogram
, geom_bar
, and geom_freqpoly
geom_smooth
calls stat_smooth
with the default arguments based on the data.geom_smooth
, like lm
.bindot
, binhex
and bin2d
and contour
# ggplot2 is already loaded
# Explore the mtcars data frame with str()
str(mtcars)
## 'data.frame': 32 obs. of 11 variables:
## $ mpg : num 21 21 22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 ...
## $ cyl : num 6 6 4 6 8 6 8 4 4 6 ...
## $ disp: num 160 160 108 258 360 ...
## $ hp : num 110 110 93 110 175 105 245 62 95 123 ...
## $ drat: num 3.9 3.9 3.85 3.08 3.15 2.76 3.21 3.69 3.92 3.92 ...
## $ wt : num 2.62 2.88 2.32 3.21 3.44 ...
## $ qsec: num 16.5 17 18.6 19.4 17 ...
## $ vs : num 0 0 1 1 0 1 0 1 1 1 ...
## $ am : num 1 1 1 0 0 0 0 0 0 0 ...
## $ gear: num 4 4 4 3 3 3 3 4 4 4 ...
## $ carb: num 4 4 1 1 2 1 4 2 2 4 ...
# A scatter plot with LOESS smooth:
ggplot(mtcars, aes(x = wt, y = mpg)) +
geom_point() +
geom_smooth()
# A scatter plot with an ordinary Least Squares linear model:
ggplot(mtcars, aes(x = wt, y = mpg)) +
geom_point() +
geom_smooth(method = "lm")
# The previous plot, without CI ribbon:
ggplot(mtcars, aes(x = wt, y = mpg)) +
geom_point() +
geom_smooth(method = "lm", se = F)
# The previous plot, without points:
ggplot(mtcars, aes(x = wt, y = mpg)) +
geom_smooth(method = "lm", se = F)
# Define cyl as a factor variable
## In this ggplot command our smooth is calculated for each subgroup
## because there is an invisible aesthetic, group which inherits from col.
ggplot(mtcars, aes(x = wt, y = mpg, col = factor(cyl))) +
geom_point() +
stat_smooth(method = "lm", se = F)
# Setting the group to 1 in the aes() will override the groups
ggplot(mtcars, aes(x = wt, y = mpg, col = factor(cyl))) +
geom_point() +
stat_smooth(method = "lm", se = F) +
stat_smooth(aes(group = 1), method = "lm", se = F)
# Plot 1: change the LOESS span
## The span determines the size of the window used to fit the model
ggplot(mtcars, aes(x = wt, y = mpg)) +
geom_point() +
geom_smooth(se = F, span = 0.7)
# Plot 2: Set the overall model to LOESS and use a span of 0.7
ggplot(mtcars, aes(x = wt, y = mpg, col = factor(cyl))) +
geom_point() +
stat_smooth(method = "lm", se = F) +
stat_smooth(
aes(group = 1),
method = "loess",
span = 0.7,
se = F,
col = "black")
# Plot 3: Set col to "All", inside the aes layer of stat_smooth()
## This adds it to the legend with label "All"
ggplot(mtcars, aes(x = wt, y = mpg, col = factor(cyl))) +
geom_point() +
stat_smooth(method = "lm", se = F) +
stat_smooth(
aes(group = 1, col = "All"),
method = "loess",
se = F,
span = 0.7)
# Plot 4: Add scale_color_manual to change the colors
myColors <- c(brewer.pal(3, "Dark2"), "black")
ggplot(mtcars, aes(x = wt, y = mpg, col = factor(cyl))) +
geom_point() +
stat_smooth(method = "lm", se = F, span = 0.75) +
stat_smooth(
aes(group = 1, col="All"),
method = "loess",
se = F,
span = 0.7) +
scale_color_manual("Cylinders", values = myColors)
# Plot 1: Jittered scatter plot, add a linear model (lm) smooth:
ggplot(Vocab, aes(x = education, y = vocabulary)) +
geom_jitter(alpha = 0.2) +
stat_smooth(method = "lm", se = F)
# Plot 2: Only lm, colored by year
ggplot(Vocab, aes(x = education, y = vocabulary, col = factor(year))) +
stat_smooth(method = "lm", se = F)
# Plot 3: Set a color brewer palette
## This will give a warning and not work becuse the default palette
## "Blues" only has 9 colors, but we have 16 years
ggplot(Vocab,
aes(x = education, y = vocabulary, col = factor(year))) +
stat_smooth(method = "lm", se = F) +
scale_color_brewer()
# Plot 4: Add the group, specify alpha and size
ggplot(Vocab,
aes(x = education, y = vocabulary, col = year, group = factor(year))) +
stat_smooth( method = "lm",
se = F,
alpha = 0.6,
size = 2) +
scale_color_gradientn(colors = brewer.pal(9,"YlOrRd"))
# Use stat_quantile instead of stat_smooth:
## The resulting plot will be a mess, because there are three quartiles drawn by default.
ggplot(Vocab,
aes(x = education, y = vocabulary,
col = year, group = factor(year))) +
stat_quantile(alpha = 0.6, size = 2) +
scale_color_gradientn(colors = brewer.pal(9,"YlOrRd"))
# Set quantile to just 0.5:
ggplot(Vocab,
aes(x = education, y = vocabulary,
col = year, group = factor(year))) +
stat_quantile(quantiles = 0.5, alpha = 0.6, size = 2) +
scale_color_gradientn(colors = brewer.pal(9,"YlOrRd"))
# Plot with linear and loess model
p <- ggplot(Vocab, aes(x = education, y = vocabulary)) +
stat_smooth(method = "loess", aes(col = "red"), se = F) +
stat_smooth(method = "lm", aes(col = "blue"), se = F) +
scale_color_discrete("Model", labels = c("red" = "LOESS", "blue" = "lm"))
# Add stat_sum
p + stat_sum()
# Add stat_sum and set size range
p + stat_sum() + scale_size(range = c(1,10))
# Display structure of mtcars
str(mtcars)
## 'data.frame': 32 obs. of 11 variables:
## $ mpg : num 21 21 22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 ...
## $ cyl : num 6 6 4 6 8 6 8 4 4 6 ...
## $ disp: num 160 160 108 258 360 ...
## $ hp : num 110 110 93 110 175 105 245 62 95 123 ...
## $ drat: num 3.9 3.9 3.85 3.08 3.15 2.76 3.21 3.69 3.92 3.92 ...
## $ wt : num 2.62 2.88 2.32 3.21 3.44 ...
## $ qsec: num 16.5 17 18.6 19.4 17 ...
## $ vs : num 0 0 1 1 0 1 0 1 1 1 ...
## $ am : num 1 1 1 0 0 0 0 0 0 0 ...
## $ gear: num 4 4 4 3 3 3 3 4 4 4 ...
## $ carb: num 4 4 1 1 2 1 4 2 2 4 ...
# Convert cyl and am to factors:
mtcars$cyl <- as.factor(mtcars$cyl)
mtcars$am <- as.factor(mtcars$am)
# Define positions:
posn.d <- position_dodge(width = 0.1)
posn.jd <- position_jitterdodge(jitter.width = 0.1, dodge.width = 0.2)
posn.j <- position_jitter(width = 0.2)
# base layers:
wt.cyl.am <- ggplot(mtcars, aes(cyl, wt, col = am, fill = am, group = am))
# wt.cyl.am, posn.d, posn.jd and posn.j are available
# Plot 1: Jittered, dodged scatter plot with transparent points
wt.cyl.am +
geom_point(position = posn.jd, alpha = 0.6)
# Plot 2: Mean and SD - the easy way
wt.cyl.am +
geom_point(
position = posn.jd,
alpha = 0.6
) +
stat_summary(
fun.data = mean_sdl,
fun.args = list(mult = 1),
position = posn.d
)
# Plot 3: Mean and 95% CI - the easy way
wt.cyl.am +
geom_point(
position = posn.jd,
alpha = 0.6) +
stat_summary(
fun.data = mean_cl_normal,
position = posn.d)
# Plot 4: Mean and SD - with T-tipped error bars - fill in ___
wt.cyl.am +
stat_summary(
geom = "point",
fun.y = mean,
position = posn.d) +
stat_summary(
geom = "errorbar",
fun.data = mean_sdl,
fun.args = list(mult = 1),
position = posn.d,
width = 0.1)
# Play vector xx is available
xx <- 1:100
mean_sdl(xx, mult = 1)
## y ymin ymax
## 1 50.5 21.48851 79.51149
# Function to save range for use in ggplot
gg_range <- function(x) {
data.frame(ymin = min(x), # Min
ymax = max(x)) # Max
}
gg_range(xx)
## ymin ymax
## 1 1 100
# Function to Custom function:
med_IQR <- function(x) {
data.frame(y = median(x), # Median
ymin = quantile(x)[2], # 1st quartile
ymax = quantile(x)[4]) # 3rd quartile
}
med_IQR(xx)
## y ymin ymax
## 25% 50.5 25.75 75.25
# The base ggplot command, you don't have to change this
wt.cyl.am <- ggplot(mtcars, aes(x = cyl,y = wt, col = am, fill = am, group = am))
# Add three stat_summary calls to wt.cyl.am
wt.cyl.am +
stat_summary(
geom = "linerange",
fun.data = med_IQR,
position = posn.d,
size = 3) +
stat_summary(
geom = "linerange",
fun.data = gg_range,
position = posn.d,
size = 3,
alpha = 0.4) +
stat_summary(
geom = "point",
fun.y = median,
position = posn.d,
size = 3,
col = "black",
shape = "X")
 Â
# Basic ggplot() command, coded for you
p <- ggplot(mtcars, aes(x = wt, y = hp, col = am)) + geom_point() + geom_smooth()
# Add scale_x_continuous
## The wrong way to zoom in. You lose data and the stats will dissapear or be incorrect.
p + scale_x_continuous(limits = c(3,6), expand = c(0,0))
# The proper way to zoom in:
p + coord_cartesian(xlim = c(3,6))
# Complete basic scatter plot function
base.plot <- ggplot(iris, aes(x = Sepal.Length, y = Sepal.Width, col = Species)) +
geom_jitter() +
geom_smooth(method = "lm", se = F)
# Plot base.plot: default aspect ratio
base.plot
# Fix aspect ratio (1:1) of base.plot
base.plot + coord_equal()
# Create stacked bar plot: thin.bar
thin.bar <- ggplot(mtcars, aes(x = 1, fill = cyl)) +
geom_bar()
thin.bar
# Convert thin.bar to pie chart
thin.bar + coord_polar(theta = "y")
# Create stacked bar plot: wide.bar
wide.bar <- ggplot(mtcars, aes(x = 1, fill = cyl)) +
geom_bar(width = 1)
wide.bar
# Convert wide.bar to pie chart
wide.bar + coord_polar(theta = "y")
# Basic scatter plot:
p <- ggplot(mtcars, aes(x = wt, y = mpg)) +
geom_point()
# Separate rows according to transmission type, am
p + facet_grid(am ~ .)
# Separate columns according to cylinders, cyl
p + facet_grid(. ~ cyl)
# Separate by both columns and rows
p + facet_grid(am ~ cyl)
# Code to create the cyl_am col and myCol vector
mtcars$cyl_am <- paste(mtcars$cyl, mtcars$am, sep = "_")
myCol <- rbind(brewer.pal(9, "Blues")[c(3,6,8)],
brewer.pal(9, "Reds")[c(3,6,8)])
# Basic scatter plot, add color scale:
ggplot(mtcars, aes(x = wt, y = mpg, col = cyl_am)) +
geom_point() +
scale_color_manual(values = myCol)
# Facet according on rows and columns.
ggplot(mtcars, aes(x = wt, y = mpg, col = cyl_am)) +
geom_point() +
scale_color_manual(values = myCol) +
facet_grid(gear ~ vs)
# Add more variables
ggplot(mtcars, aes(x = wt, y = mpg, col = cyl_am, size = disp)) +
geom_point() +
scale_color_manual(values = myCol) +
facet_grid(gear ~ vs)
head(mamsleep)
## vore name sleep time
## 1 omni Owl monkey total 17.0
## 2 herbi Mountain beaver total 14.4
## 3 omni Greater short-tailed shrew total 14.9
## 4 herbi Cow total 4.0
## 5 herbi Three-toed sloth total 14.4
## 6 carni Northern fur seal total 8.7
str(mamsleep)
## 'data.frame': 112 obs. of 4 variables:
## $ vore : chr "omni" "herbi" "omni" "herbi" ...
## $ name : chr "Owl monkey" "Mountain beaver" "Greater short-tailed shrew" "Cow" ...
## $ sleep: Factor w/ 2 levels "total","rem": 1 1 1 1 1 1 1 1 1 1 ...
## $ time : num 17 14.4 14.9 4 14.4 8.7 10.1 5.3 9.4 10 ...
# Basic scatter plot
ggplot(mamsleep, aes(time, name, col = sleep)) +
geom_point()
# Facet rows accoding to vore
ggplot(mamsleep, aes(time, name, col = sleep)) +
geom_point() +
facet_grid(vore ~ .)
# Specify scale and space arguments to free up rows
ggplot(mamsleep, aes(time, name, col = sleep)) +
geom_point() +
facet_grid(vore ~ ., scale = "free_y", space = "free_y")
 Â
element_text()
element_line()
element_rect()
## I have to recreate the base plot object myself
## It is already provided in the exercise.
## This is close enough to the one they have.
z <- ggplot(mtcars, aes(wt, mpg, col = cyl)) +
geom_point() +
stat_smooth(method = "lm", se = F) +
facet_grid(. ~ cyl)
myPink <- "#FEE0D2"
# Plot 1: change the plot background color to myPink:
z + theme(
plot.background = element_rect(fill = myPink))
# Plot 2: adjust the border to be a black line of size 3
z + theme(
plot.background = element_rect(
fill = myPink,
color = "black",
size = 3))
# Plot 3: set panel.background, legend.key, legend.background and strip.background to element_blank()
## You can call theme multiple times as I do here
## And setting up theme elements in advance is common
uniform_panels <- theme(panel.background = element_blank(),
legend.key = element_blank(),
legend.background=element_blank(),
strip.background = element_blank())
z <- z +
theme(
plot.background = element_rect(
fill = myPink,
color = "black",
size = 3)) +
uniform_panels
z
# Extend z with theme() function and three arguments
z <- z + theme(
panel.grid = element_blank(),
axis.line = element_line(color = "black"),
axis.ticks = element_line(color = "black")
)
z
myRed <- "#99000D"
# Extend z with theme() function and four arguments
z <- z + theme(
strip.text = element_text(size = 16, color = myRed),
axis.title.y = element_text(color = myRed, hjust = 0, face = "italic"),
axis.title.x = element_text(color = myRed, hjust = 0, face = "italic"),
axis.text = element_text(color = "black")
)
z
# Move legend by position
z + theme(
legend.position = c(0.85, 0.85)
)
# Change direction
z + theme(
legend.direction = "horizontal"
)
# Change location by name
z + theme(
legend.position = "bottom"
)
# Remove legend entirely
z + theme(
legend.position = "none"
)
library(grid)
# Increase spacing between facets
z + theme(
panel.spacing.x = unit(2, "cm")
)
# Add code to remove any excess plot margin space
z + theme(
panel.spacing.x = unit(2, "cm"),
plot.margin = unit(c(0,0,0,0), "cm")
)
z2 <- ggplot(mtcars, aes(wt, mpg, col = cyl)) +
geom_point() +
stat_smooth(method = "lm", se = F) +
facet_grid(. ~ cyl)
# Theme layer saved as an object, theme_pink
theme_pink <- theme(panel.background = element_blank(),
legend.key = element_blank(),
legend.background = element_blank(),
strip.background = element_blank(),
plot.background = element_rect(fill = myPink, color = "black", size = 3),
panel.grid = element_blank(),
axis.line = element_line(color = "black"),
axis.ticks = element_line(color = "black"),
strip.text = element_text(size = 16, color = myRed),
axis.title.y = element_text(color = myRed, hjust = 0, face = "italic"),
axis.title.x = element_text(color = myRed, hjust = 0, face = "italic"),
axis.text = element_text(color = "black"),
legend.position = "none")
# Apply theme_pink to z2
z2
z2 + theme_pink
# Change code so that old theme is saved as old
## Theme_update can be used to update a theme
## or to store the current setting in a variable for backup
old <- theme_update()
theme_update(panel.background = element_blank(),
legend.key = element_blank(),
legend.background = element_blank(),
strip.background = element_blank(),
plot.background = element_rect(fill = myPink, color = "black", size = 3),
panel.grid = element_blank(),
axis.line = element_line(color = "black"),
axis.ticks = element_line(color = "black"),
strip.text = element_text(size = 16, color = myRed),
axis.title.y = element_text(color = myRed, hjust = 0, face = "italic"),
axis.title.x = element_text(color = myRed, hjust = 0, face = "italic"),
axis.text = element_text(color = "black"),
legend.position = "none")
# Display the plot z2
z2
# Restore the old plot
theme_set(old)
z2
# Load ggthemes package
library(ggthemes)
# Apply theme_tufte
z2 + theme_tufte()
# Apply theme_tufte, modified:
z2 + theme_tufte() +
theme(
legend.position = c(0.9, 0.9),
legend.title = element_text(face = "italic", size = 12),
axis.title = element_text(face = "bold", size = 14)
)
 Â
# Base layers
m <- ggplot(mtcars, aes(x = cyl, y = wt))
# Draw dynamite plot
m +
stat_summary(fun.y = mean, geom = "bar", fill = "skyblue") +
stat_summary(fun.data = mean_sdl, fun.args = list(mult = 1), geom = "errorbar", width = 0.1)
# Base layers
m <- ggplot(mtcars, aes(x = cyl,y = wt, col = am, fill = am))
# Plot 1: Draw dynamite plot
m +
stat_summary(
fun.y = mean,
geom = "bar") +
stat_summary(
fun.data = mean_sdl,
fun.args = list(mult = 1),
geom = "errorbar",
width = 0.1)
# Plot 2: Set position dodge in each stat function
m +
stat_summary(
fun.y = mean,
geom = "bar",
position = "dodge") +
stat_summary(
fun.data = mean_sdl,
fun.args = list(mult = 1),
geom = "errorbar",
width = 0.1,
position = "dodge")
# Set your dodge posn manually
posn.d <- position_dodge(0.9)
# Plot 3: Redraw dynamite plot
m +
stat_summary(
fun.y = mean,
geom = "bar",
position = posn.d) +
stat_summary(
fun.data = mean_sdl,
fun.args = list(mult = 1),
geom = "errorbar",
width = 0.1,
position = posn.d)
## Some provided summary data
mtcars.cyl
## cyl wt.avg sd n prop
## 1 4 2.285727 0.5695637 11 0.34375
## 2 6 3.117143 0.3563455 7 0.21875
## 3 8 3.999214 0.7594047 14 0.43750
# Base layers
m <- ggplot(mtcars.cyl, aes(x = cyl, y = wt.avg))
# Plot 1: Draw bar plot
m + geom_bar(stat = "identity", fill = "skyblue")
# Plot 2: Add width aesthetic
m +
geom_bar(
stat = "identity",
fill = "skyblue",
aes(width = prop))
# Plot 3: Add error bars
m +
geom_bar(
stat = "identity",
fill = "skyblue",
aes(width = prop)) +
geom_errorbar(
aes(ymin = wt.avg - sd,
ymax = wt.avg + sd),
width = 0.1)
position = 'fill'
# Convert bar chart to pie chart
ggplot(mtcars, aes(x = cyl, fill = factor(am))) +
geom_bar(position = "fill")
# Convert bar chart to pie chart
ggplot(mtcars, aes(x = factor(1), fill = factor(am))) +
geom_bar(position = "fill", width = 1) +
facet_grid(. ~ cyl) +
coord_polar(theta = "y")
# Parallel coordinates plot using GGally
library(GGally)
# All columns except am
group_by_am <- 9
my_names_am <- (1:11)[-group_by_am]
# Basic parallel plot - each variable plotted as a z-score transformation
ggparcoord(mtcars, my_names_am, groupColumn = group_by_am, alpha = 0.8)
mtcars2 <- mtcars %>%
select(mpg, disp, drat, wt, qsec)
GGally::ggpairs(mtcars2)
mtcars3 <- mtcars %>%
select(mpg, cyl, disp, hp, drat)
GGally::ggpairs(mtcars3)
cyl
increases, so to does the disp
str(barley)
## 'data.frame': 120 obs. of 4 variables:
## $ yield : num 27 48.9 27.4 39.9 33 ...
## $ variety: Factor w/ 10 levels "Svansota","No. 462",..: 3 3 3 3 3 3 7 7 7 7 ...
## $ year : Factor w/ 2 levels "1932","1931": 2 2 2 2 2 2 2 2 2 2 ...
## $ site : Factor w/ 6 levels "Grand Rapids",..: 3 6 4 5 1 2 3 6 4 5 ...
# Create color palette
myColors <- brewer.pal(9, "Reds")
# Build the heat map from scratch
ggplot(barley, aes(year, variety, fill = yield)) +
geom_tile() +
facet_wrap( ~ site, ncol = 1) +
scale_fill_gradientn(colors = myColors)
# Line plots
ggplot(barley, aes(year, yield, col = variety, group = variety)) +
geom_line() +
facet_wrap( ~ site, nrow = 1)
# Create overlapping ribbon plot from scratch
ggplot(barley, aes(year, yield, colour = site, group = site, fill = site)) +
stat_summary(fun.y = mean, geom = "line") +
stat_summary(
fun.data = mean_sdl,
fun.args = list(mult = 1),
geom = "ribbon",
col = NA,
alpha = 0.1)
 Â
RBMI
: BMI Category descriptionBMI_P
: BMI valueRACEHPR2
: raceSRSEX
: sexSRAGE_P
: ageMARIT2
: Marital statusAB1
: General Health ConditionASTCUR
: Current Asthma StatusAB51
: Type I or Type II DiabetesPOVLL
: Poverty levelsummary(adult)
## RBMI BMI_P RACEHPR2 SRSEX
## Min. :1.000 Min. :13.39 Min. :1.000 Min. :1.000
## 1st Qu.:2.000 1st Qu.:22.88 1st Qu.:5.000 1st Qu.:1.000
## Median :3.000 Median :25.76 Median :6.000 Median :2.000
## Mean :2.773 Mean :26.81 Mean :5.118 Mean :1.603
## 3rd Qu.:3.000 3rd Qu.:29.45 3rd Qu.:6.000 3rd Qu.:2.000
## Max. :4.000 Max. :93.72 Max. :6.000 Max. :2.000
## SRAGE_P MARIT2 AB1 ASTCUR
## Min. :18.00 Min. :1.000 Min. :1.000 Min. :1.000
## 1st Qu.:45.00 1st Qu.:1.000 1st Qu.:2.000 1st Qu.:2.000
## Median :58.00 Median :1.000 Median :2.000 Median :2.000
## Mean :56.95 Mean :2.027 Mean :2.519 Mean :1.916
## 3rd Qu.:70.00 3rd Qu.:3.000 3rd Qu.:3.000 3rd Qu.:2.000
## Max. :85.00 Max. :4.000 Max. :5.000 Max. :2.000
## AB51 POVLL
## Min. :-1.0000 Min. :1.000
## 1st Qu.:-1.0000 1st Qu.:2.000
## Median :-1.0000 Median :4.000
## Mean :-0.6993 Mean :3.214
## 3rd Qu.:-1.0000 3rd Qu.:4.000
## Max. : 3.0000 Max. :4.000
str(adult)
## 'data.frame': 10000 obs. of 10 variables:
## $ RBMI : num 3 3 3 2 3 4 3 2 3 3 ...
## $ BMI_P : num 28.9 26.1 25.1 25 25.1 ...
## $ RACEHPR2: num 6 6 6 6 6 6 6 6 6 6 ...
## $ SRSEX : num 1 2 1 1 1 2 1 2 1 2 ...
## $ SRAGE_P : num 32 80 71 39 75 53 42 33 67 52 ...
## $ MARIT2 : num 1 3 1 4 1 1 1 1 3 3 ...
## $ AB1 : num 1 1 2 1 2 3 2 2 1 5 ...
## $ ASTCUR : num 2 2 1 2 2 1 2 2 2 2 ...
## $ AB51 : num -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 ...
## $ POVLL : num 4 4 4 4 4 4 4 3 4 4 ...
# Age histogram
ggplot(adult, aes(x = SRAGE_P)) +
geom_histogram()
# BMI histogram
ggplot(adult, aes(x = BMI_P)) +
geom_histogram()
# Age colored by BMI, default binwidth
ggplot(adult, aes(x = SRAGE_P, fill= factor(RBMI))) +
geom_histogram(binwidth = 1)
# Remove individual aboves 84
adult <- adult[adult$SRAGE_P <= 84, ]
# Remove individuals with a BMI below 16 and above or equal to 52
adult <- adult[adult$BMI_P >= 16 & adult$BMI_P < 52, ]
# Relabel the race variable:
adult$RACEHPR2 <- factor(adult$RACEHPR2, labels = c("Latino", "Asian", "African American", "White"))
str(adult)
## 'data.frame': 9491 obs. of 10 variables:
## $ RBMI : num 3 3 3 2 3 4 3 2 3 3 ...
## $ BMI_P : num 28.9 26.1 25.1 25 25.1 ...
## $ RACEHPR2: Factor w/ 4 levels "Latino","Asian",..: 4 4 4 4 4 4 4 4 4 4 ...
## $ SRSEX : num 1 2 1 1 1 2 1 2 1 2 ...
## $ SRAGE_P : num 32 80 71 39 75 53 42 33 67 52 ...
## $ MARIT2 : num 1 3 1 4 1 1 1 1 3 3 ...
## $ AB1 : num 1 1 2 1 2 3 2 2 1 5 ...
## $ ASTCUR : num 2 2 1 2 2 1 2 2 2 2 ...
## $ AB51 : num -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 ...
## $ POVLL : num 4 4 4 4 4 4 4 3 4 4 ...
table(adult$RACEHPR2)
##
## Latino Asian African American White
## 1356 696 476 6963
# Relabel the BMI categories variable:
adult$RBMI <- factor(adult$RBMI, labels = c("Under-weight", "Normal-weight", "Over-weight", "Obese"))
str(adult)
## 'data.frame': 9491 obs. of 10 variables:
## $ RBMI : Factor w/ 4 levels "Under-weight",..: 3 3 3 2 3 4 3 2 3 3 ...
## $ BMI_P : num 28.9 26.1 25.1 25 25.1 ...
## $ RACEHPR2: Factor w/ 4 levels "Latino","Asian",..: 4 4 4 4 4 4 4 4 4 4 ...
## $ SRSEX : num 1 2 1 1 1 2 1 2 1 2 ...
## $ SRAGE_P : num 32 80 71 39 75 53 42 33 67 52 ...
## $ MARIT2 : num 1 3 1 4 1 1 1 1 3 3 ...
## $ AB1 : num 1 1 2 1 2 3 2 2 1 5 ...
## $ ASTCUR : num 2 2 1 2 2 1 2 2 2 2 ...
## $ AB51 : num -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 ...
## $ POVLL : num 4 4 4 4 4 4 4 3 4 4 ...
table(adult$RBMI)
##
## Under-weight Normal-weight Over-weight Obese
## 176 3834 3295 2186
# The dataset adult is available
# The color scale used in the plot
BMI_fill <- scale_fill_brewer("BMI Category", palette = "Reds")
# Theme to fix category display in faceted plot
fix_strips <- theme(
strip.text.y = element_text(angle = 0, hjust = 0, vjust = 0.1, size = 14),
strip.background = element_blank(),
legend.position = "none")
# Histogram, add BMI_fill and customizations
ggplot(adult, aes (x = SRAGE_P, fill= factor(RBMI))) +
geom_histogram(binwidth = 1) +
fix_strips +
BMI_fill +
facet_grid(RBMI ~ .) +
theme_classic()
# Plot 1 - Count histogram
ggplot(adult, aes (x = SRAGE_P, fill= factor(RBMI))) +
geom_histogram(binwidth = 1) +
BMI_fill
# Plot 2 - Density histogram
## This plot looks really strange, because we get the density within each BMI category, not within each age group!
ggplot(adult, aes(x = SRAGE_P, fill= factor(RBMI))) +
geom_histogram(aes(y = ..density..), binwidth = 1) +
BMI_fill
# Plot 3 - Faceted count histogram
ggplot(adult, aes (x = SRAGE_P, fill= factor(RBMI))) +
geom_histogram(binwidth = 1) +
BMI_fill +
facet_grid(RBMI ~ .)
# Plot 4 - Faceted density histogram
## Plots 3 and 4 can be useful if we are interested in the frequency distribution within each BMI category.
ggplot(adult, aes(x = SRAGE_P, fill= factor(RBMI))) +
geom_histogram(aes(y = ..density..), binwidth = 1) +
BMI_fill +
facet_grid(RBMI ~ .)
# Plot 5 - Density histogram with position = "fill"
## This is not an accurate representation, as density calculates the proportion across category, and not across bin.
ggplot(adult, aes(x = SRAGE_P, fill= factor(RBMI))) +
geom_histogram(aes(y = ..density..), binwidth = 1, position = "fill") +
BMI_fill
# Plot 6 - The accurate histogram
ggplot(adult, aes(x = SRAGE_P, fill= factor(RBMI))) +
geom_histogram(aes(y = ..count../sum(..count..)), binwidth = 1, position = "fill") +
BMI_fill
# An attempt to facet the accurate frequency histogram from before (failed)
ggplot(adult, aes (x = SRAGE_P, fill= factor(RBMI))) +
geom_histogram(aes(y = ..count../sum(..count..)), binwidth = 1, position = "fill") +
BMI_fill +
facet_grid(RBMI ~ .)
# Create DF with table()
DF <- table(adult$RBMI, adult$SRAGE_P)
# Use apply on DF to get frequency of each group
DF_freq <- apply(DF, 2, function(x) x/sum(x))
# Load reshape2 and use melt on DF to create DF_melted
library(reshape2)
DF_melted <- melt(DF_freq)
# Change names of DF_melted
names(DF_melted) <- c("FILL","X","value")
head(DF_melted)
## FILL X value
## 1 Under-weight 18 0.06172840
## 2 Normal-weight 18 0.61728395
## 3 Over-weight 18 0.17283951
## 4 Obese 18 0.14814815
## 5 Under-weight 19 0.02564103
## 6 Normal-weight 19 0.52564103
# Add code to make this a faceted plot
ggplot(DF_melted, aes(x = X, y = value, fill = FILL)) +
geom_bar(stat = "identity", position = "stack") +
BMI_fill +
facet_grid(FILL ~ .)
# The initial contingency table
DF <- as.data.frame.matrix(table(adult$SRAGE_P, adult$RBMI))
head(DF)
## Under-weight Normal-weight Over-weight Obese
## 18 5 50 14 12
## 19 2 41 17 18
## 20 3 29 10 8
## 21 4 39 7 11
## 22 3 26 8 11
## 23 2 25 9 10
# Add the columns groupsSum, xmax and xmin. Remove groupSum again.
DF$groupSum <- rowSums(DF)
DF$xmax <- cumsum(DF$groupSum)
DF$xmin <- DF$xmax - DF$groupSum
head(DF)
## Under-weight Normal-weight Over-weight Obese groupSum xmax xmin
## 18 5 50 14 12 81 81 0
## 19 2 41 17 18 78 159 81
## 20 3 29 10 8 50 209 159
## 21 4 39 7 11 61 270 209
## 22 3 26 8 11 48 318 270
## 23 2 25 9 10 46 364 318
# The groupSum column needs to be removed, don't remove this line
DF$groupSum <- NULL
# Copy row names to variable X
DF$X <- row.names(DF)
head(DF)
## Under-weight Normal-weight Over-weight Obese xmax xmin X
## 18 5 50 14 12 81 0 18
## 19 2 41 17 18 159 81 19
## 20 3 29 10 8 209 159 20
## 21 4 39 7 11 270 209 21
## 22 3 26 8 11 318 270 22
## 23 2 25 9 10 364 318 23
# Melt the dataset
library(reshape2)
DF_melted <- melt(DF, id.vars = c("X","xmin","xmax"), variable.name = "FILL")
head(DF_melted)
## X xmin xmax FILL value
## 1 18 0 81 Under-weight 5
## 2 19 81 159 Under-weight 2
## 3 20 159 209 Under-weight 3
## 4 21 209 270 Under-weight 4
## 5 22 270 318 Under-weight 3
## 6 23 318 364 Under-weight 2
# dplyr call to calculate ymin and ymax - don't change
library(dplyr)
DF_melted <- DF_melted %>%
group_by(X) %>%
mutate(ymax = cumsum(value/sum(value)),
ymin = ymax - value/sum(value))
head(DF_melted)
## # A tibble: 6 x 7
## # Groups: X [6]
## X xmin xmax FILL value ymax ymin
## <chr> <dbl> <dbl> <fctr> <int> <dbl> <dbl>
## 1 18 0 81 Under-weight 5 0.06172840 0
## 2 19 81 159 Under-weight 2 0.02564103 0
## 3 20 159 209 Under-weight 3 0.06000000 0
## 4 21 209 270 Under-weight 4 0.06557377 0
## 5 22 270 318 Under-weight 3 0.06250000 0
## 6 23 318 364 Under-weight 2 0.04347826 0
# Plot rectangles - don't change.
library(ggthemes)
ggplot(DF_melted, aes(ymin = ymin,
ymax = ymax,
xmin = xmin,
xmax = xmax,
fill = FILL)) +
geom_rect(colour = "white") +
scale_x_continuous(expand = c(0,0)) +
scale_y_continuous(expand = c(0,0)) +
BMI_fill +
theme_tufte()
# Perform chi.sq test (RBMI and SRAGE_P)
results <- chisq.test(table(adult$RBMI, adult$SRAGE_P))
results
##
## Pearson's Chi-squared test
##
## data: table(adult$RBMI, adult$SRAGE_P)
## X-squared = 378.6, df = 198, p-value = 1.931e-13
results$residuals[,1:10]
##
## 18 19 20 21
## Under-weight 2.854106485 0.460288841 2.152649235 2.697357459
## Normal-weight 3.020707716 1.690807688 1.958493927 2.892472457
## Over-weight -2.662849666 -1.936925866 -1.766182317 -3.080781585
## Obese -1.541043156 0.008203243 -1.036134578 -0.813631157
##
## 22 23 24 25
## Under-weight 2.236349030 1.241872738 2.034722161 -1.009907720
## Normal-weight 1.501067838 1.488794558 -0.088601537 1.438842895
## Over-weight -2.122445456 -1.744109717 -0.559516661 -1.165843940
## Obese -0.016699710 -0.182759892 0.226927696 -0.187624620
##
## 26 27
## Under-weight -0.055443101 0.036373551
## Normal-weight 0.203017942 0.653249719
## Over-weight 0.047489025 -0.953875423
## Obese -0.311437552 0.295652269
# Melt results$residuals and store as resid
resid <- melt(results$residuals)
# Change names of resid
names(resid)
## [1] "Var1" "Var2" "value"
names(resid) <- c("FILL","X","residual")
names(resid)
## [1] "FILL" "X" "residual"
# merge the two datasets:
head(resid)
## FILL X residual
## 1 Under-weight 18 2.8541065
## 2 Normal-weight 18 3.0207077
## 3 Over-weight 18 -2.6628497
## 4 Obese 18 -1.5410432
## 5 Under-weight 19 0.4602888
## 6 Normal-weight 19 1.6908077
head(DF_melted)
## # A tibble: 6 x 7
## # Groups: X [6]
## X xmin xmax FILL value ymax ymin
## <chr> <dbl> <dbl> <fctr> <int> <dbl> <dbl>
## 1 18 0 81 Under-weight 5 0.06172840 0
## 2 19 81 159 Under-weight 2 0.02564103 0
## 3 20 159 209 Under-weight 3 0.06000000 0
## 4 21 209 270 Under-weight 4 0.06557377 0
## 5 22 270 318 Under-weight 3 0.06250000 0
## 6 23 318 364 Under-weight 2 0.04347826 0
DF_all <- merge(DF_melted, resid)
head(DF_all)
## X FILL xmin xmax value ymax ymin residual
## 1 18 Normal-weight 0 81 50 0.6790123 0.06172840 3.020707716
## 2 18 Obese 0 81 12 1.0000000 0.85185185 -1.541043156
## 3 18 Over-weight 0 81 14 0.8518519 0.67901235 -2.662849666
## 4 18 Under-weight 0 81 5 0.0617284 0.00000000 2.854106485
## 5 19 Normal-weight 81 159 41 0.5512821 0.02564103 1.690807688
## 6 19 Obese 81 159 18 1.0000000 0.76923077 0.008203243
# Update plot command
library(ggthemes)
ggplot(DF_all, aes(ymin = ymin,
ymax = ymax,
xmin = xmin,
xmax = xmax,
fill = residual)) +
geom_rect() +
scale_fill_gradient2() +
scale_x_continuous(expand = c(0,0)) +
scale_y_continuous(expand = c(0,0)) +
theme_tufte()
# Position for labels on x axis
DF_all$xtext <- DF_all$xmin + (DF_all$xmax - DF_all$xmin)/2
# Position for labels on y axis (don't change)
index <- DF_all$xmax == max(DF_all$xmax)
DF_all$ytext <- DF_all$ymin[index] + (DF_all$ymax[index] - DF_all$ymin[index])/2
head(DF_all)
## X FILL xmin xmax value ymax ymin residual xtext
## 1 18 Normal-weight 0 81 50 0.6790123 0.06172840 3.020707716 40.5
## 2 18 Obese 0 81 12 1.0000000 0.85185185 -1.541043156 40.5
## 3 18 Over-weight 0 81 14 0.8518519 0.67901235 -2.662849666 40.5
## 4 18 Under-weight 0 81 5 0.0617284 0.00000000 2.854106485 40.5
## 5 19 Normal-weight 81 159 41 0.5512821 0.02564103 1.690807688 120.0
## 6 19 Obese 81 159 18 1.0000000 0.76923077 0.008203243 120.0
## ytext
## 1 0.30769231
## 2 0.96153846
## 3 0.75000000
## 4 0.01923077
## 5 0.30769231
## 6 0.96153846
# Plot
ggplot(DF_all,
aes(ymin = ymin, ymax = ymax,
xmin = xmin, xmax = xmax,
fill = residual)) +
geom_rect(col = "white") +
# geom_text for ages (i.e. the x axis)
geom_text(
aes(x = xtext,label = X),
y = 1,
size = 3,
angle = 90,
hjust = 1,
show.legend = FALSE) +
# geom_text for BMI (i.e. the fill axis)
geom_text(
aes(x = max(xmax), y = ytext, label = FILL),
size = 3,
hjust = 1,
show.legend = FALSE) +
scale_fill_gradient2() +
theme_tufte() +
theme(legend.position = "bottom")
# Load all packages
library(ggplot2)
library(reshape2)
library(dplyr)
library(ggthemes)
# Script generalized into a function
mosaicGG <- function(data, X, FILL) {
# Proportions in raw data
DF <- as.data.frame.matrix(table(data[[X]], data[[FILL]]))
DF$groupSum <- rowSums(DF)
DF$xmax <- cumsum(DF$groupSum)
DF$xmin <- DF$xmax - DF$groupSum
DF$X <- row.names(DF)
DF$groupSum <- NULL
DF_melted <- melt(DF, id = c("X", "xmin", "xmax"), variable.name = "FILL")
library(dplyr)
DF_melted <- DF_melted %>%
group_by(X) %>%
mutate(ymax = cumsum(value/sum(value)),
ymin = ymax - value/sum(value))
# Chi-sq test
results <- chisq.test(table(data[[FILL]], data[[X]])) # fill and then x
resid <- melt(results$residuals)
names(resid) <- c("FILL", "X", "residual")
# Merge data
DF_all <- merge(DF_melted, resid)
# Positions for labels
DF_all$xtext <- DF_all$xmin + (DF_all$xmax - DF_all$xmin)/2
index <- DF_all$xmax == max(DF_all$xmax)
DF_all$ytext <- DF_all$ymin[index] + (DF_all$ymax[index] - DF_all$ymin[index])/2
# plot:
g <- ggplot(DF_all, aes(ymin = ymin, ymax = ymax, xmin = xmin,
xmax = xmax, fill = residual)) +
geom_rect(col = "white") +
geom_text(aes(x = xtext, label = X),
y = 1, size = 3, angle = 90, hjust = 1, show.legend = FALSE) +
geom_text(aes(x = max(xmax), y = ytext, label = FILL),
size = 3, hjust = 1, show.legend = FALSE) +
scale_fill_gradient2("Residuals") +
scale_x_continuous("Individuals", expand = c(0,0)) +
scale_y_continuous("Proportion", expand = c(0,0)) +
theme_tufte() +
theme(legend.position = "bottom")
print(g)
}
# BMI described by age
mosaicGG(adult, "SRAGE_P", "RBMI")
# Poverty described by age
mosaicGG(adult, "SRAGE_P", "POVLL")
# mtcars: am described by cyl
mosaicGG(mtcars, "cyl", "am")
# Vocab: vocabulary described by education
library(car)
mosaicGG(Vocab, "education", "vocabulary")