ggplot2: See my presentation slides here (right-click and open the link in a new tab).
ggplot2 in mind: layers of data + aesthetic mappings + geometries
Load some data (simplified Ellis & Yuan 2004 data and manipulated Obarow from Larson-Hall, 2015)
# Load the two example data sets.
# stringsAsFactors = TRUE is spelled out because R >= 4.0 no longer converts
# strings to factors by default; the str() output shown for these data and the
# base plot() calls on gender/treatment rely on those columns being factors.
ell <- read.csv("EllisYuan.csv", stringsAsFactors = TRUE)
obarow <- read.csv("obarow.csv", stringsAsFactors = TRUE)
str(obarow)
## 'data.frame': 67 obs. of 10 variables:
## $ id : int 3 2 6 7 1 10 5 8 11 14 ...
## $ gender : Factor w/ 2 levels "female","male": 2 2 2 2 2 1 2 2 1 1 ...
## $ grade : int 1 1 1 1 1 1 2 2 2 2 ...
## $ treatment: Factor w/ 4 levels "NMNP","NMYP",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ pretest : int 15 11 13 14 13 14 18 16 15 17 ...
## $ posttest : int 14 11 13 15 12 14 16 14 13 16 ...
## $ gain1 : int -1 0 0 1 -1 0 -2 -2 -2 -1 ...
## $ gain2 : int -2 1 0 1 0 5 4 2 1 -1 ...
## $ gain3 : int 0 0 1 3 -1 -1 0 0 1 1 ...
## $ gain4 : int 0 3 1 0 1 -1 0 0 0 1 ...
# Formula interface of base plot(): the display depends on the variable types.
# numeric outcome ~ factor predictor (treatment is a factor in str(obarow))
plot(posttest ~ treatment, data = obarow)
# numeric outcome ~ numeric predictor
plot(posttest ~ pretest, data = obarow)
# factor outcome ~ factor predictor
plot(treatment ~ gender, data = obarow)
The plot function will automatically generate plots suitable for the data type.
ggplot2: Load the library
library(ggplot2)
# Inspect the structure of the Ellis & Yuan data before plotting it.
str(ell)
## 'data.frame': 60 obs. of 2 variables:
## $ condition: Factor w/ 3 levels "NP","OLP","PTP": 1 1 1 1 1 1 1 1 1 1 ...
## $ variety : int 20 15 12 18 9 13 14 14 25 21 ...
Variables to map: variety (y-axis) and condition (x-axis)
ggplot(data = ell, mapping = aes(x = condition, y = variety))
geom_boxplot function
ggplot(data = ell, mapping = aes(x = condition, y = variety)) +
geom_boxplot()
geom_point:
ggplot(data = ell, mapping = aes(x = condition, y = variety)) +
geom_boxplot() +
geom_point()
stat_summary: We tell R to create additional data, the means and 95% CIs (i.e., “mean_cl_normal”), and represent this new data in the form of a “pointrange” (i.e., mean as a dot and CIs as extending lines).
ggplot(data = ell, mapping = aes(x = condition, y = variety)) +
geom_boxplot() +
stat_summary(fun.data = "mean_cl_normal", # the data to plot are mean and CIs
geom = "pointrange") # use the pointrange shape
# Boxplot, raw observations, and mean with CI, all sharing one x/y mapping.
ggplot(ell, aes(x = condition, y = variety)) +
  geom_boxplot() +
  geom_point() +
  stat_summary(
    fun.data = "mean_cl_normal",
    geom = "pointrange"
  )
Once you decide the layout and elements of your plot, you can manipulate attributes to increase the readability and explanatory power.
Option 1: boxplot with jittered data points 1-1) Jitter points with position
# Same layers as before, but the raw points are jittered through the layer's
# position argument so overlapping observations become visible.
ggplot(ell, aes(x = condition, y = variety)) +
  geom_boxplot() +
  geom_point(position = "jitter") +
  stat_summary(
    fun.data = "mean_cl_normal",
    geom = "pointrange"
  )
An alternative:
# geom_jitter() is the shortcut for jittered points.
ggplot(ell, aes(x = condition, y = variety)) +
  geom_boxplot() +
  geom_jitter() +
  stat_summary(
    fun.data = "mean_cl_normal",
    geom = "pointrange"
  )
1-2) Change the width, alpha level, size
ggplot(data = ell, mapping = aes(x = condition, y = variety)) +
  # Hide the boxplot's own outlier markers: every observation is drawn by
  # geom_point() below. outlier.shape = NA is the documented way to do this
  # (a negative outlier.size is an undocumented trick).
  geom_boxplot(outlier.shape = NA) +
  # narrower horizontal jitter, semi-transparent, slightly larger points
  geom_point(position = position_jitter(width = .2), alpha = .5, size = 2) +
  stat_summary(fun.data = "mean_cl_normal", geom = "pointrange")
2-1) Modify stat_summary: Change the color
ggplot(data = ell, mapping = aes(x = condition, y = variety)) +
  # outlier.shape = NA is the documented way to suppress outlier markers
  geom_boxplot(outlier.shape = NA) +
  geom_point(position = position_jitter(width = .2), alpha = .5) +
  # draw the mean and its CI in a contrasting color
  stat_summary(fun.data = "mean_cl_normal", geom = "pointrange",
               color = "firebrick")
2-2) Nudge position
ggplot(data = ell, mapping = aes(x = condition, y = variety)) +
  # outlier.shape = NA is the documented way to suppress outlier markers
  geom_boxplot(outlier.shape = NA) +
  geom_point(position = position_jitter(width = .2), alpha = .5) +
  # shift the summary to the right so it does not sit on top of the box
  stat_summary(fun.data = "mean_cl_normal", geom = "pointrange",
               color = "firebrick", position = position_nudge(x = .4))
Option 2: boxplot with dotplot
geom_dotplot
ggplot(data = ell, mapping = aes(x = condition, y = variety)) +
  # outlier.shape = NA is the documented way to suppress outlier markers;
  # the dotplot layer already shows every observation.
  geom_boxplot(outlier.shape = NA) +
  geom_dotplot(binaxis = "y", stackdir = "center", binwidth = .5, alpha = .5)
binaxis and stackdir always have to be set like this when used with boxplots.
Use binwidth to adjust the size of the dots.
2-1) Add statistical information: Mean
ggplot(data = ell, mapping = aes(x = condition, y = variety)) +
  # outlier.shape = NA is the documented way to suppress outlier markers
  geom_boxplot(width = .5, outlier.shape = NA) +
  geom_dotplot(binwidth = .5, binaxis = "y", stackdir = "center", alpha = .5) +
  # group mean as a diamond; `fun` replaces the deprecated `fun.y`
  # (ggplot2 >= 3.3.0)
  stat_summary(fun = "mean", geom = "point", shape = 18,
               color = "firebrick", size = 3)
2-2) Add statistical information: CIs in errorbars
ggplot(data = ell, mapping = aes(x = condition, y = variety)) +
  # outlier.shape = NA is the documented way to suppress outlier markers
  geom_boxplot(width = .5, outlier.shape = NA) +
  geom_dotplot(binwidth = .5, binaxis = "y", stackdir = "center", alpha = .5) +
  # Mean and CI error bar use the same nudge so they stay aligned.
  # `fun` replaces the deprecated `fun.y` (ggplot2 >= 3.3.0).
  stat_summary(fun = "mean", geom = "point", shape = 18, size = 5,
               color = "firebrick", position = position_nudge(x = .1)) +
  stat_summary(fun.data = "mean_cl_normal", geom = "errorbar", width = .1,
               color = "firebrick", position = position_nudge(x = .1))
ggplot(data = ell, mapping = aes(x = condition, y = variety)) +
  # outlier.shape = NA is the documented way to suppress outlier markers
  geom_boxplot(width = .5, outlier.shape = NA) +
  geom_dotplot(binwidth = .8, binaxis = "y", stackdir = "center", alpha = .5) +
  # `fun` replaces the deprecated `fun.y` (ggplot2 >= 3.3.0)
  stat_summary(fun = "mean", geom = "point", shape = 18, size = 5,
               color = "firebrick", position = position_nudge(x = .1)) +
  stat_summary(fun.data = "mean_cl_normal", geom = "errorbar", width = .1,
               color = "firebrick", position = position_nudge(x = .1)) +
  # change the x-axis text
  scale_x_discrete(labels = c("NP\n(n = 19)", "OLP\n(n = 20)", "PTP\n(n = 21)")) +
  # change the y-axis scale
  scale_y_continuous(limits = c(0, 35), expand = c(0, 0))
scale_x_discrete() controls the x-axis and works when the variable is categorical.
scale_y_continuous() works with a y-axis that is continuous. limits controls the y-axis limits; setting expand to c(0, 0) removes the padding that appears at the top and bottom.
ggplot(data = ell, mapping = aes(x = condition, y = variety)) +
  # outlier.shape = NA is the documented way to suppress outlier markers
  geom_boxplot(width = .5, outlier.shape = NA) +
  geom_dotplot(binwidth = .8, binaxis = "y", stackdir = "center", alpha = .5) +
  # `fun` replaces the deprecated `fun.y` (ggplot2 >= 3.3.0)
  stat_summary(fun = "mean", geom = "point", shape = 18, size = 5,
               color = "firebrick", position = position_nudge(x = .1)) +
  stat_summary(fun.data = "mean_cl_normal", geom = "errorbar", width = .1,
               color = "firebrick", position = position_nudge(x = .1)) +
  scale_x_discrete(labels = c("NP\n(n = 19)", "OLP\n(n = 20)", "PTP\n(n = 21)")) +
  scale_y_continuous(limits = c(0, 35), expand = c(0, 0)) +
  # title, subtitle, axis labels, legend title
  labs(title = "The Effect of Planning Time on the Amount of Syntactic Variety",
       subtitle = "in three different conditions",
       x = "", y = "Syntactic Variety\n")
ggplot(data = ell, mapping = aes(x = condition, y = variety)) +
  # outlier.shape = NA is the documented way to suppress outlier markers
  geom_boxplot(width = .5, outlier.shape = NA) +
  geom_dotplot(binwidth = 1, binaxis = "y", stackdir = "center", alpha = .5) +
  # error bar first, then the mean marker drawn on top of it
  stat_summary(fun.data = "mean_cl_normal", geom = "errorbar", width = .1,
               color = "firebrick", position = position_nudge(x = .2)) +
  # `fun` replaces the deprecated `fun.y` (ggplot2 >= 3.3.0)
  stat_summary(fun = "mean", geom = "point", shape = 18, size = 5,
               color = "firebrick", position = position_nudge(x = .2)) +
  scale_x_discrete(labels = c("NP\n(n = 19)", "OLP\n(n = 20)", "PTP\n(n = 21)")) +
  scale_y_continuous(limits = c(0, 35), expand = c(0, 0)) +
  labs(title = "The Effect of Planning Time on the Amount of Syntactic Variety",
       subtitle = "in three different conditions",
       x = "", y = "Syntactic Variety\n") +
  # change the overall theme (using one of the presets) and text size
  theme_bw(base_size = 14) +
  # change theme elements
  theme(panel.grid.major.x = element_blank(), # remove grid for x-axis
        axis.ticks.x = element_blank(), # remove x-axis ticks
        title = element_text(size = rel(.7))) # adjust the title size
# Write the last plot created to disk; the file extension in `filename`
# determines the image format.
ggsave(filename = "planningtime.png")
## Saving 7 x 5 in image
# or, provide more specifications
ggsave(filename = "planningtime.jpeg", width = 6, height = 4, units = "in", dpi = 800)
ggsave saves the last plot you created in the default size with 300 dpi.
filename: Include the file format extension in your file name. This is the only requirement for this function - everything else has default values. Certain image file formats might not work with Mac OS (e.g., tiff).
(A different save location can be given with the path argument.)
ggplot(ell, aes(x = condition, y = variety, fill = condition)) +
  geom_violin(scale = "count", width = .3, alpha = .7) +
  # outlier.shape = NA is the documented way to suppress outlier markers
  geom_boxplot(width = .1, outlier.shape = NA, fill = "white") +
  geom_dotplot(binwidth = .5, binaxis = "y", stackdir = "center", fill = "black", alpha = .5) +
  # `fun` replaces the deprecated `fun.y` (ggplot2 >= 3.3.0)
  stat_summary(fun = "mean", geom = "point", shape = 23, size = 4, alpha = .7) +
  labs(title = "Effect of Planning Time on the Amount of Syntactic Variety",
       x = "", y = "Syntactic Variety\n") +
  scale_x_discrete(labels = c("NP\n(n = 19)", "OLP\n(n = 20)", "PTP\n(n = 21)")) +
  scale_y_continuous(limits = c(0, 35), expand = c(0, 0)) +
  scale_fill_viridis_d() +
  theme_bw(base_size = 14) +
  # the fill legend would only repeat the x-axis labels, so drop it
  theme(legend.position = "none", panel.grid.major.x = element_blank(),
        axis.ticks.x = element_blank())
# Re-inspect obarow; pretest and posttest are the two columns plotted next.
str(obarow)
## 'data.frame': 67 obs. of 10 variables:
## $ id : int 3 2 6 7 1 10 5 8 11 14 ...
## $ gender : Factor w/ 2 levels "female","male": 2 2 2 2 2 1 2 2 1 1 ...
## $ grade : int 1 1 1 1 1 1 2 2 2 2 ...
## $ treatment: Factor w/ 4 levels "NMNP","NMYP",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ pretest : int 15 11 13 14 13 14 18 16 15 17 ...
## $ posttest : int 14 11 13 15 12 14 16 14 13 16 ...
## $ gain1 : int -1 0 0 1 -1 0 -2 -2 -2 -1 ...
## $ gain2 : int -2 1 0 1 0 5 4 2 1 -1 ...
## $ gain3 : int 0 0 1 3 -1 -1 0 0 1 1 ...
## $ gain4 : int 0 3 1 0 1 -1 0 0 0 1 ...
Let’s visualize the correlation between pretest and posttest
# Data and x/y mapping only; no geometry layer has been added yet.
ggplot(obarow, aes(x = pretest, y = posttest))
Visualize data points:
# Scatterplot of posttest against pretest.
ggplot(data = obarow, mapping = aes(x = pretest, y = posttest)) +
  geom_point()
Add a line that fits the data using stat_smooth function:
ggplot(data = obarow, mapping = aes(x = pretest, y = posttest)) +
  geom_point() +
  # no method supplied, so the default smoother is used (a loess line here)
  stat_smooth()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
Specify which method to use:
ggplot(data = obarow, mapping = aes(x = pretest, y = posttest)) +
  geom_point() +
  # fit the trend with a linear model instead of the default smoother
  stat_smooth(method = "lm")
Jitter the dots:
ggplot(data = obarow, mapping = aes(x = pretest, y = posttest)) +
  # jitter the points so identical integer scores do not overplot
  geom_point(position = "jitter") +
  stat_smooth(method = "lm")
Modify the trend line:
ggplot(data = obarow, mapping = aes(x = pretest, y = posttest)) +
  geom_point(position = "jitter") +
  # dashed red trend line without the confidence band
  stat_smooth(
    method = "lm",
    se = FALSE,
    linetype = "dashed",
    color = "firebrick"
  )
Reading and writing data (fake data based on real data)
# Load the reading/writing scores and inspect the column types.
rw <- read.csv("readwrite2.csv")
str(rw)
## 'data.frame': 1997 obs. of 4 variables:
## $ id : int 3070 1306 83 2486 1938 397 977 1322 2414 156 ...
## $ reading: int 18 20 14 16 14 20 20 16 21 21 ...
## $ writing: int 21 15 14 20 15 18 9 14 15 24 ...
## $ level : int 3 3 2 4 2 4 2 1 3 5 ...
We have three variables: reading score = reading, writing score = writing, group = level
Let’s see if there’s any correlation between reading and writing scores
# Jittered scatterplot of writing against reading.
ggplot(data = rw, mapping = aes(x = reading, y = writing)) +
  geom_point(position = "jitter")
# Same plot with an overall linear trend line added.
ggplot(data = rw, mapping = aes(x = reading, y = writing)) +
  geom_point(position = "jitter") +
  stat_smooth(method = "lm")
Add proficiency level to the aesthetics:
# level is still an integer column here, so it is mapped as a numeric color.
ggplot(
  data = rw,
  mapping = aes(x = reading, y = writing, color = level)
) +
  geom_point(position = "jitter")
Change the level variable to factor and provide labels:
# Turn the integer level codes into a labelled factor; labels are assigned in
# the sorted order of the existing values.
rw$level <- factor(
  rw$level,
  labels = c("Novice", "Low-IM", "High-IM", "Advanced", "Superior")
)
ggplot(
  data = rw,
  mapping = aes(x = reading, y = writing, color = level)
) +
  geom_point(position = "jitter", alpha = .3)
Fitting trend lines:
# color = level is set in the top-level mapping, so the trend-line layer
# inherits it too.
ggplot(
  data = rw,
  mapping = aes(x = reading, y = writing, color = level)
) +
  geom_point(position = "jitter", alpha = .3) +
  stat_smooth(method = "lm", se = FALSE)
This will fit a line for each group because we have specified group (level)
To fit a line for the overall data but still keep the group colors, move the color aesthetic to a lower-level function (the geom layer).
# color is mapped only inside geom_point(), so stat_smooth() does not
# inherit it.
ggplot(data = rw, mapping = aes(x = reading, y = writing)) +
  geom_point(
    mapping = aes(color = level),
    position = "jitter",
    alpha = .3
  ) +
  stat_smooth(method = "lm", se = FALSE)
Notice Simpson’s paradox here. When you fit an overall trend line, it looks like there is a positive correlation between reading and writing scores, but if you look at proficiency groups, there is no correlation or negative correlation. In this case, the trend seems that the scores increase in general as proficiency advances.
Color scale
Using viridis scheme:
ggplot(
  data = rw,
  mapping = aes(x = reading, y = writing, color = level)
) +
  geom_point(position = "jitter", alpha = .3, size = 1) +
  stat_smooth(method = "lm", se = FALSE) +
  # discrete viridis palette for the level colors
  scale_color_viridis_d()
Using color brewer palettes:
ggplot(
  data = rw,
  mapping = aes(x = reading, y = writing, color = level)
) +
  geom_point(position = "jitter", alpha = .3, size = 1) +
  stat_smooth(method = "lm", se = FALSE) +
  # ColorBrewer palette "Set1" for the level colors
  scale_color_brewer(palette = "Set1")
Axis scale
Change the x- and y-axes:
ggplot(
  data = rw,
  mapping = aes(x = reading, y = writing, color = level)
) +
  geom_point(position = "jitter", alpha = .3, size = 2) +
  stat_smooth(method = "lm", se = FALSE) +
  scale_color_viridis_d() +
  # both axes run from 0 to 30 with no extra padding
  scale_x_continuous(limits = c(0, 30), expand = c(0, 0)) +
  scale_y_continuous(limits = c(0, 30), expand = c(0, 0))
Use coordinate that applies 1:1 scale to be accurate with the visuals:
ggplot(
  data = rw,
  mapping = aes(x = reading, y = writing, color = level)
) +
  geom_point(position = "jitter", alpha = .3, size = 2) +
  stat_smooth(method = "lm", se = FALSE) +
  scale_color_viridis_d() +
  scale_x_continuous(limits = c(0, 30), expand = c(0, 0)) +
  scale_y_continuous(limits = c(0, 30), expand = c(0, 0)) +
  # one unit on x is drawn the same length as one unit on y
  coord_equal()
This code is also possible:
ggplot(
  data = rw,
  mapping = aes(x = reading, y = writing, color = level)
) +
  geom_point(position = "jitter", alpha = .3, size = 2) +
  stat_smooth(method = "lm", se = FALSE) +
  scale_color_viridis_d() +
  # specify the x- and y-axis limits
  coord_equal(
    xlim = c(0, 30),
    ylim = c(0, 30),
    expand = FALSE
  )
Final, polished version:
ggplot(
  data = rw,
  mapping = aes(x = reading, y = writing, color = level)
) +
  geom_point(position = "jitter", alpha = .3, size = 1) +
  # first smoother inherits color = level; the second sets a fixed black color
  stat_smooth(method = "lm") +
  stat_smooth(method = "lm", color = "black", se = FALSE) +
  scale_color_viridis_d() +
  coord_equal(xlim = c(0, 30), ylim = c(0, 30), expand = FALSE) +
  labs(
    x = "\nReading score",
    y = "Writing score\n",
    title = "Correlation between reading and writing scores",
    color = "Proficiency level"
  ) +
  theme_bw()
# save the plot above
ggsave(filename = "correlation.png", width = 6, height = 5, dpi = 600)
Facet: Create subplots
# One panel per proficiency level, laid out in a single row.
ggplot(rw, aes(x = reading, y = writing)) +
  geom_point(position = position_jitter(), alpha = .1) +
  stat_smooth(method = "lm") +
  coord_equal() +
  facet_wrap(. ~ level, nrow = 1) +
  # title typo fixed ("writings score" -> "writing scores")
  labs(title = "Relationship between reading and writing scores by proficiency level",
       x = "Reading score", y = "Writing score\n") +
  theme_bw()
Creating parallel coordinate plot for pre- and post-tests
# Re-inspect obarow: pretest and posttest are stored as separate columns.
str(obarow)
## 'data.frame': 67 obs. of 10 variables:
## $ id : int 3 2 6 7 1 10 5 8 11 14 ...
## $ gender : Factor w/ 2 levels "female","male": 2 2 2 2 2 1 2 2 1 1 ...
## $ grade : int 1 1 1 1 1 1 2 2 2 2 ...
## $ treatment: Factor w/ 4 levels "NMNP","NMYP",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ pretest : int 15 11 13 14 13 14 18 16 15 17 ...
## $ posttest : int 14 11 13 15 12 14 16 14 13 16 ...
## $ gain1 : int -1 0 0 1 -1 0 -2 -2 -2 -1 ...
## $ gain2 : int -2 1 0 1 0 5 4 2 1 -1 ...
## $ gain3 : int 0 0 1 3 -1 -1 0 0 1 1 ...
## $ gain4 : int 0 3 1 0 1 -1 0 0 0 1 ...
pretest and … posttest?
treatment
The two different tests are repeated measures of one variable: test score
Reshape the data so that:
score
treatment & test
Transform the data
# Reshape to long form: the pretest and posttest columns are stacked into a
# `score` column, with a new `test` column recording which test each row is.
# NOTE(review): tidyr::gather() is superseded by tidyr::pivot_longer(); it is
# kept here so the result matches the str(ob2) output shown with this step.
ob2 <- tidyr::gather(data = obarow, key = test, value = score, pretest, posttest)
key: the new variable with categories (i.e., the two types of tests)
value: the values associated with the new categorical variable (i.e., test score)
str(ob2)
## 'data.frame': 134 obs. of 10 variables:
## $ id : int 3 2 6 7 1 10 5 8 11 14 ...
## $ gender : Factor w/ 2 levels "female","male": 2 2 2 2 2 1 2 2 1 1 ...
## $ grade : int 1 1 1 1 1 1 2 2 2 2 ...
## $ treatment: Factor w/ 4 levels "NMNP","NMYP",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ gain1 : int -1 0 0 1 -1 0 -2 -2 -2 -1 ...
## $ gain2 : int -2 1 0 1 0 5 4 2 1 -1 ...
## $ gain3 : int 0 0 1 3 -1 -1 0 0 1 1 ...
## $ gain4 : int 0 3 1 0 1 -1 0 0 0 1 ...
## $ test : chr "pretest" "pretest" "pretest" "pretest" ...
## $ score : int 15 11 13 14 13 14 18 16 15 17 ...
What are we mapping?
# Mapping only: test on x, score on y; no geometry layer yet.
ggplot(ob2, aes(x = test, y = score))
Is there anything that appears to be wrong with the coordinates? Yes: because the levels of test are arranged alphabetically, posttest appears before pretest.
Specify the order of the factor levels:
# Make test a factor whose levels are in chronological order.
ob2$test <- factor(
  ob2$test,
  levels = c("pretest", "posttest")
)
ggplot(ob2, aes(x = test, y = score))
Now it looks right. Same for any discrete levels of a given variable (treatment groups, etc.)
# No group aesthetic is given here.
ggplot(data = ob2, mapping = aes(x = test, y = score)) +
  geom_line()
What went wrong?
geom_line() is just connecting data points within each test category.
Group the scores for each individual by assigning the id variable to group. Either of these works:
# Variant 1: group set in the plot-level mapping.
ggplot(data = ob2, mapping = aes(x = test, y = score, group = id)) +
  geom_line()
# Variant 2: group set only for the line layer.
ggplot(data = ob2, mapping = aes(x = test, y = score)) +
  geom_line(mapping = aes(group = id))
This means you need a variable called ID to create this type of graph. If you don’t have one, create it in the original dataset before transforming the data into long form.
Just as we provided the id variable to group argument, we need to group the data by the treatment groups to plot the group means.
ggplot(ob2, aes(x = test, y = score)) +
  # one line per participant
  geom_line(aes(group = id)) +
  # one thick line per treatment group connecting the group means;
  # `fun` replaces the deprecated `fun.y` (ggplot2 >= 3.3.0)
  stat_summary(aes(group = treatment), fun = "mean", geom = "line", size = 2)
ggplot(ob2, aes(x = test, y = score)) +
  geom_line(aes(group = id)) +
  # `fun` replaces the deprecated `fun.y` (ggplot2 >= 3.3.0)
  stat_summary(aes(group = treatment), fun = "mean", geom = "line", size = 2) +
  # add each group's mean and CI at both test occasions
  stat_summary(aes(group = treatment), fun.data = "mean_cl_normal",
               geom = "pointrange", size = 1)
by treatment group:
# color is mapped at plot level, so individual lines and mean lines are both
# colored by treatment.
ggplot(ob2, aes(x = test, y = score, color = treatment)) +
  geom_line(aes(group = id)) +
  # `fun` replaces the deprecated `fun.y` (ggplot2 >= 3.3.0)
  stat_summary(aes(group = treatment), fun = "mean", geom = "line", size = 2)
Not necessary because it is hard to decode each line.
just for the mean:
ggplot(ob2, aes(x = test, y = score)) +
  geom_line(aes(group = id)) +
  # color is mapped only in this layer, so only the mean lines are colored;
  # `fun` replaces the deprecated `fun.y` (ggplot2 >= 3.3.0)
  stat_summary(aes(group = treatment, color = treatment),
               fun = "mean", geom = "line", size = 2)
ggplot(ob2, aes(x = test, y = score)) +
  # individual trajectories in light grey so the group means stand out
  geom_line(aes(group = id), color = "grey70") +
  # Both summaries use the same dodge so lines and point ranges line up.
  # `fun` replaces the deprecated `fun.y` (ggplot2 >= 3.3.0).
  stat_summary(aes(group = treatment, color = treatment), fun = "mean",
               geom = "line", size = 2, position = position_dodge(.1)) +
  stat_summary(aes(group = treatment), fun.data = "mean_cl_normal",
               geom = "pointrange", size = 1, position = position_dodge(.1)) +
  scale_color_brewer(palette = "Dark2")
ggplot(ob2, aes(x = test, y = score)) +
  geom_line(aes(group = id), color = "grey70") +
  # `fun` replaces the deprecated `fun.y` (ggplot2 >= 3.3.0)
  stat_summary(aes(group = treatment, color = treatment), fun = "mean",
               geom = "line", size = 2, position = position_dodge(.1)) +
  stat_summary(aes(group = treatment), fun.data = "mean_cl_normal",
               geom = "pointrange", size = 1, position = position_dodge(.1)) +
  scale_color_brewer(palette = "Dark2") +
  # relabel the two test occasions and set the expansion around them
  scale_x_discrete(expand = c(.2, .2), labels = c("Pre-test", "Post-test")) +
  scale_y_continuous(limits = c(0, 30)) +
  labs(x = "", y = "Vocabulary Score\n", title = "Score Changes after Treatment") +
  theme_bw() +
  theme(legend.position = "none", panel.grid.major.x = element_blank(),
        axis.ticks.x = element_blank())
Alternative: Faceted graphics
ggplot(ob2, aes(x = test, y = score)) +
  geom_line(aes(group = id), color = "grey70") +
  # `fun` replaces the deprecated `fun.y` (ggplot2 >= 3.3.0)
  stat_summary(aes(group = treatment), fun = "mean", geom = "line", size = 2) +
  stat_summary(aes(group = treatment), fun.data = "mean_cl_normal",
               geom = "pointrange", size = 1) +
  scale_x_discrete(expand = c(.2, .2), labels = c("Pre-test", "Post-test")) +
  scale_y_continuous(limits = c(0, 30)) +
  labs(x = "", y = "Vocabulary Score\n", title = "Score Changes after Treatment") +
  # one panel per treatment group, side by side
  facet_grid(. ~ treatment) +
  theme_bw() +
  theme(panel.grid.major.x = element_blank())
Use different color lines for individuals who had positive gain scores after treatment.
ggplot(ob2, aes(x = test, y = score)) +
  # participants with gain1 > 0 in green, everyone else in grey
  geom_line(data = dplyr::filter(ob2, gain1 > 0), aes(group = id), color = "forestgreen") +
  geom_line(data = dplyr::filter(ob2, gain1 <= 0), aes(group = id), color = "grey70") +
  # `fun` replaces the deprecated `fun.y` (ggplot2 >= 3.3.0)
  stat_summary(aes(group = treatment), fun = "mean", geom = "line", size = 2) +
  stat_summary(aes(group = treatment), fun.data = "mean_cl_normal",
               geom = "pointrange", size = 1) +
  scale_y_continuous(limits = c(0, 30)) +
  labs(x = "", y = "Vocabulary Score\n", title = "Score Changes after Treatment") +
  facet_grid(. ~ treatment) +
  theme_bw() +
  theme(panel.grid.major.x = element_blank())