Each of the following graphs uses different themes from the ggthemes package so you can get a sense for what the different themes look like. See also: https://mran.microsoft.com/snapshot/2017-02-04/web/packages/ggthemes/vignettes/ggthemes.html
For Query 1, we will be using the diamonds dataset, which comes with ggplot2.
Represent a frequency distribution when there are few categories. First, generate a frequency table to see how many categories there are in the ‘cut’ variable of the diamonds dataset.
library(tidyverse)
library(ggthemes)
# Using the diamonds dataset that comes with ggplot2.
# Tabulate how many rows fall into each level of `cut`.
table(diamonds$cut) # few categories, so do vertical bar chart
##
## Fair Good Very Good Premium Ideal
## 1610 4906 12082 13791 21551
# Vertical frequency bar chart: one bar per level of `cut`.
# geom_bar() counts the rows in each level for us.
ggplot(data = diamonds, mapping = aes(x = cut)) +
  geom_bar() +
  labs(x = "Cut", y = "Count") +
  theme_hc() +
  # Axis text/title tweaks go after the complete theme so they are kept.
  theme(
    axis.title.x = element_text(size = 14, face = "bold", colour = "#000000"),
    axis.title.y = element_text(
      size = 16, face = "bold", angle = 0, vjust = 0.5
    ),
    axis.text.x = element_text(size = 16),
    axis.text.y = element_text(size = 14, angle = 0)
  )
Represent a frequency distribution when there are many categories.
Note: 3 ways to re-order bar charts by count: https://www.roelpeters.be/reorder-ggplot2-bar-chart-by-count/
# Horizontal frequency bar chart of `clarity`, with bars ordered by count.
diamonds %>%
  # One row per clarity level, with the number of rows stored in `count`.
  count(clarity, name = "count") %>%
  ggplot(mapping = aes(x = reorder(clarity, -count), y = count)) +
  # stat = 'identity' plots the precomputed values instead of counting rows.
  geom_bar(stat = 'identity') +
  # Swap the axes so the vertical bars become horizontal bars.
  coord_flip() +
  labs(x = "Clarity", y = "Count") +
  theme_minimal() +
  theme(
    axis.title.x = element_text(size = 14, face = "bold", colour = "#000000"),
    axis.title.y = element_text(
      size = 16, face = "bold", angle = 0, vjust = 0.5
    ),
    axis.text.x = element_text(size = 16),
    axis.text.y = element_text(size = 14, angle = 0)
  )
Represent a frequency distribution for few categories.
# Vertical frequency Cleveland dot plot: one dot per level of `cut`,
# positioned at that level's row count (stat = "count" does the counting).
ggplot(data = diamonds, mapping = aes(x = cut)) +
  geom_point(stat = "count", size = 5, colour = "dark blue") +
  labs(x = "Cut", y = "Count") +
  theme_minimal() +
  theme(
    axis.title.x = element_text(size = 14, face = "bold", colour = "#000000"),
    axis.title.y = element_text(
      size = 16, face = "bold", angle = 0, vjust = 0.5
    ),
    axis.text.x = element_text(size = 16),
    axis.text.y = element_text(size = 14, angle = 0)
  )
Represent a frequency distribution for many categories. Note: 3 ways to re-order bar charts by count: https://www.roelpeters.be/reorder-ggplot2-bar-chart-by-count/
# Horizontal Cleveland dot plot of `clarity`, with dots ordered by count.
diamonds %>%
  # One row per clarity level, with the number of rows stored in `count`.
  count(clarity, name = "count") %>%
  ggplot(mapping = aes(x = reorder(clarity, -count), y = count)) +
  # stat = "identity" places each dot at its precomputed count.
  geom_point(stat = "identity", size = 5, colour = "dark blue") +
  # Swap the axes so the categories run down the side of the plot.
  coord_flip() +
  labs(x = "Clarity", y = "Count") +
  theme_bw() +
  theme(
    axis.title.x = element_text(size = 14, face = "bold", colour = "#000000"),
    axis.title.y = element_text(
      size = 16, face = "bold", angle = 0, vjust = 0.5
    ),
    axis.text.x = element_text(size = 16),
    axis.text.y = element_text(size = 14, angle = 0)
  )
head(diamonds) # identify depth as a continuous variable we could use here
## # A tibble: 6 × 10
## carat cut color clarity depth table price x y z
## <dbl> <ord> <ord> <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1 0.23 Ideal E SI2 61.5 55 326 3.95 3.98 2.43
## 2 0.21 Premium E SI1 59.8 61 326 3.89 3.84 2.31
## 3 0.23 Good E VS1 56.9 65 327 4.05 4.07 2.31
## 4 0.29 Premium I VS2 62.4 58 334 4.2 4.23 2.63
## 5 0.31 Good J SI2 63.3 58 335 4.34 4.35 2.75
## 6 0.24 Very Good J VVS2 62.8 57 336 3.94 3.96 2.48
# Print per-column summaries of the diamonds data.
summary(diamonds)
## carat cut color clarity depth
## Min. :0.2000 Fair : 1610 D: 6775 SI1 :13065 Min. :43.00
## 1st Qu.:0.4000 Good : 4906 E: 9797 VS2 :12258 1st Qu.:61.00
## Median :0.7000 Very Good:12082 F: 9542 SI2 : 9194 Median :61.80
## Mean :0.7979 Premium :13791 G:11292 VS1 : 8171 Mean :61.75
## 3rd Qu.:1.0400 Ideal :21551 H: 8304 VVS2 : 5066 3rd Qu.:62.50
## Max. :5.0100 I: 5422 VVS1 : 3655 Max. :79.00
## J: 2808 (Other): 2531
## table price x y
## Min. :43.00 Min. : 326 Min. : 0.000 Min. : 0.000
## 1st Qu.:56.00 1st Qu.: 950 1st Qu.: 4.710 1st Qu.: 4.720
## Median :57.00 Median : 2401 Median : 5.700 Median : 5.710
## Mean :57.46 Mean : 3933 Mean : 5.731 Mean : 5.735
## 3rd Qu.:59.00 3rd Qu.: 5324 3rd Qu.: 6.540 3rd Qu.: 6.540
## Max. :95.00 Max. :18823 Max. :10.740 Max. :58.900
##
## z
## Min. : 0.000
## 1st Qu.: 2.910
## Median : 3.530
## Mean : 3.539
## 3rd Qu.: 4.040
## Max. :31.800
##
# Histogram of diamond prices, split into 70 bins.
ggplot(data = diamonds, mapping = aes(x = price)) +
  geom_histogram(bins = 70) +
  labs(x = "Price", y = "Frequency") +
  theme_minimal() +
  theme(
    axis.title.x = element_text(size = 14, face = "bold", colour = "#000000"),
    axis.title.y = element_text(
      size = 16, face = "bold", angle = 0, vjust = 0.5
    ),
    axis.text.x = element_text(size = 16),
    axis.text.y = element_text(size = 14, angle = 0)
  )
# Frequency polygon of diamond prices: binned counts (100 bins) joined by a
# line instead of drawn as bars.
ggplot(data = diamonds, mapping = aes(x = price)) +
  geom_freqpoly(bins = 100) +
  labs(x = "Price", y = "Count") +
  theme_minimal() +
  theme(
    axis.title.x = element_text(size = 14, face = "bold", colour = "#000000"),
    axis.title.y = element_text(
      size = 16, face = "bold", angle = 0, vjust = 0.5
    ),
    axis.text.x = element_text(size = 16),
    axis.text.y = element_text(size = 14, angle = 0)
  )
library(ggdist)
head(diamonds) # identify depth as a continuous variable we could use here
## # A tibble: 6 × 10
## carat cut color clarity depth table price x y z
## <dbl> <ord> <ord> <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1 0.23 Ideal E SI2 61.5 55 326 3.95 3.98 2.43
## 2 0.21 Premium E SI1 59.8 61 326 3.89 3.84 2.31
## 3 0.23 Good E VS1 56.9 65 327 4.05 4.07 2.31
## 4 0.29 Premium I VS2 62.4 58 334 4.2 4.23 2.63
## 5 0.31 Good J SI2 63.3 58 335 4.34 4.35 2.75
## 6 0.24 Very Good J VVS2 62.8 57 336 3.94 3.96 2.48
# Dot plot of diamond prices using ggdist::stat_dots(); `quantiles = 100`
# asks for the distribution to be summarised as 100 quantile dots.
ggplot(data = diamonds, mapping = aes(x = price)) +
  stat_dots(quantiles = 100, position = "dodge") +
  labs(x = "Price", y = "Count") +
  theme_minimal() +
  theme(
    axis.title.x = element_text(size = 14, face = "bold", colour = "#000000"),
    axis.title.y = element_text(
      size = 16, face = "bold", angle = 0, vjust = 0.5
    ),
    axis.text.x = element_text(size = 16),
    axis.text.y = element_text(size = 14, angle = 0)
  )
# Kernel density estimate of diamond prices.
ggplot(data = diamonds, mapping = aes(x = price)) +
  geom_density() +
  labs(x = "Price", y = "Density") +
  theme_minimal() +
  theme(
    axis.title.x = element_text(size = 14, face = "bold", colour = "#000000"),
    axis.title.y = element_text(
      size = 16, face = "bold", angle = 0, vjust = 0.5
    ),
    axis.text.x = element_text(size = 16),
    axis.text.y = element_text(size = 14, angle = 0)
  )
The data for these graphs were taken from the starwars dataset that comes with dplyr. The original dataset was modified (see code below) to produce the variables graphed here.
# Preview the first rows of the starwars data.
head(starwars)
## # A tibble: 6 × 14
## name height mass hair_…¹ skin_…² eye_c…³ birth…⁴ sex gender homew…⁵
## <chr> <int> <dbl> <chr> <chr> <chr> <dbl> <chr> <chr> <chr>
## 1 Luke Skywal… 172 77 blond fair blue 19 male mascu… Tatooi…
## 2 C-3PO 167 75 <NA> gold yellow 112 none mascu… Tatooi…
## 3 R2-D2 96 32 <NA> white,… red 33 none mascu… Naboo
## 4 Darth Vader 202 136 none white yellow 41.9 male mascu… Tatooi…
## 5 Leia Organa 150 49 brown light brown 19 fema… femin… Aldera…
## 6 Owen Lars 178 120 brown,… light blue 52 male mascu… Tatooi…
## # … with 4 more variables: species <chr>, films <list>, vehicles <list>,
## # starships <list>, and abbreviated variable names ¹hair_color, ²skin_color,
## # ³eye_color, ⁴birth_year, ⁵homeworld
# Frequency of each gender value (table() leaves out NA by default).
table(starwars$gender)
##
## feminine masculine
## 17 66
# Frequency of each species value (table() leaves out NA by default).
table(starwars$species)
##
## Aleena Besalisk Cerean Chagrian Clawdite
## 1 1 1 1 1
## Droid Dug Ewok Geonosian Gungan
## 6 1 1 1 3
## Human Hutt Iktotchi Kaleesh Kaminoan
## 35 1 1 1 2
## Kel Dor Mirialan Mon Calamari Muun Nautolan
## 1 2 1 1 1
## Neimodian Pau'an Quermian Rodian Skakoan
## 1 1 1 1 1
## Sullustan Tholothian Togruta Toong Toydarian
## 1 1 1 1 1
## Trandoshan Twi'lek Vulptereen Wookiee Xexto
## 1 2 1 2 1
## Yoda's species Zabrak
## 1 2
# Build the plotting data: keep only characters with a known species and
# gender, then collapse species into three groups ("Human", "Droid", "Other").
starwarsModified <- starwars %>%
  filter(!is.na(species), !is.na(gender)) %>%
  mutate(
    # Human and Droid keep their own label; every other species is "Other".
    species2 = ifelse(species %in% c("Human", "Droid"), species, "Other")
  )
# Stacked bar chart: character counts per species group, with each bar split
# by gender (geom_bar() stacks the fill groups by default).
ggplot(data = starwarsModified, mapping = aes(x = species2, fill = gender)) +
  geom_bar() +
  labs(x = "Species", y = "Count") +
  theme_minimal(base_size = 16) +
  theme(
    axis.title.x = element_text(size = 14, face = "bold", colour = "#000000"),
    axis.title.y = element_text(
      size = 16, face = "bold", angle = 0, vjust = 0.5
    ),
    axis.text.x = element_text(size = 16),
    axis.text.y = element_text(size = 14, angle = 0)
  )
# Proportional stacked bar chart: position = "fill" rescales every bar to the
# same height so the gender split is shown as a proportion.
ggplot(data = starwarsModified, mapping = aes(x = species2, fill = gender)) +
  geom_bar(position = "fill") +
  labs(x = "Species", y = "Proportion") +
  theme_minimal(base_size = 16) +
  theme(
    axis.title.x = element_text(size = 14, face = "bold", colour = "#000000"),
    axis.title.y = element_text(
      size = 16, face = "bold", angle = 0, vjust = 0.5
    ),
    axis.text.x = element_text(size = 16),
    axis.text.y = element_text(size = 14, angle = 0)
  )
# Grouped bar chart: position = "dodge" places the gender bars side by side
# within each species group.
ggplot(data = starwarsModified, mapping = aes(x = species2, fill = gender)) +
  geom_bar(position = "dodge") +
  labs(x = "Species", y = "Count") +
  theme_minimal(base_size = 16) +
  theme(
    axis.title.x = element_text(size = 14, face = "bold", colour = "#000000"),
    axis.title.y = element_text(
      size = 16, face = "bold", angle = 0, vjust = 0.5
    ),
    axis.text.x = element_text(size = 16),
    axis.text.y = element_text(size = 14, angle = 0)
  )
# Frequency of each `vore` value in msleep (table() leaves out NA by default).
table(msleep$vore)
##
## carni herbi insecti omni
## 19 32 5 20
# Keep only the msleep rows where both total sleep and `vore` are known.
msleepModified <- msleep %>%
  filter(!is.na(sleep_total), !is.na(vore))

# The two `vore` groups compared in the plots that subset on `use`.
use <- c("carni", "herbi")
# Histogram of total sleep for the `vore` groups listed in `use`, filled by
# group and drawn semi-transparent.
# NOTE(review): with `fill` mapped and no `position` given, geom_histogram()
# stacks the groups; if overlaid histograms were intended, confirm and add
# position = "identity".
ggplot(
  data = subset(msleepModified, subset = vore %in% use),
  mapping = aes(x = sleep_total, fill = vore)
) +
  geom_histogram(bins = 15, alpha = 0.4) +
  labs(x = "Average Amount of Sleep", y = "Count") +
  theme_minimal(base_size = 16) +
  theme(
    axis.title.x = element_text(size = 14, face = "bold", colour = "#000000"),
    axis.title.y = element_text(
      size = 16, face = "bold", angle = 0, vjust = 0.5
    ),
    axis.text.x = element_text(size = 16),
    axis.text.y = element_text(size = 14, angle = 0)
  )
# Frequency polygons of total sleep for the `vore` groups listed in `use`,
# one coloured line per group.
ggplot(
  data = subset(msleepModified, subset = vore %in% use),
  mapping = aes(x = sleep_total, color = vore)
) +
  # `linewidth` sets line thickness; using `size` for lines has been
  # deprecated since ggplot2 3.4.0 and raised a warning.
  geom_freqpoly(alpha = 0.4, bins = 15, linewidth = 2) +
  labs(x = "Average Amount of Sleep", y = "Count") +
  theme_minimal(base_size = 16) +
  theme(
    axis.title.x = element_text(face = "bold", colour = "#000000", size = 14),
    axis.text.x = element_text(size = 16),
    axis.text.y = element_text(angle = 0, size = 14),
    axis.title.y = element_text(
      angle = 0, size = 16, face = "bold", vjust = 0.5
    )
  )
# Overlaid density curves of total sleep for the `vore` groups listed in
# `use`; semi-transparent fills keep both curves visible.
ggplot(
  data = subset(msleepModified, subset = vore %in% use),
  mapping = aes(x = sleep_total, fill = vore)
) +
  geom_density(alpha = 0.3) +
  labs(
    x = "Average Amount of Sleep",
    # geom_density() draws a density estimate, not a count, so the axis is
    # labelled the same way as the price density plot.
    y = "Density"
  ) +
  theme_minimal(base_size = 16) +
  theme(
    axis.title.x = element_text(face = "bold", colour = "#000000", size = 14),
    axis.text.x = element_text(size = 16),
    axis.text.y = element_text(angle = 0, size = 14),
    axis.title.y = element_text(
      angle = 0, size = 16, face = "bold", vjust = 0.5
    )
  )
# Scaled density curves of total sleep for the `vore` groups listed in `use`.
# after_stat(scaled) maps y to the computed `scaled` variable; it replaces the
# `..scaled..` dot-dot notation, deprecated since ggplot2 3.4.0.
ggplot(
  data = subset(msleepModified, subset = vore %in% use),
  mapping = aes(x = sleep_total, fill = vore)
) +
  geom_density(alpha = 0.3, aes(y = after_stat(scaled))) +
  labs(
    x = "Average Amount of Sleep",
    # The y axis shows the scaled density, not a count.
    y = "Scaled\nDensity"
  ) +
  theme_minimal(base_size = 16) +
  theme(
    axis.title.x = element_text(face = "bold", colour = "#000000", size = 14),
    axis.text.x = element_text(size = 16),
    axis.text.y = element_text(angle = 0, size = 14),
    axis.title.y = element_text(
      angle = 0, size = 16, face = "bold", vjust = 0.5
    )
  )

# The same scaled density plot, this time for every `vore` group in
# msleepModified.
ggplot(msleepModified, aes(x = sleep_total, fill = vore)) +
  geom_density(alpha = 0.3, aes(y = after_stat(scaled))) +
  labs(
    x = "Average Amount of Sleep",
    y = "Scaled\nDensity"
  ) +
  theme_minimal(base_size = 16) +
  theme(
    axis.title.x = element_text(face = "bold", colour = "#000000", size = 14),
    axis.text.x = element_text(size = 16),
    axis.text.y = element_text(angle = 0, size = 14),
    axis.title.y = element_text(
      angle = 0, size = 16, face = "bold", vjust = 0.5
    )
  )
library(ggridges)
##
## Attaching package: 'ggridges'
## The following objects are masked from 'package:ggdist':
##
## scale_point_color_continuous, scale_point_color_discrete,
## scale_point_colour_continuous, scale_point_colour_discrete,
## scale_point_fill_continuous, scale_point_fill_discrete,
## scale_point_size_continuous
# Ridgeline plot: one density ridge of total sleep per `vore` group.
ggplot(
  data = msleepModified,
  mapping = aes(x = sleep_total, y = vore, fill = vore)
) +
  geom_density_ridges() +
  # NOTE(review): theme_minimal() below is a complete theme, so it replaces
  # the settings from theme_ridges(); confirm which look is intended.
  theme_ridges() +
  labs(x = "Average Amount of Sleep", y = "Feeder\nType") +
  theme_minimal(base_size = 16) +
  theme(
    axis.title.x = element_text(size = 14, face = "bold", colour = "#000000"),
    axis.title.y = element_text(
      size = 16, face = "bold", angle = 0, vjust = 0.5
    ),
    axis.text.x = element_text(size = 16),
    axis.text.y = element_text(size = 14, angle = 0),
    legend.position = "none" # suppresses the legend
  )
## Picking joint bandwidth of 2.27
# Violin plot of total sleep for each `vore` group.
ggplot(
  data = msleepModified,
  mapping = aes(x = vore, y = sleep_total, fill = vore)
) +
  geom_violin() +
  labs(x = "Feeder Type", y = "Average\nAmount of\nSleep") +
  theme_minimal(base_size = 16) +
  theme(
    axis.title.x = element_text(size = 14, face = "bold", colour = "#000000"),
    axis.title.y = element_text(
      size = 16, face = "bold", angle = 0, vjust = 0.5
    ),
    axis.text.x = element_text(size = 16),
    axis.text.y = element_text(size = 14, angle = 0),
    legend.position = "none" # suppresses the legend
  )
Also called Wilkinson dot plots or strip plots. They show every point.
# Dot plot of total sleep per `vore` group: dots are binned along the y axis
# (bin width 0.5) and stacked outwards from the centre of each group.
ggplot(
  data = msleepModified,
  mapping = aes(x = vore, y = sleep_total, fill = vore)
) +
  geom_dotplot(binaxis = "y", stackdir = "center", binwidth = 0.5) +
  labs(x = "Feeder Type", y = "Average\nAmount of\nSleep") +
  theme_minimal(base_size = 16) +
  theme(
    axis.title.x = element_text(size = 14, face = "bold", colour = "#000000"),
    axis.title.y = element_text(
      size = 16, face = "bold", angle = 0, vjust = 0.5
    ),
    axis.text.x = element_text(size = 16),
    axis.text.y = element_text(size = 14, angle = 0),
    legend.position = "none" # suppresses the legend
  )
Histodot plots or strip plots can be very useful for visualizing the distribution of a large number of data points, and are particularly useful for comparing distributions across a number of categories. The example here uses Gapminder data from 1952 to 2007 that was already cleaned and presented at http://bit.ly/2cLzoxH.
Note in the code below, the use of geom_jitter. The jitter allows dots that would otherwise be plotted in the exact same location to be jittered slightly so that they are visible.
# Strip plot of life expectancy by continent.
# NOTE(review): `df` is not defined in the visible code; presumably it is the
# cleaned Gapminder data loaded elsewhere. Confirm before running.
strip <- ggplot(data = df, mapping = aes(x = continent, y = lifeExp)) +
  # Jitter horizontally only (height = 0), so overlapping points spread out
  # sideways while each point keeps its y value.
  geom_jitter(
    position = position_jitter(width = 0.25, height = 0),
    colour = "lightseagreen",
    size = 3,
    alpha = 0.17
  ) +
  theme_minimal(base_size = 14) +
  ylim(0, 90) +
  # NOTE(review): no colour aesthetic is mapped here, so this scale looks like
  # it has nothing to act on; confirm whether it is still needed.
  scale_color_few("Medium") +
  labs(x = "\nGeographic region", y = "Life\nexpectancy\n(years)") +
  theme(
    axis.title.y = element_text(angle = 0, vjust = 0.5),
    legend.position = "none"
  )

# Print the stored plot.
strip