Load packages
library(tidyverse)
library(gt)
library(knitr)
library(kableExtra)
library(gridExtra)
Consider hair color and test scores for 100 students.
# Fix the RNG seed so the simulated data set is reproducible.
set.seed(50)

# Student identifiers: the whole numbers 1 through 100.
id <- seq_len(100)

# Test 1: uniform draws between 60 and 100, rounded to one decimal place.
t1_score <- round(runif(n = 100, min = 60, max = 100), digits = 1)

# Test 2: whole-number scores sampled with replacement from 70 to 100.
t2_score <- sample(x = 70:100, size = 100, replace = TRUE)

# Hair color: one of three labels sampled with replacement.
hair_color <- sample(
  x = c("Purple", "Green", "Blue"),
  size = 100,
  replace = TRUE
)

# Assemble the four variables into one data frame, one row per student.
df <- data.frame(
  id = id,
  hair_color = hair_color,
  t1_score = t1_score,
  t2_score = t2_score
)
Only the first 10 rows are shown below.
# Preview the first ten rows of df as a striped, left-aligned table
# that is not stretched to the full page width.
kable_styling(
  kable(head(df, n = 10)),
  bootstrap_options = "striped",
  position = "left",
  full_width = FALSE
)
id | hair_color | t1_score | t2_score |
---|---|---|---|
1 | Green | 88.3 | 71 |
2 | Blue | 77.5 | 99 |
3 | Blue | 68.0 | 92 |
4 | Purple | 90.7 | 84 |
5 | Green | 80.5 | 85 |
6 | Blue | 61.8 | 80 |
7 | Blue | 88.0 | 95 |
8 | Blue | 85.9 | 95 |
9 | Green | 61.7 | 72 |
10 | Green | 64.3 | 99 |
# Compact overview of df: row and column counts, then each column's type
# and first few values.
dplyr::glimpse(df)
## Rows: 100
## Columns: 4
## $ id <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, …
## $ hair_color <chr> "Green", "Blue", "Blue", "Purple", "Green", "Blue", "Blue",…
## $ t1_score <dbl> 88.3, 77.5, 68.0, 90.7, 80.5, 61.8, 88.0, 85.9, 61.7, 64.3,…
## $ t2_score <int> 71, 99, 92, 84, 85, 80, 95, 95, 72, 99, 76, 77, 99, 75, 70,…
# Number of students per hair color.
count(df, hair_color)
## hair_color n
## 1 Blue 31
## 2 Green 42
## 3 Purple 27
# Per-column summary statistics for df.
summary(object = df)
## id hair_color t1_score t2_score
## Min. : 1.00 Length:100 Min. :60.10 Min. : 70.00
## 1st Qu.: 25.75 Class :character 1st Qu.:70.60 1st Qu.: 75.75
## Median : 50.50 Mode :character Median :80.40 Median : 86.50
## Mean : 50.50 Mean :78.99 Mean : 85.18
## 3rd Qu.: 75.25 3rd Qu.:87.55 3rd Qu.: 94.00
## Max. :100.00 Max. :99.80 Max. :100.00
# Lookup table listing, for every column of df, the data type label used
# by R, Tableau and Power BI.
df1 <- data.frame(
  Variable = names(df),
  R = c(
    "integer (int)",
    "character (chr)",
    "double (dbl)",
    "integer (int)"
  ),
  Tableau = c(
    "numeric/dimension",
    "string/dimension",
    "numeric/measure",
    "numeric/measure"
  ),
  PowerBI = c(
    "whole number",
    "text",
    "decimal number",
    "whole number"
  )
)

# Render the lookup table striped, left-aligned and not full width.
kable_styling(
  kable(df1),
  bootstrap_options = "striped",
  full_width = FALSE,
  position = "left"
)
Variable | R | Tableau | PowerBI |
---|---|---|---|
id | integer (int) | numeric/dimension | whole number |
hair_color | character (chr) | string/dimension | text |
t1_score | double (dbl) | numeric/measure | decimal number |
t2_score | integer (int) | numeric/measure | whole number |
# For each hair color, average every student's mean of the two test scores
# and round the group average to one decimal place.
df2 <- summarise(
  group_by(df, hair_color),
  avg_score = round(mean((t1_score + t2_score) / 2), digits = 1)
)

# Show the summary (head() prints at most six rows).
df2 %>%
  head()
## # A tibble: 3 × 2
## hair_color avg_score
## <chr> <dbl>
## 1 Blue 82.2
## 2 Green 82.3
## 3 Purple 81.6
# Y-axis window: from 0.1 below the smallest group average to 0.1 above
# the largest one.
my_lims <- range(df2$avg_score) + c(-0.1, 0.1)

# Figure 1: bars ordered from highest to lowest average, drawn inside the
# narrow y-axis window, with the y-axis labels and all axis ticks removed.
p1 <- ggplot(
  data = df2,
  mapping = aes(x = reorder(hair_color, -avg_score), y = avg_score)
) +
  geom_col(width = 0.6, fill = "gray50") +
  coord_cartesian(ylim = my_lims) +
  labs(
    title = "Figure 1",
    x = "Hair color",
    y = "Average test score"
  ) +
  theme(
    axis.ticks = element_blank(),
    axis.text.y = element_blank()
  )

p1
# Figure 2: same bars and the same narrow y-axis window (my_lims) as
# Figure 1, but the default axis text and ticks are left in place.
p2 <- ggplot(
  data = df2,
  mapping = aes(x = reorder(hair_color, -avg_score), y = avg_score)
) +
  geom_col(width = 0.6, fill = "gray50") +
  coord_cartesian(ylim = my_lims) +
  labs(
    title = "Figure 2",
    x = "Hair color",
    y = "Average test score"
  )

p2
# Figure 3: same bars with no custom y-axis limits, and each bar labelled
# with its average score just above the bar top.
p3 <- ggplot(
  data = df2,
  mapping = aes(x = reorder(hair_color, -avg_score), y = avg_score)
) +
  geom_col(width = 0.6, fill = "gray50") +
  geom_text(
    mapping = aes(label = round(avg_score, 1)),
    vjust = -0.4,
    color = "darkred",
    size = 3.5
  ) +
  labs(
    title = "Figure 3",
    x = "Hair color",
    y = "Average test score"
  )

p3
# Blank out each figure's own axis titles and caption so that one shared
# label can be used for the combined layout.
p1_null <- p1 + labs(x = "", y = "", caption = "")
p2_null <- p2 + labs(x = "", y = "", caption = "")
p3_null <- p3 + labs(x = "", y = "", caption = "")

# Place the three figures side by side in one row with a common label on
# the left.
grid.arrange(
  grobs = list(p1_null, p2_null, p3_null),
  ncol = 3,
  left = "Average test score"
)
– The end –