Load packages

library(tidyverse)
library(gt)
library(knitr)
library(kableExtra)
library(gridExtra)

Dataset

Consider hair color and test scores for 100 students.

# Simulate a small roster: one row per student, holding an id, a hair color
# and two test scores.
#
# The number of students is defined once so that every generated column is
# guaranteed to have the same length.
n_students <- 100
set.seed(50) # fix the RNG state so the simulated values are reproducible

# NOTE: the three random draws below must stay in this order; reordering them
# changes the values produced from the seed.
id <- seq_len(n_students) # whole numbers 1..n_students
# decimals from a uniform distribution on [60, 100], rounded to one place
t1_score <- round(runif(n_students, min = 60, max = 100), digits = 1)
# integers sampled with replacement from 70..100
t2_score <- sample(70:100, size = n_students, replace = TRUE)
# character labels sampled with replacement
hair_color <- sample(
  c("Purple", "Green", "Blue"),
  size = n_students,
  replace = TRUE
)
# combine the variables into a data frame (column names = variable names)
df <- data.frame(id, hair_color, t1_score, t2_score)

Only the first 10 rows are shown below.

# Render the first ten rows of `df` as a striped table that is left-aligned
# and only as wide as its content.
kable_styling(
  kable(head(df, n = 10)),
  bootstrap_options = "striped",
  position = "left",
  full_width = FALSE
)
id hair_color t1_score t2_score
1 Green 88.3 71
2 Blue 77.5 99
3 Blue 68.0 92
4 Purple 90.7 84
5 Green 80.5 85
6 Blue 61.8 80
7 Blue 88.0 95
8 Blue 85.9 95
9 Green 61.7 72
10 Green 64.3 99

Dimensions and variables

# Compact overview of `df`: row/column counts plus each column's type and
# first few values.
dplyr::glimpse(df)
## Rows: 100
## Columns: 4
## $ id         <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, …
## $ hair_color <chr> "Green", "Blue", "Blue", "Purple", "Green", "Blue", "Blue",…
## $ t1_score   <dbl> 88.3, 77.5, 68.0, 90.7, 80.5, 61.8, 88.0, 85.9, 61.7, 64.3,…
## $ t2_score   <int> 71, 99, 92, 84, 85, 80, 95, 95, 72, 99, 76, 77, 99, 75, 70,…
# Number of students per hair color.
count(df, hair_color)
##   hair_color  n
## 1       Blue 31
## 2      Green 42
## 3     Purple 27

Descriptive statistics

# Per-column summary of `df` (five-number summary and mean for the numeric
# columns; length/class/mode for the character column).
df %>%
  summary()
##        id          hair_color           t1_score        t2_score     
##  Min.   :  1.00   Length:100         Min.   :60.10   Min.   : 70.00  
##  1st Qu.: 25.75   Class :character   1st Qu.:70.60   1st Qu.: 75.75  
##  Median : 50.50   Mode  :character   Median :80.40   Median : 86.50  
##  Mean   : 50.50                      Mean   :78.99   Mean   : 85.18  
##  3rd Qu.: 75.25                      3rd Qu.:87.55   3rd Qu.: 94.00  
##  Max.   :100.00                      Max.   :99.80   Max.   :100.00

Data types in R, Tableau and PowerBI

# Lookup table: for each column of `df`, the name of its data type in R,
# Tableau and Power BI. Entries are listed in the column order of `df`.
df1 <- data.frame(
  Variable = names(df),
  R = c(
    "integer (int)",
    "character (chr)",
    "double (dbl)",
    "integer (int)"
  ),
  Tableau = c(
    "numeric/dimension",
    "string/dimension",
    "numeric/measure",
    "numeric/measure"
  ),
  PowerBI = c(
    "whole number",
    "text",
    "decimal number",
    "whole number"
  )
)

# Show the lookup table as a striped, left-aligned, content-width table.
kable_styling(
  kable(df1),
  bootstrap_options = "striped",
  full_width = FALSE,
  position = "left"
)
Variable R Tableau PowerBI
id integer (int) numeric/dimension whole number
hair_color character (chr) string/dimension text
t1_score double (dbl) numeric/measure decimal number
t2_score integer (int) numeric/measure whole number

Compute average score for each hair color

# Average score per hair color: first average each student's two tests,
# then take the mean of those per-student averages within each hair color,
# rounded to one decimal place.
df2 <- df %>%
  mutate(student_avg = (t1_score + t2_score) / 2) %>%
  group_by(hair_color) %>%
  summarise(avg_score = round(mean(student_avg), digits = 1))
head(df2)
## # A tibble: 3 × 2
##   hair_color avg_score
##   <chr>          <dbl>
## 1 Blue            82.2
## 2 Green           82.3
## 3 Purple          81.6

Truthiness: Playing with data


QUESTION: Which hair color is best for academic performance?


# Y-axis window used by the zoomed figures: from 0.1 below the smallest group
# average to 0.1 above the largest one.
my_lims <- range(df2$avg_score) + c(-0.1, 0.1)

# Figure 1: bar chart of the average score per hair color, bars ordered from
# the highest to the lowest average. The y-axis is zoomed to `my_lims` and
# its tick marks and tick labels are hidden, so no scale is visible.
p1 <- ggplot(
  data = df2,
  mapping = aes(x = reorder(hair_color, -avg_score), y = avg_score)
) +
  geom_col(width = 0.6, fill = "gray50") +
  coord_cartesian(ylim = my_lims) +
  labs(
    title = "Figure 1",
    x = "Hair color",
    y = "Average test score"
  ) +
  theme(
    axis.ticks = element_blank(),
    axis.text.y = element_blank()
  )
p1

# Figure 2: the same bar chart of average score per hair color, still zoomed
# to `my_lims`, but with the default axis ticks and labels left visible.
p2 <- ggplot(
  data = df2,
  mapping = aes(x = reorder(hair_color, -avg_score), y = avg_score)
) +
  geom_col(width = 0.6, fill = "gray50") +
  coord_cartesian(ylim = my_lims) +
  labs(
    title = "Figure 2",
    x = "Hair color",
    y = "Average test score"
  )
p2

# Figure 3: bar chart of average score per hair color on the default y-axis
# (no zoom), with each bar's value printed just above it.
p3 <- ggplot(
  data = df2,
  mapping = aes(x = reorder(hair_color, -avg_score), y = avg_score)
) +
  geom_col(width = 0.6, fill = "gray50") +
  geom_text(
    mapping = aes(label = round(avg_score, 1)),
    vjust = -0.4, # negative vjust places the label above the bar top
    color = "darkred",
    size = 3.5
  ) +
  labs(
    title = "Figure 3",
    x = "Hair color",
    y = "Average test score"
  )
p3

All together now! Does hair color matter?

# Blank out the per-plot axis titles and caption so the three figures can
# share a single y-axis label when placed side by side.
blank_labels <- labs(x = "", y = "", caption = "")
p1_null <- p1 + blank_labels
p2_null <- p2 + blank_labels
p3_null <- p3 + blank_labels

# Lay the three figures out in one row with a shared label on the left.
grid.arrange(
  p1_null, p2_null, p3_null,
  left = "Average test score",
  ncol = 3
)


– The end –