# Default knitr chunk options for the whole document:
# show each chunk's code in the output (echo) and center the figures
knitr::opts_chunk$set(echo = TRUE,
                      fig.align = "center")

# Load the tidyverse and gt packages
pacman::p_load(tidyverse, gt)

The supers2.csv file has 12 variables on 1899 superheroes from Marvel and DC Comics. We will focus on 3 variables: Alignment (Good, Neutral, or Bad), eye (eye color), and hair (hair color).

Question 0: Reading the data in

Read in the “supers2.csv” data set and save it as a global object named supers. After you read it in, change the order of the alignment groups to be Good, Neutral, Bad. To confirm it is done correctly, use levels(supers$Alignment)

# Import the superhero data from the csv file
supers <- read.csv(file = "supers2.csv")

# Store Alignment as a factor whose groups run Good -> Neutral -> Bad
supers[["Alignment"]] <- factor(
  supers[["Alignment"]],
  levels = c("Good", "Neutral", "Bad")
)

# Confirm the new group order
levels(supers$Alignment)
## [1] "Good"    "Neutral" "Bad"

Question 1: Displays for Single Variables

For each of the 3 variables in the data:

i) Create a table for the groups of the variable that includes both the counts and percentages (rounded to 1 decimal place)

ii) A bar chart - Choose a suitable theme for the bar charts (don’t just go with the default option, try to make the graphs look nice!)

For all bar charts, add scale_y_continuous(expand = c(0, 0, 0.05, 0)) to the graph to remove the extra space at the bottom of the graph

Part 1A: Alignment

1A i)

# Table of counts, proportions, and percentages for each alignment group
align_tab <- 
  # One row per alignment group (Good, Neutral, Bad) with its count in n
  count(supers, Alignment) |> 
  mutate(
    # Share of all supers that fall in each group
    proportion = n / sum(n),
    # Same share as a percentage rounded to 1 decimal place, e.g. "38.3%"
    percentage = paste0(round(100 * proportion, digits = 1), "%")
  )

# Display the table with gt()
gt(align_tab)
Alignment n proportion percentage
Good 727 0.3828331 38.3%
Neutral 579 0.3048973 30.5%
Bad 593 0.3122696 31.2%

1A ii)

Create the bar chart using the supers data frame (not the table created in part 1A i). Display the counts on the y-axis, add a title that says “Comics Superhero Alignments”, and remove the label on the x-axis.

# Bar chart of the number of supers in each alignment group (from the raw data)
supers |> 
  ggplot(mapping = aes(x = Alignment)) + 
  
  # geom_bar() counts the rows in each group; every bar gets the same fill/outline
  geom_bar(fill = "forestgreen", color = "black") + 
  
  # The scale specified at the beginning of the question (removes the gap below the bars)
  scale_y_continuous(expand = c(0, 0, 0.05, 0)) + 
  
  # Add the title and drop the x-axis label
  labs(title = "Comics Superhero Alignments", x = NULL) + 
  
  # Swap the default theme for theme_classic()
  theme_classic()

Part 1B: Eye Color

1B i) Table

# Table of counts, proportions, and percentages for each eye color
eye_tab <- 
  # One row per eye color with its count in n
  count(supers, eye) |> 
  mutate(
    # Share of all supers with each eye color
    eye_prop = n / sum(n),
    # Percentage rounded to 1 decimal place with a "%" suffix
    eye_perc = paste0(round(100 * eye_prop, digits = 1), "%")
  )

# Show the table with gt() without saving the gt object
gt(eye_tab)
eye n eye_prop eye_perc
Blue 377 0.19852554 19.9%
Brown 348 0.18325434 18.3%
Green 124 0.06529753 6.5%
None 686 0.36124276 36.1%
Other 227 0.11953660 12%
Red 137 0.07214323 7.2%

1B ii) Bar Chart

Using the data frame created in 1B i), create a bar chart that displays the percentages on the y-axis. Make sure to choose appropriate colors for each of the bars! Make sure to add a title and remove the labels on the x-axis

# The percentages live in the summarized table (eye_tab), so y has to be mapped
# explicitly and geom_col() is used instead of geom_bar()
ggplot(
  data = eye_tab,
  mapping = aes(
    x = eye,
    y = eye_prop,
    fill = eye
  )
) + 
  
  # Bar heights are the eye_prop values; black outline keeps the white bar visible
  geom_col(color = "black") + 
  
  # Tie each fill color to its eye color group by name instead of by row
  # position in eye_tab; the legend is dropped since the x-axis names each bar
  scale_fill_manual(
    values = c(
      Blue  = "blue",
      Brown = "tan4",
      Green = "forestgreen",
      None  = "white",
      Other = "black",
      Red   = "red"
    ),
    guide = "none"
  ) + 
  
  # Adding the title, removing the x-axis label, and giving the y-axis a
  # readable label instead of the column name
  labs(
    x = NULL,
    y = "Percentage",
    title = "Comic Superhero Eye Colors"
  ) + 
  
  # Changing the theme
  theme_classic() +
  
  # Removing the extra space below the bars and showing the proportions as percentages
  scale_y_continuous(
    expand = c(0, 0, 0.05, 0),
    labels = scales::label_percent()
  )

Part 1C: Hair Color

1C i) Table for Hair Color

# Table of counts, proportions, and percentages for each hair color
hair_tab <- 
  # One row per hair color with its count in n
  count(supers, hair) |> 
  mutate(
    # Share of all supers with each hair color
    hair_prop = n / sum(n),
    # Percentage rounded to 1 decimal place with a "%" suffix
    hair_perc = paste0(round(100 * hair_prop, digits = 1), "%")
  )

# Show the table with gt()
gt(hair_tab)
hair n hair_prop hair_perc
Black 354 0.1864139 18.6%
Blond 209 0.1100579 11%
Brown 194 0.1021590 10.2%
None 890 0.4686677 46.9%
Other 252 0.1327014 13.3%

1C ii) Hair Bar Chart

Create a bar chart that displays the counts on the x-axis and the hair colors on the y-axis. Make sure to choose appropriate colors for each of the bars! Add a title and remove the labels on the y-axis

# The counts come straight from the raw data, so geom_bar() does the counting.
# Mapping hair to y puts the hair colors on the y-axis and the counts on the x-axis
ggplot(
  data = supers,
  mapping = aes(
    y = hair,
    fill = hair
  )
) + 
  
  # Creating a bar chart using geom_bar() since the bar lengths are row counts
  geom_bar(color = "black") + 
  
  # Tie each fill color to its hair color group by name instead of relying on
  # the order of the counted groups; the legend is dropped since the y-axis
  # already names each bar
  scale_fill_manual(
    values = c(
      Black = "black",
      Blond = "yellow2",
      Brown = "tan4",
      None  = "white",
      Other = "violet"
    ),
    guide = "none"
  ) + 
  
  # Adding the title and removing the label on the y-axis
  labs(
    y = NULL,
    title = "Comic Superheroes Hair Colors"
  ) + 
  
  # Changing the theme
  theme_classic() +
  
  # The counts are on the x-axis here, so the extra space is removed from x
  scale_x_continuous(
    expand = c(0, 0, 0.05, 0)
  )

Question 2: Alignment Percentages by Eye and Hair Color

Using the other 2 variables (hair and eye color) individually, calculate the percentages of supers that are Good, Neutral, and Bad.

i) Present the percentages in a table, rounding to 1 decimal place. You can either use a contingency (two-way) table or convert the table to a data.frame()

ii) Present the proportions or percentages in the specified bar chart - Have the colors for Good, Neutral, and Bad be “steelblue”, “grey70”, and “tomato”, respectively.

Keep adding scale_y_continuous(expand = c(0, 0, 0.05, 0)) to the bar charts to remove the extra space!

Part 2A: Eye Color

2A i)

# Alignment breakdown within each eye color
align_eye_df <- 
  # Count of supers for every eye color / alignment combination
  count(supers, eye, Alignment) |> 
  mutate(
    # Proportion of each alignment within its eye color group
    prop = n / sum(n),
    # Same value as a percentage rounded to 1 decimal place
    percent = round(100 * prop, digits = 1),
    # Do the calculations separately for each eye color
    .by = eye
  )

# Show the table with gt()
gt(align_eye_df)
eye Alignment n prop percent
Blue Good 228 0.6047745 60.5
Blue Neutral 43 0.1140584 11.4
Blue Bad 106 0.2811671 28.1
Brown Good 176 0.5057471 50.6
Brown Neutral 66 0.1896552 19.0
Brown Bad 106 0.3045977 30.5
Green Good 59 0.4758065 47.6
Green Neutral 19 0.1532258 15.3
Green Bad 46 0.3709677 37.1
None Good 160 0.2332362 23.3
None Neutral 368 0.5364431 53.6
None Bad 158 0.2303207 23.0
Other Good 68 0.2995595 30.0
Other Neutral 60 0.2643172 26.4
Other Bad 99 0.4361233 43.6
Red Good 36 0.2627737 26.3
Red Neutral 23 0.1678832 16.8
Red Bad 78 0.5693431 56.9

Students can choose which of the two formats (contingency table or data frame) to use; they don’t need to do both. The same is true for every part i) in Question 2.

2A ii) Stacked Bar Chart

Create a stacked bar chart displaying the alignment percentage for each eye color group. Which eye color group is most likely to be a hero (alignment = Good)? What about a villain (alignment = Bad)?

# Stacked bar chart of the alignment breakdown within each eye color
ggplot(
  data = supers,
  mapping = aes(
    x = eye,
    fill = Alignment
  )
) + 
  
  # Creating a bar chart using geom_bar() since it is from the original data;
  # position = "fill" rescales every bar to the same height so the segments
  # show proportions instead of counts
  geom_bar(
    position = "fill",
    color = "black"
  ) + 
  
  # Adding the title, removing the x-axis label, and labeling the y-axis
  labs(
    title = "Alignment by Eye Color",
    x = NULL,
    y = "Percentage"
  ) + 
  
  # Changing the theme
  theme_classic() + 
  
  # Setting the colors by name so each alignment keeps its color even if the
  # order of the Alignment levels changes
  scale_fill_manual(
    values = c(
      Good    = "steelblue",
      Neutral = "grey70",
      Bad     = "tomato"
    )
  ) +
  
  # Changing the y-axis to be percentages and removing the extra space
  scale_y_continuous(
    labels = scales::label_percent(),
    expand = c(0, 0, 0.05, 0)
  )

Supers with blue eyes are the most likely to be a hero. Supers with red eyes are the most likely to be villains

ONLY APPLY THE BONUS POINT ONCE, NOT ON ALL 3 GRAPHS

Part 2B: Hair Color

Repeat part 2A, but with hair color instead of eye color.

2B i)

# Alignment breakdown within each hair color
align_hair_df <- 
  # Count of supers for every hair color / alignment combination
  count(supers, hair, Alignment) |> 
  mutate(
    # Proportion of each alignment within its hair color group
    prop = n / sum(n),
    # Same value as a percentage rounded to 1 decimal place
    percent = round(100 * prop, digits = 1),
    # Do the calculations separately for each hair color
    .by = hair
  )

# Show the table with gt()
gt(align_hair_df)
hair Alignment n prop percent
Black Good 184 0.5197740 52.0
Black Neutral 61 0.1723164 17.2
Black Bad 109 0.3079096 30.8
Blond Good 130 0.6220096 62.2
Blond Neutral 30 0.1435407 14.4
Blond Bad 49 0.2344498 23.4
Brown Good 107 0.5515464 55.2
Brown Neutral 35 0.1804124 18.0
Brown Bad 52 0.2680412 26.8
None Good 197 0.2213483 22.1
None Neutral 411 0.4617978 46.2
None Bad 282 0.3168539 31.7
Other Good 109 0.4325397 43.3
Other Neutral 42 0.1666667 16.7
Other Bad 101 0.4007937 40.1

2B ii)

Create a side-by-side bar chart. Supers with what hair color are the most likely to be heroes? What hair color is the most likely to be a villain?

# Side-by-side bar chart of the alignment percentages within each hair color,
# built from the summarized table (align_hair_df)
ggplot(
  data = align_hair_df,
  mapping = aes(
    x = hair,
    fill = Alignment,
    y = percent
  )
) + 
  
  # Creating a bar chart using geom_col() since the bar heights (percent) are
  # already calculated; position = "dodge" places the bars side by side
  geom_col(
    position = "dodge",
    color = "black"
  ) + 
  
  # Adding the title, removing the x-axis label, and labeling the y-axis
  labs(
    title = "Alignment by Hair Color",
    x = NULL,
    y = "Percentage"
  ) + 
  
  # Changing the theme
  theme_classic() + 
  
  # Setting the colors by name so each alignment keeps its color even if the
  # order of the Alignment levels changes
  scale_fill_manual(
    values = c(
      Good    = "steelblue",
      Neutral = "grey70",
      Bad     = "tomato"
    )
  ) +
  
  # Changing the y-axis to be percentages and removing the extra space.
  # percent is already on a 0-100 scale, so scale = 1 only appends the % sign
  scale_y_continuous(
    labels = scales::label_percent(scale = 1),
    expand = c(0, 0, 0.05, 0)
  )

Supers with blond hair are the most likely to be heroes. Supers whose hair color is not black, blond, or brown (just saying "Other" is fine) are the most likely to be villains.

Question 3)

Which of the two styles of graph do you prefer: segmented/stacked or side-by-side? Briefly explain why!

Personally, I find the stacked bar chart to be better since it is easier to compare the groups on the x-axis across the levels of the y-axis, but as long as they justify their answer, you can give them full credit!