Import your data

# Load the workbook, fix the RNG seed so the draw is repeatable, then take a
# 10-row random sample and keep only age, height and weight.
data <- read_xlsx("../00_data/my data q&a.xlsx")
set.seed(1234)
depart_small <- data %>%
  sample_n(size = 10) %>%
  select(age, height, weight)

Chapter 15

Create a factor

# Ten hand-entered observations; rows 1 and 8 have no height or weight.
df <- data.frame(
  age    = c(25, 25, 25, 33, 15, 24, 32, 25, 30, 29),
  height = c(NA, 196, 190, 173, 149, 180, 180, NA, 171, 188),
  weight = c(NA, 86, 90, 73, 45, 65, 78, NA, 76, 120)
)

# Treat every distinct age as its own category (levels sort ascending).
df[["age_factor"]] <- factor(df[["age"]])

# Attach every package this section relies on before first use. forcats
# supplies fct_reorder(), which is called further down, ahead of the later
# library(forcats) line in the fct_recode example.
library(dplyr)
library(forcats)
library(ggplot2)

# One row per age level: mean weight (missing weights ignored) and row count.
age_summary <- df %>%
  group_by(age_factor) %>%
  summarise(
    avg_weight = mean(weight, na.rm = TRUE),
    n = n()
  )

# Bar chart with the age levels in their default (sorted) order
age_summary %>%
  ggplot(aes(x = age_factor, y = avg_weight)) +
  geom_col() +
  labs(
    title = "Average Weight by Age (Unordered)",
    x = "Age",
    y = "Average Weight"
  )

# Re-level age_factor so its levels run from lowest to highest avg_weight
age_summary$age_factor <- fct_reorder(
  age_summary$age_factor,
  age_summary$avg_weight
)

# Same chart again; the bars now follow the new level order
age_summary %>%
  ggplot(aes(x = age_factor, y = avg_weight)) +
  geom_col() +
  labs(
    title = "Average Weight by Age (Ordered by Avg Weight)",
    x = "Age",
    y = "Average Weight"
  )

# Rebuild the age factor from the raw ages
df[["age_factor"]] <- factor(df[["age"]])

# Mean weight for each age level, skipping missing weights
age_summary <- df %>%
  group_by(age_factor) %>%
  summarise(avg_weight = mean(weight, na.rm = TRUE))

# Dot plot with the ages on the y axis, sorted by their average weight
ggplot(age_summary) +
  geom_point(
    aes(x = avg_weight, y = fct_reorder(age_factor, avg_weight)),
    size = 3
  ) +
  labs(
    title = "Average Weight by Age (Ordered by Average Weight)",
    x = "Average Weight",
    y = "Age"
  ) +
  theme_minimal()

Modify factor levels

Show examples of three functions:

  • fct_recode
library(forcats)

# Rename levels one at a time (new name = old level); old levels that are
# given the same new name end up merged into a single level.
df[["age_group"]] <- fct_recode(
  df[["age_factor"]],
  Teen = "15",
  `Young Adult` = "24",
  `Young Adult` = "25",
  `Young Adult` = "29",
  `Young Adult` = "30",
  `Young Adult` = "32",
  Senior = "33"
)

# Show the raw ages next to their new group labels
df[c("age", "age_factor", "age_group")]
##    age age_factor   age_group
## 1   25         25 Young Adult
## 2   25         25 Young Adult
## 3   25         25 Young Adult
## 4   33         33      Senior
## 5   15         15        Teen
## 6   24         24 Young Adult
## 7   32         32 Young Adult
## 8   25         25 Young Adult
## 9   30         30 Young Adult
## 10  29         29 Young Adult
  • fct_collapse
# Start again from a factor of the raw ages
df[["age_factor"]] <- factor(df[["age"]])

# Merge several old levels into each new level in a single call
df[["age_group"]] <- fct_collapse(
  df[["age_factor"]],
  Teen = "15",
  YoungAdult = c("24", "25", "29", "30", "32"),
  Senior = "33"
)
  • fct_lump
# Convert age to factor
df$age_factor <- factor(df$age)

# Lump to keep the top 3 most frequent ages; the rest become "Other".
# Age 25 occurs four times and every other age exactly once, so six levels
# tie for second place. With the default ties.method = "min" all of them
# share rank 2, every level counts as "top 3", and nothing is lumped.
# ties.method = "first" breaks the tie by level order, so exactly three
# levels are kept (25, then 15 and 24).
df$age_lumped <- fct_lump(df$age_factor, n = 3, ties.method = "first")

# View result
df[, c("age", "age_factor", "age_lumped")]
##    age age_factor age_lumped
## 1   25         25         25
## 2   25         25         25
## 3   25         25         25
## 4   33         33      Other
## 5   15         15         15
## 6   24         24         24
## 7   32         32      Other
## 8   25         25         25
## 9   30         30      Other
## 10  29         29      Other

Chapter 16

No need to do anything here.