# Load the Excel workbook. `readxl::` is spelled out because no
# `library(readxl)` call is visible in this script.
data <- readxl::read_xlsx("../00_data/my data q&a.xlsx")

# Fix the RNG seed so the 10-row sample below is reproducible.
set.seed(1234)

# Draw 10 random rows and keep only the age, height and weight columns.
# The dplyr verbs are namespace-qualified (and `%>%` avoided) because dplyr
# is not attached until later in the script.
depart_small <- dplyr::select(
  dplyr::sample_n(data, 10),
  age, height, weight
)
# Hand-entered sample of ten records; rows 1 and 8 have neither height nor
# weight recorded.
df <- data.frame(
  age    = c(25, 25, 25, 33, 15, 24, 32, 25, 30, 29),
  height = c(NA, 196, 190, 173, 149, 180, 180, NA, 171, 188),
  weight = c(NA, 86, 90, 73, 45, 65, 78, NA, 76, 120)
)

# Categorical copy of age (one level per distinct age) for grouping/plotting.
df[["age_factor"]] <- factor(df[["age"]])
library(dplyr)
library(ggplot2)
# Per-age summary: mean weight (missing weights ignored) and number of rows.
age_summary <- summarise(
  group_by(df, age_factor),
  avg_weight = mean(weight, na.rm = TRUE),
  n = n()
)
# Bar chart of mean weight per age; bars follow the factor's level order
# as created by factor().
ggplot(data = age_summary, mapping = aes(x = age_factor, y = avg_weight)) +
  geom_col() +
  labs(
    title = "Average Weight by Age (Unordered)",
    x = "Age",
    y = "Average Weight"
  )
# Reorder the age levels by ascending average weight so the next plot sorts
# its bars by value. `forcats::` is explicit because forcats is only attached
# further down the script, after this call.
age_summary <- age_summary %>%
  mutate(age_factor = forcats::fct_reorder(age_factor, avg_weight))
# Same bar chart, now drawn from the summary with reordered factor levels.
ggplot(data = age_summary, mapping = aes(x = age_factor, y = avg_weight)) +
  geom_col() +
  labs(
    title = "Average Weight by Age (Ordered by Avg Weight)",
    x = "Age",
    y = "Average Weight"
  )
# Rebuild the age factor from the numeric column.
df[["age_factor"]] <- factor(df[["age"]])

# Mean weight per age level, skipping missing weights.
age_summary <- summarise(
  group_by(df, age_factor),
  avg_weight = mean(weight, na.rm = TRUE)
)
# Dot plot of average weight per age, with the ages on the y axis sorted by
# average weight. `forcats::` is explicit because forcats is only attached
# further down the script, after this call.
ggplot(
  age_summary,
  aes(x = avg_weight, y = forcats::fct_reorder(age_factor, avg_weight))
) +
  geom_point(size = 3) +
  labs(
    title = "Average Weight by Age (Ordered by Average Weight)",
    x = "Average Weight",
    y = "Age"
  ) +
  theme_minimal()
# Show examples of three forcats functions: recoding, collapsing, and lumping factor levels.
library(forcats)
# Relabel each age level by hand (new name = old level); old levels that are
# given the same new name end up in one shared level.
df[["age_group"]] <- fct_recode(
  df[["age_factor"]],
  "Teen" = "15",
  "Young Adult" = "24",
  "Young Adult" = "25",
  "Young Adult" = "29",
  "Young Adult" = "30",
  "Young Adult" = "32",
  "Senior" = "33"
)

# Show the original age next to its factor and the recoded group.
df[c("age", "age_factor", "age_group")]
## age age_factor age_group
## 1 25 25 Young Adult
## 2 25 25 Young Adult
## 3 25 25 Young Adult
## 4 33 33 Senior
## 5 15 15 Teen
## 6 24 24 Young Adult
## 7 32 32 Young Adult
## 8 25 25 Young Adult
## 9 30 30 Young Adult
## 10 29 29 Young Adult
# Rebuild the age factor, then collapse its levels into three groups; each
# new level lists the old levels it absorbs.
df[["age_factor"]] <- factor(df[["age"]])
df[["age_group"]] <- fct_collapse(
  df[["age_factor"]],
  Teen = "15",
  YoungAdult = c("24", "25", "29", "30", "32"),
  Senior = "33"
)
# Convert age to factor
df$age_factor <- factor(df$age)

# Ask forcats to keep the 3 most frequent ages and lump the rest into "Other".
# `fct_lump_n()` is the explicit variant recommended over the generic
# `fct_lump()`; with `n` supplied the two give the same result.
# NOTE: ties are ranked with ties.method = "min" by default. In this data 25
# occurs four times and every other age once, so all six single-occurrence
# ages tie for rank 2, which is within n = 3. Every level is therefore kept
# and no "Other" level is created.
df$age_lumped <- fct_lump_n(df$age_factor, n = 3)

# View result
df[, c("age", "age_factor", "age_lumped")]
## age age_factor age_lumped
## 1 25 25 25
## 2 25 25 25
## 3 25 25 25
## 4 33 33 33
## 5 15 15 15
## 6 24 24 24
## 7 32 32 32
## 8 25 25 25
## 9 30 30 30
## 10 29 29 29
# No need to do anything here.