This extended study guide helps you prepare for the Data Wrangling & Visualization midterm using new practice datasets (distinct from the actual exam). It focuses on: - Cleaning messy data into tidy formats (names, types, text standardization, outliers) - Summaries and grouping (by category and time) - Visualization patterns with ggplot2 - Writing clear insights grounded in evidence from your plots - Responsible use of AI tools for formatting/clarity and debugging, not for full solutions
Assessment weighting: 50% Wrangling, 50% Visualizations & Insights.
library(tidyverse) # dplyr, tidyr, ggplot2
library(janitor) # clean_names()
library(lubridate) # dates
Core pipes:
# pattern: read -> clean -> wrangle -> summarize/visualize
## Set your workdirectory to source file location using the session tab on the top menu bar
## save your r file and data files on the same directory
## copy the file path to your desired data file and paste it inside the double quotes for the read_csv command below
## use the commented code chunck below as sample code
# read_csv("path.csv") %>%
# clean_names() %>%
# mutate(...) %>%
# filter(...) %>%
# group_by(...) %>%
# summarise(..., .groups = "drop")
Below we create practice-only datasets you can safely use to study.
set.seed(123)
n <- 240
students_scores <- tibble(
student_id = sprintf("S%03d", 1:n),
gender = sample(c("Female","Male"), n, replace = TRUE, prob = c(0.55, 0.45)),
math_score = pmax(pmin(round(rnorm(n, 75, 12)), 100), 30),
reading_score= pmax(pmin(round(rnorm(n, 78, 11)), 100), 30),
writing_score= pmax(pmin(round(rnorm(n, 76, 13)), 100), 30)
) %>%
mutate(across(ends_with("_score"), ~ ifelse(runif(n()) < 0.04, NA, .))) # sprinkle a few NAs
# Optional: save for practice (comment out if not needed)
# readr::write_csv(students_scores, "students_scores.csv")
students_scores %>% head()
Goals to practice with this dataset: - Clean names,
compute avg_score - Compare distributions by gender -
Identify high performers and potential outliers
set.seed(42)
months <- seq(ymd("2023-01-01"), ymd("2023-12-01"), by = "1 month")
bike_sales <- expand_grid(
month = months,
region = c("North","South","East","West"),
bike_type= c("Road","Mountain","Hybrid","Kids")
) %>%
mutate(
units_sold = rpois(n(), lambda = 50) + sample(0:20, n(), replace = TRUE),
revenue_usd = units_sold * (case_when(
bike_type == "Road" ~ 850,
bike_type == "Mountain" ~ 950,
bike_type == "Hybrid" ~ 650,
TRUE ~ 300
)) * runif(n(), 0.85, 1.15)
)
# Optional: save for practice (comment out if not needed)
# readr::write_csv(bike_sales, "bike_sales.csv")
bike_sales %>% head()
Goals to practice: - Build monthly summaries and category mixes - Create line and stacked area charts - Write short, decision-ready insights
scores <- students_scores %>%
clean_names() %>%
mutate(
avg_score = rowMeans(select(., ends_with("_score")), na.rm = TRUE)
)
scores %>% select(student_id, gender, avg_score) %>% head()
Notes - clean_names(): lower_snake_case
- rowMeans() with na.rm = TRUE handles missing
scores gracefully
high_achievers <- scores %>%
filter(avg_score >= 90) %>%
select(student_id, gender, avg_score) %>%
arrange(desc(avg_score))
high_achievers %>% head()
by_gender <- scores %>%
group_by(gender) %>%
summarise(
mean_avg = mean(avg_score, na.rm = TRUE),
sd_avg = sd(avg_score, na.rm = TRUE),
n = n(),
.groups = "drop"
)
by_gender
Try: Add quartiles with
quantile(avg_score, probs = c(.25,.5,.75), na.rm = TRUE).
demo <- tibble(dept = c(" sales ", "Sales", "SALES", " marketing", "Marketing "))
demo %>% mutate(dept_clean = str_to_title(str_trim(dept)))
bike_clean <- bike_sales %>%
mutate(year = year(month), month_num = month(month))
bike_clean %>% head()
Aim for clear titles, labeled axes, readable scales, and captions when needed.
ggplot(scores, aes(x = gender, y = avg_score)) +
geom_boxplot(outlier.alpha = 0.25) +
labs(title = "Distribution of Average Scores by Gender",
x = NULL, y = "Average Score")
Insight prompts - Which group shows larger variability? - Are there extreme outliers? What could cause them?
mean_by_gender <- scores %>%
group_by(gender) %>%
summarise(mean_avg = mean(avg_score, na.rm = TRUE), .groups = "drop")
ggplot(mean_by_gender, aes(x = reorder(gender, mean_avg), y = mean_avg)) +
geom_col() +
coord_flip() +
labs(title = "Mean Average Score by Gender", x = NULL, y = "Mean Average Score")
ggplot(bike_clean, aes(x = month, y = revenue_usd, color = region, group = region)) +
geom_line(linewidth = 1) +
labs(title = "Monthly Bike Revenue by Region",
x = "Month", y = "Revenue (USD)", color = "Region")
by_cat <- bike_clean %>%
group_by(month, bike_type) %>%
summarise(revenue = sum(revenue_usd, na.rm = TRUE), .groups = "drop")
ggplot(by_cat, aes(x = month, y = revenue, fill = bike_type)) +
geom_area() +
labs(title = "Revenue by Bike Type Over Time",
x = "Month", y = "Revenue (USD)", fill = "Bike Type")
ggplot(bike_clean, aes(x = month, y = revenue_usd, color = bike_type)) +
geom_line(linewidth = .9) +
facet_wrap(~ region, ncol = 2) +
labs(title = "Revenue by Region and Type",
x = "Month", y = "Revenue (USD)", color = "Type")
Keep insights concise, evidence-based, and tied to charts. Avoid speculation without supporting data.
avg_score. Are they concentrated in one gender?avg_score. Experiment with bin widths using
bins =.bike_type
using normalized shares (proportion per month).Sketch of #4:
shares <- by_cat %>%
group_by(month) %>%
mutate(share = revenue / sum(revenue, na.rm = TRUE)) %>%
ungroup()
ggplot(shares, aes(month, share, fill = bike_type)) +
geom_area() +
scale_y_continuous(labels = scales::percent) +
labs(title = "Type Shares Over Time", y = "Share of Monthly Revenue", x = "Month", fill = "Type")
Allowed use cases: - Debug syntax errors or understand error messages - Improve labels, themes, scales for readability - Brainstorm aesthetic options (not logic)
Not allowed: - Full solutions or writing your entire code
Reflection template (if you used AI): 1. What was the exact question or error? 2. What suggestion did AI offer? 3. What did you keep/modify, and why? 4. How did you verify the result?
scores_long <- students_scores %>%
clean_names() %>%
pivot_longer(cols = ends_with("_score"),
names_to = "subject",
values_to = "score")
scores_long %>% head()
subject_means <- scores_long %>%
group_by(subject) %>%
summarise(mean_score = mean(score, na.rm = TRUE), .groups = "drop")
ggplot(subject_means, aes(x = reorder(subject, mean_score), y = mean_score)) +
geom_col() +
coord_flip() +
labs(title = "Mean Score by Subject", x = "Subject", y = "Mean Score")
End of Study Guide