library(tidyverse)
library(janitor)
library(lubridate)
theme_set(theme_minimal(base_size = 12))

Instructions

Use tidyverse (dplyr, tidyr, ggplot2). No Python.
Export required CSVs and PNGs to the project root.
Keep code readable; use pipes and comments.
If you use AI, note the prompt and what it changed, especially for visualization improvements.

Task A - Flight Delays

Wrangling

flights <- readr::read_csv("data/flights_dirty.csv") %>%
  clean_names() %>%
  mutate(
    dep_delay = as.numeric(trimws(as.character(dep_delay))),
    arr_delay = as.numeric(trimws(as.character(arr_delay)))
  ) %>%
  filter(between(dep_delay, -100, 600), between(arr_delay, -100, 600)) %>%
  mutate(airline_code = toupper(str_trim(airline_code))) %>%
  filter(nchar(airline_code) == 2)

## Rows: 140 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): airline_code, tailnum, destination
## dbl (4): flight, dep_delay, arr_delay, distance
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

airlines <- readr::read_csv("data/airlines_lookup.csv")

## Rows: 6 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): airline_code, airline_name
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

flights_cleaned <- flights %>%
  left_join(airlines, by = "airline_code") %>%
  select(flight, airline_code, airline_name, destination, distance, dep_delay, arr_delay)

readr::write_csv(flights_cleaned, "flights_cleaned.csv")

Visualizations

# A1: Departure delay distribution by airline
ggplot(flights_cleaned, aes(x = airline_name, y = dep_delay)) +
  geom_boxplot(outlier.alpha = 0.3) +
  coord_flip() +
  labs(title = "Departure Delay by Airline", x = NULL, y = "Departure delay (min)")

ggsave("fig_flights_delay_by_airline.png", width = 8, height = 5, dpi = 150)

# A2: Mean arrival delay by destination
flights_dest <- flights_cleaned %>%
  group_by(destination) %>%
  summarise(mean_arr = mean(arr_delay, na.rm = TRUE), .groups = "drop") %>%
  arrange(desc(mean_arr))

ggplot(flights_dest, aes(x = reorder(destination, mean_arr), y = mean_arr)) +
  geom_col() +
  coord_flip() +
  labs(title = "Mean Arrival Delay by Destination", x = "Destination", y = "Mean arrival delay (min)")

ggsave("fig_flights_arr_delay_by_dest.png", width = 8, height = 5, dpi = 150)

Insights

IA1: Which airlines show the highest variability in departure delays?
IA2: Which destination(s) have the worst arrival delays? Any plausible reasons?
IA3: Does longer distance tend to reduce or increase arrival delays in this sample?
IA4: AI usage to improve a plot (if any): describe the prompt and the change.

Task B - Retail Sales

Wrangling

sales <- readr::read_csv("data/sales_dirty.csv") %>%
  clean_names() %>%
  mutate(item = str_to_title(str_trim(item))) %>%
  distinct(order_id, item, .keep_all = TRUE) %>%
  mutate(price_usd = if_else(currency == "EUR", price * 1.1, price)) %>%
  filter(price_usd > 0) %>%
  mutate(date_parsed = suppressWarnings(parse_date_time(date, orders = c("mdY", "m/d/y", "Y-m-d")))) %>%
  mutate(year = year(date_parsed), month = month(date_parsed))

## Rows: 212 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): item, category, currency, date
## dbl (2): order_id, price
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

sales_monthly <- sales %>%
  group_by(year, month) %>%
  summarise(monthly_revenue_usd = sum(price_usd, na.rm = TRUE), .groups = "drop")

sales_by_cat <- sales %>%
  group_by(year, month, category) %>%
  summarise(revenue_usd = sum(price_usd, na.rm = TRUE), .groups = "drop")

readr::write_csv(sales, "sales_cleaned.csv")
readr::write_csv(sales_monthly, "sales_monthly_summary.csv")

Visualizations

# B1: Monthly revenue trend
ggplot(sales_monthly, aes(x = as.Date(paste(year, month, 1, sep = "-")), y = monthly_revenue_usd)) +
  geom_line() +
  labs(title = "Monthly Revenue (USD)", x = "Month", y = "Revenue (USD)")

ggsave("fig_sales_monthly_trend.png", width = 8, height = 4.5, dpi = 150)

# B2: Revenue by category over time (stacked area)
sales_by_cat$date <- as.Date(paste(sales_by_cat$year, sales_by_cat$month, 1, sep = "-"))
ggplot(sales_by_cat, aes(x = date, y = revenue_usd, fill = category)) +
  geom_area() +
  labs(title = "Revenue by Category Over Time", x = "Month", y = "Revenue (USD)", fill = "Category")

ggsave("fig_sales_by_category.png", width = 8, height = 5, dpi = 150)

Insights

IB1: Which months are unusually high/low? Hypothesize why (data or business).
IB2: Which category drives most revenue? Is its share stable over time?
IB3: AI usage to improve a plot (if any): describe the prompt and the change.

Submission Checklist

flights_cleaned.csv
fig_flights_delay_by_airline.png
fig_flights_arr_delay_by_dest.png
sales_cleaned.csv
sales_monthly_summary.csv
fig_sales_monthly_trend.png
fig_sales_by_category.png
(Optional) fig_flights_distance_vs_arr.png, fig_sales_top_items.png
Insights answered under each task ```

Midterm — Data Wrangling and Visualizations (R only)

Instructions

Task A - Flight Delays

Wrangling

Visualizations

Insights

Task B - Retail Sales

Wrangling

Visualizations

Insights

Submission Checklist