library(tidyverse)
library(janitor)
library(lubridate)
theme_set(theme_minimal(base_size = 12))

Task A - Flight Delays

Wrangling

flights <- readr::read_csv("data/flights_dirty.csv") %>%
  clean_names() %>%                                  # 1) standardize column names
  mutate(
    dep_delay = as.numeric(trimws(as.character(dep_delay))),
    arr_delay = as.numeric(trimws(as.character(arr_delay)))
  ) %>%                                             # 2) make delays numeric
  filter(between(dep_delay, -100, 600), 
         between(arr_delay, -100, 600)) %>%         # 3) realistic delay bounds
  mutate(airline_code = toupper(str_trim(airline_code))) %>%
  filter(nchar(airline_code) == 2)                  # 4) clean 2-letter codes

airlines <- readr::read_csv("data/airlines_lookup.csv")

flights_cleaned <- flights %>%
  left_join(airlines, by = "airline_code") %>%      # 5) join airline names
  select(flight, airline_code, airline_name, 
         destination, distance, dep_delay, arr_delay)  # 6) keep required cols

head(flights_cleaned)
readr::write_csv(flights_cleaned, "flights_cleaned.csv") # 7) saves in your working directory

Visualizations

# A1: Departure delay distribution by airline
delay_patterns <- ggplot(
  flights_cleaned,
  aes(x = reorder(airline_name, dep_delay, FUN = median, na.rm = TRUE),
      y = dep_delay)
) +
  geom_boxplot(outlier.alpha = 0.3, fill = "lightblue") +
  geom_hline(yintercept = 0, linetype = "dashed") +
  coord_flip() +
  labs(title = "Departure Delay by Airline",
       x = NULL, y = "Departure delay (minutes)")

delay_patterns

ggsave("fig_flights_delay_by_airline.png",
       plot = delay_patterns, width = 8, height = 5, dpi = 150)
# A2: Mean arrival delay by destination
flights_dest <- flights_cleaned %>%
  group_by(destination) %>%
  summarise(mean_arr = mean(arr_delay, na.rm = TRUE), .groups = "drop") %>%
  arrange(desc(mean_arr))

mean_arrival_plot <- ggplot(
  flights_dest,
  aes(x = reorder(destination, mean_arr), y = mean_arr)
) +
  geom_col() +
  coord_flip() +
  geom_hline(yintercept = 0, linetype = "dashed") +
  labs(title = "Mean Arrival Delay by Destination",
       x = "Destination",
       y = "Mean arrival delay (minutes)")

mean_arrival_plot

ggsave("fig_flights_arr_delay_by_dest.png",
       plot = mean_arrival_plot, width = 8, height = 5, dpi = 150)
# A3: Relationship between distance and arrival delay
distance_vs_arr <- ggplot(flights_cleaned,
                          aes(x = distance, y = arr_delay)) +
  geom_point(alpha = 0.4) +
  geom_smooth(method = "lm", se = FALSE) +
  labs(title = "Arrival Delay vs Flight Distance",
       x = "Distance (miles)",
       y = "Arrival delay (minutes)")

distance_vs_arr

ggsave("fig_flights_distance_vs_arr.png",
       plot = distance_vs_arr, width = 8, height = 5, dpi = 150)
# Check the slope sign
model <- lm(arr_delay ~ distance, data = flights_cleaned)
summary(model)
## 
## Call:
## lm(formula = arr_delay ~ distance, data = flights_cleaned)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -65.484 -15.621   3.437  17.256  68.012 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.195347   4.693745   0.255    0.799
## distance    0.002802   0.002955   0.948    0.345
## 
## Residual standard error: 25.05 on 130 degrees of freedom
## Multiple R-squared:  0.006867,   Adjusted R-squared:  -0.000772 
## F-statistic: 0.8989 on 1 and 130 DF,  p-value: 0.3448

Insights

IA1: Which airlines show the highest variability in departure delays?

flights_cleaned %>%
  group_by(airline_name) %>%
  summarise(
    sd_dep_delay = sd(dep_delay, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  arrange(desc(sd_dep_delay))

Southwest shows the highest variability in departure delays, with JetBlue also having relatively high spread. Their boxplots have the widest boxes and whiskers, and their standard deviations of departure delay are the largest among the airlines.

- IA2: Which destination(s) have the worst arrival delays? Any plausible reasons?

flights_dest %>%
  arrange(desc(mean_arr))

The worst average arrival delays are for flights to MCO, followed by LAX, which have the largest positive mean arrival delays in the bar chart. Both are large, busy airports with heavy traffic, so congestion and possible weather disruptions likely contribute to their longer average delays.

- IA3: Does longer distance tend to reduce or increase arrival delays in this sample?

In this sample, longer distance is associated with only a very slight increase in arrival delays. The trend line in the distance–delay scatterplot slopes gently upward, but the regression slope is small and explains almost none of the variation, so distance does not have a meaningful practical effect on arrival delay.

- IA4: AI usage to improve a plot (if any): describe the prompt and the change.

I used AI to improve the clarity of my delay visualizations. I asked for help making the airline and destination plots easier to interpret while keeping the calculations the same. Following the suggestions, I (1) reordered airlines and destinations by their median/mean delay so the worst performers appear at the top of each plot, (2) added a dashed reference line at 0 minutes to separate early/on-time from late flights, and (3) used a single light-blue fill for the boxplots to keep the figure readable instead of using many different colours. These changes made it faster to see which airlines and destinations have the largest delays.

Task B - Retail Sales

Wrangling

sales <- readr::read_csv("data/sales_dirty.csv") %>%
  clean_names() %>%                                           
  mutate(item = str_to_title(str_trim(item))) %>%             # standardize item
  distinct(order_id, item, .keep_all = TRUE) %>%              # remove duplicates
  mutate(price_usd = if_else(currency == "EUR", price * 1.1, price)) %>%   # EUR -> USD
  filter(price_usd > 0) %>%                                   # drop non-positive
  mutate(date_parsed = suppressWarnings(parse_date_time(date, 
                                                        orders = c("mdY", "m/d/y", "Y-m-d")))) %>%
  mutate(year = year(date_parsed), month = month(date_parsed))

sales_monthly <- sales %>%
  group_by(year, month) %>%
  summarise(monthly_revenue_usd = sum(price_usd, na.rm = TRUE), .groups = "drop")

sales_by_cat <- sales %>%
  group_by(year, month, category) %>%
  summarise(revenue_usd = sum(price_usd, na.rm = TRUE), .groups = "drop")

readr::write_csv(sales, "sales_cleaned.csv")
readr::write_csv(sales_monthly, "sales_monthly_summary.csv")

Visualizations

# B1: Monthly revenue trend
ggplot(sales_monthly, aes(x = as.Date(paste(year, month, 1, sep = "-")), y = monthly_revenue_usd)) +
  geom_line() +
  geom_point() +  # show each month
  geom_point(     # highlight November spike
    data   = subset(sales_monthly, month == 11),
    colour = "red", size = 3
  ) +
  scale_x_date(date_breaks = "1 month", date_labels = "%b") +
  labs(title = "Monthly Revenue (USD)", x = "Month (2023)", y = "Revenue (USD)") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

ggsave("fig_sales_monthly_trend.png", width = 8, height = 4.5, dpi = 150)
# B2: Revenue by category over time (stacked area)
sales_by_cat$date <- as.Date(paste(sales_by_cat$year, sales_by_cat$month, 1, sep = "-"))
ggplot(sales_by_cat, aes(x = date, y = revenue_usd, fill = category)) +
  geom_area() +
  labs(title = "Revenue by Category Over Time", x = "Month", y = "Revenue (USD)", fill = "Category")

ggsave("fig_sales_by_category.png", width = 8, height = 5, dpi = 150)
# B3: Top 5 items by total revenue (bar)
sales_top_items <- sales %>%
  group_by(item) %>%
  summarise(total_revenue_usd = sum(price_usd, na.rm = TRUE),
            .groups = "drop") %>%
  arrange(desc(total_revenue_usd)) %>%
  slice_head(n = 5)

top_items_plot <- ggplot(sales_top_items,
                         aes(x = reorder(item, total_revenue_usd),
                             y = total_revenue_usd)) +
  geom_col() +
  labs(title = "Top 5 Items by Total Revenue",
       x = "Item",
       y = "Total revenue (USD)")

top_items_plot

ggsave("fig_sales_top_items.png",
       plot = top_items_plot, width = 7, height = 4.5, dpi = 150)

B3 insight. The top five items by revenue are Cable, Gadget C, Widget A, Widget B, and Adapter. Cable alone generates roughly two-thirds of total revenue, and together these five items account for about 96% of all sales, showing that revenue is highly concentrated in a small set of products.

Insights

- IB1: Which months are unusually high/low? Hypothesize why (data or business).

Monthly revenue is unusually high in November, where sales jump to around $11.6K, far above all other months. This likely reflects seasonal promotions and holiday/Black Friday shopping. The lowest revenue occurs in July (with May also relatively weak), which could represent a slower mid-year period with fewer campaigns or reduced customer demand.

- IB2: Which category drives most revenue? Is its share stable over time?

Overall, Accessories drive the most revenue by a large margin, generating over $13K compared with about $2–2.4K for Gadgets and Widgets. However, its share is not stable over time: Widgets lead in some early months, Gadgets dominate in months like May, and Accessories take over in others (especially November, where they account for almost all revenue). The category mix clearly shifts across the year.

- IB3: AI usage to improve a plot (if any): describe the prompt and the change.

I used AI to refine my sales plots. For B1, I asked how to make the monthly revenue trend easier to read; based on the suggestions I kept a simple line chart, added points for each month, highlighted the large November spike in red, and used month labels on a rotated date axis. For B3, I asked how to clearly show the top-revenue items; AI helped me build a vertical bar chart ordered by total revenue with clear axis titles in USD, making it obvious that Cable generates far more revenue than the other products.