library(tidyverse)
library(janitor)
library(lubridate)
theme_set(theme_minimal(base_size = 12))
flights <- readr::read_csv("data/flights_dirty.csv") %>%
clean_names() %>% # 1) standardize column names
mutate(
dep_delay = as.numeric(trimws(as.character(dep_delay))),
arr_delay = as.numeric(trimws(as.character(arr_delay)))
) %>% # 2) make delays numeric
filter(between(dep_delay, -100, 600),
between(arr_delay, -100, 600)) %>% # 3) realistic delay bounds
mutate(airline_code = toupper(str_trim(airline_code))) %>%
filter(nchar(airline_code) == 2) # 4) clean 2-letter codes
airlines <- readr::read_csv("data/airlines_lookup.csv")
flights_cleaned <- flights %>%
left_join(airlines, by = "airline_code") %>% # 5) join airline names
select(flight, airline_code, airline_name,
destination, distance, dep_delay, arr_delay) # 6) keep required cols
head(flights_cleaned)
readr::write_csv(flights_cleaned, "flights_cleaned.csv") # 7) saves in your working directory
# A1: Departure delay distribution by airline
delay_patterns <- ggplot(
flights_cleaned,
aes(x = reorder(airline_name, dep_delay, FUN = median, na.rm = TRUE),
y = dep_delay)
) +
geom_boxplot(outlier.alpha = 0.3, fill = "lightblue") +
geom_hline(yintercept = 0, linetype = "dashed") +
coord_flip() +
labs(title = "Departure Delay by Airline",
x = NULL, y = "Departure delay (minutes)")
delay_patterns
ggsave("fig_flights_delay_by_airline.png",
plot = delay_patterns, width = 8, height = 5, dpi = 150)
# A2: Mean arrival delay by destination
flights_dest <- flights_cleaned %>%
group_by(destination) %>%
summarise(mean_arr = mean(arr_delay, na.rm = TRUE), .groups = "drop") %>%
arrange(desc(mean_arr))
mean_arrival_plot <- ggplot(
flights_dest,
aes(x = reorder(destination, mean_arr), y = mean_arr)
) +
geom_col() +
coord_flip() +
geom_hline(yintercept = 0, linetype = "dashed") +
labs(title = "Mean Arrival Delay by Destination",
x = "Destination",
y = "Mean arrival delay (minutes)")
mean_arrival_plot
ggsave("fig_flights_arr_delay_by_dest.png",
plot = mean_arrival_plot, width = 8, height = 5, dpi = 150)
# A3: Relationship between distance and arrival delay
distance_vs_arr <- ggplot(flights_cleaned,
aes(x = distance, y = arr_delay)) +
geom_point(alpha = 0.4) +
geom_smooth(method = "lm", se = FALSE) +
labs(title = "Arrival Delay vs Flight Distance",
x = "Distance (miles)",
y = "Arrival delay (minutes)")
distance_vs_arr
ggsave("fig_flights_distance_vs_arr.png",
plot = distance_vs_arr, width = 8, height = 5, dpi = 150)
# Check the slope sign
model <- lm(arr_delay ~ distance, data = flights_cleaned)
summary(model)
##
## Call:
## lm(formula = arr_delay ~ distance, data = flights_cleaned)
##
## Residuals:
## Min 1Q Median 3Q Max
## -65.484 -15.621 3.437 17.256 68.012
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.195347 4.693745 0.255 0.799
## distance 0.002802 0.002955 0.948 0.345
##
## Residual standard error: 25.05 on 130 degrees of freedom
## Multiple R-squared: 0.006867, Adjusted R-squared: -0.000772
## F-statistic: 0.8989 on 1 and 130 DF, p-value: 0.3448
flights_cleaned %>%
group_by(airline_name) %>%
summarise(
sd_dep_delay = sd(dep_delay, na.rm = TRUE),
.groups = "drop"
) %>%
arrange(desc(sd_dep_delay))
Southwest shows the highest variability in departure delays, with JetBlue also having relatively high spread. Their boxplots have the widest boxes and whiskers, and their standard deviations of departure delay are the largest among the airlines.
flights_dest %>%
arrange(desc(mean_arr))
The worst average arrival delays are for flights to MCO, followed by LAX, which have the largest positive mean arrival delays in the bar chart. Both are large, busy airports with heavy traffic, so congestion and possible weather disruptions likely contribute to their longer average delays.
In this sample, longer distance is associated with only a very slight increase in arrival delays. The trend line in the distance–delay scatterplot slopes gently upward, but the regression slope is small and explains almost none of the variation, so distance does not have a meaningful practical effect on arrival delay.
I used AI to improve the clarity of my delay visualizations. I asked for help making the airline and destination plots easier to interpret while keeping the calculations the same. Following the suggestions, I (1) reordered airlines and destinations by their median/mean delay so the worst performers appear at the top of each plot, (2) added a dashed reference line at 0 minutes to separate early/on-time from late flights, and (3) used a single light-blue fill for the boxplots to keep the figure readable instead of using many different colours. These changes made it faster to see which airlines and destinations have the largest delays.
sales <- readr::read_csv("data/sales_dirty.csv") %>%
clean_names() %>%
mutate(item = str_to_title(str_trim(item))) %>% # standardize item
distinct(order_id, item, .keep_all = TRUE) %>% # remove duplicates
mutate(price_usd = if_else(currency == "EUR", price * 1.1, price)) %>% # EUR -> USD
filter(price_usd > 0) %>% # drop non-positive
mutate(date_parsed = suppressWarnings(parse_date_time(date,
orders = c("mdY", "m/d/y", "Y-m-d")))) %>%
mutate(year = year(date_parsed), month = month(date_parsed))
sales_monthly <- sales %>%
group_by(year, month) %>%
summarise(monthly_revenue_usd = sum(price_usd, na.rm = TRUE), .groups = "drop")
sales_by_cat <- sales %>%
group_by(year, month, category) %>%
summarise(revenue_usd = sum(price_usd, na.rm = TRUE), .groups = "drop")
readr::write_csv(sales, "sales_cleaned.csv")
readr::write_csv(sales_monthly, "sales_monthly_summary.csv")
# B1: Monthly revenue trend
ggplot(sales_monthly, aes(x = as.Date(paste(year, month, 1, sep = "-")), y = monthly_revenue_usd)) +
geom_line() +
geom_point() + # show each month
geom_point( # highlight November spike
data = subset(sales_monthly, month == 11),
colour = "red", size = 3
) +
scale_x_date(date_breaks = "1 month", date_labels = "%b") +
labs(title = "Monthly Revenue (USD)", x = "Month (2023)", y = "Revenue (USD)") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
ggsave("fig_sales_monthly_trend.png", width = 8, height = 4.5, dpi = 150)
# B2: Revenue by category over time (stacked area)
sales_by_cat$date <- as.Date(paste(sales_by_cat$year, sales_by_cat$month, 1, sep = "-"))
ggplot(sales_by_cat, aes(x = date, y = revenue_usd, fill = category)) +
geom_area() +
labs(title = "Revenue by Category Over Time", x = "Month", y = "Revenue (USD)", fill = "Category")
ggsave("fig_sales_by_category.png", width = 8, height = 5, dpi = 150)
# B3: Top 5 items by total revenue (bar)
sales_top_items <- sales %>%
group_by(item) %>%
summarise(total_revenue_usd = sum(price_usd, na.rm = TRUE),
.groups = "drop") %>%
arrange(desc(total_revenue_usd)) %>%
slice_head(n = 5)
top_items_plot <- ggplot(sales_top_items,
aes(x = reorder(item, total_revenue_usd),
y = total_revenue_usd)) +
geom_col() +
labs(title = "Top 5 Items by Total Revenue",
x = "Item",
y = "Total revenue (USD)")
top_items_plot
ggsave("fig_sales_top_items.png",
plot = top_items_plot, width = 7, height = 4.5, dpi = 150)
B3 insight. The top five items by revenue are Cable, Gadget C, Widget A, Widget B, and Adapter. Cable alone generates roughly two-thirds of total revenue, and together these five items account for about 96% of all sales, showing that revenue is highly concentrated in a small set of products.
Monthly revenue is unusually high in November, where sales jump to around $11.6K, far above all other months. This likely reflects seasonal promotions and holiday/Black Friday shopping. The lowest revenue occurs in July (with May also relatively weak), which could represent a slower mid-year period with fewer campaigns or reduced customer demand.
I used AI to refine my sales plots. For B1, I asked how to make the monthly revenue trend easier to read; based on the suggestions I kept a simple line chart, added points for each month, highlighted the large November spike in red, and used month labels on a rotated date axis. For B3, I asked how to clearly show the top-revenue items; AI helped me build a vertical bar chart ordered by total revenue with clear axis titles in USD, making it obvious that Cable generates far more revenue than the other products.