library(tidyverse)
library(janitor)
library(lubridate)
theme_set(theme_minimal(base_size = 12))
readr::read_csv("C:/Users/jacin/Downloads/airlines_lookup.csv")
## Rows: 6 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): airline_code, airline_name
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## # A tibble: 6 × 2
## airline_code airline_name
## <chr> <chr>
## 1 AA American Airlines
## 2 DL Delta Air Lines
## 3 UA United Airlines
## 4 B6 JetBlue
## 5 WN Southwest
## 6 AS Alaska Airlines
readr::read_csv("C:/Users/jacin/Downloads/flights_dirty.csv")
## Rows: 140 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): airline_code, tailnum, destination
## dbl (4): flight, dep_delay, arr_delay, distance
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## # A tibble: 140 × 7
## flight airline_code tailnum dep_delay arr_delay destination distance
## <dbl> <chr> <chr> <dbl> <dbl> <chr> <dbl>
## 1 9716 WN N79638 46 44 MCO 1546
## 2 5076 AA N26690 -2 -23 LAX 1352
## 3 3595 b6 N67382 2 -1 ORD 1529
## 4 8120 b6 N62798 15 12 MCO 877
## 5 488 DL N26878 31 28 CLT 1836
## 6 7493 DL N90999 20 27 ATL 925
## 7 2711 DL N35642 -19 -18 ATL 799
## 8 7740 UA N36819 12 -999 PHX 1997
## 9 1813 UA N39790 8 1 CLT 2011
## 10 5102 DL N20660 -1 -16 ATL 458
## # ℹ 130 more rows
flights <- readr::read_csv("C:/Users/jacin/Downloads/flights_dirty.csv") %>%
clean_names() %>%
mutate(
dep_delay = as.numeric(trimws(as.character(dep_delay))),
arr_delay = as.numeric(trimws(as.character(arr_delay)))
) %>%
filter(between(dep_delay, -100, 600), between(arr_delay, -100, 600)) %>%
mutate(airline_code = toupper(str_trim(airline_code))) %>%
filter(nchar(airline_code) == 2)
## Rows: 140 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): airline_code, tailnum, destination
## dbl (4): flight, dep_delay, arr_delay, distance
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
airlines <- readr::read_csv("C:/Users/jacin/Downloads/airlines_lookup.csv")
## Rows: 6 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): airline_code, airline_name
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
flights_cleaned <- flights %>%
left_join(airlines, by = "airline_code") %>%
select(flight, airline_code, airline_name, destination, distance, dep_delay, arr_delay)
readr::write_csv(flights_cleaned, "flights_cleaned.csv")
# A1: Departure delay distribution by airline
ggplot(flights_cleaned, aes(x = airline_name, y = dep_delay)) +
geom_boxplot(outlier.alpha = 0.3) +
coord_flip() +
labs(title = "Departure Delay by Airline", x = NULL, y = "Departure delay (min)")
ggsave("fig_flights_delay_by_airline.png", width = 8, height = 5, dpi = 150)
# A2: Mean arrival delay by destination
flights_dest <- flights_cleaned %>%
group_by(destination) %>%
summarise(mean_arr = mean(arr_delay, na.rm = TRUE), .groups = "drop") %>%
arrange(desc(mean_arr))
ggplot(flights_dest, aes(x = reorder(destination, mean_arr), y = mean_arr)) +
geom_col() +
coord_flip() +
labs(title = "Mean Arrival Delay by Destination", x = "Destination", y = "Mean arrival delay (min)")
ggsave("fig_flights_arr_delay_by_dest.png", width = 8, height = 5, dpi = 150)
#A3 optional
ggplot(flights_cleaned, aes(x = distance, y = arr_delay)) +
geom_smooth(outlier.alpha = 0.3) +
coord_flip() +
labs(title = "Relationship Between Distance and Arrival Delay", x = "Distance (miles)", y = "Arrival delay (min)")
## Warning in geom_smooth(outlier.alpha = 0.3): Ignoring unknown parameters:
## `outlier.alpha`
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
ggsave("fig_flights_distance_vs_arr.png", width = 8, height = 5, dpi = 150)
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
#We can see the variability by looking at the length of a plot. So for this boxplot, we look from the lowest point to the highest point or the (farthest outlier), because that represents the range or variability of the data. United Airlines and JetBlue show the highest variability in their departure delay data based on their IQR, while Alaska Airlines and Southwest show the largest overall range and greatest variability in their upper 25% of delays.
#The destinations with the worst arrival delays are MCO (Orlando) and LAX (Los Angeles). MCO has the highest mean arrival delay, at about 17 minutes. LAX has the second-highest mean arrival delay, at about 15 minutes. Some potential reasons for this may be that delays at major airports like LAX and MCO are typically due to the high traffic volume of these locations and the weather. Orlando is known for frequent storms, that could possibly affect travel.
#The link between flight distance and being late seems to be unpredictable. For short trips (under 1,000 miles), flights start on time but delays quickly increase to about 10 minutes. This may be due to sall issues popping up on flights. Interestingly, the medium distance flights (1,000 to 2,000 miles) are the most reliable, seeing those delays drop back down near zero. But for the longest cross-country flights (over 2,000 miles); their delays jump significantly to over 20 minutes, probably because earlier smaller problems can pile up over the long route.
readr::read_csv("C:/Users/jacin/Downloads/sales_dirty.csv")
## Rows: 212 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): item, category, currency, date
## dbl (2): order_id, price
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## # A tibble: 212 × 6
## order_id item category currency price date
## <dbl> <chr> <chr> <chr> <dbl> <chr>
## 1 5101 GADGET C Widgets EUR 52.4 07/22/23
## 2 5035 gadget c Accessories USD 35.4 2023-06-21
## 3 5308 Cable Gadgets USD 48.8 2023-12-10
## 4 5158 Widget C Accessories EUR 35.4 04-11-2023
## 5 5084 Cable Gadgets EUR 34.8 10-21-2023
## 6 5291 GADGET C Accessories EUR 53.9 2023-04-12
## 7 5015 Cable Accessories USD 55.1 2023-01-16
## 8 5288 Widget A Widgets USD 41.8 2023-02-21
## 9 5198 Cable Widgets EUR 39.1 09/27/23
## 10 5353 GADGET C Gadgets USD 29.6 2023-03-20
## # ℹ 202 more rows
sales <- readr::read_csv("C:/Users/jacin/Downloads/sales_dirty.csv") %>%
clean_names() %>%
mutate(item = str_to_title(str_trim(item))) %>%
distinct(order_id, item, .keep_all = TRUE) %>%
mutate(price_usd = if_else(currency == "EUR", price * 1.1, price)) %>%
filter(price_usd > 0) %>%
mutate(date_parsed = suppressWarnings(parse_date_time(date, orders = c("mdY", "m/d/y", "Y-m-d")))) %>%
mutate(year = year(date_parsed), month = month(date_parsed))
## Rows: 212 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): item, category, currency, date
## dbl (2): order_id, price
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
sales_monthly <- sales %>%
group_by(year, month) %>%
summarise(monthly_revenue_usd = sum(price_usd, na.rm = TRUE), .groups = "drop")
sales_by_cat <- sales %>%
group_by(year, month, category) %>%
summarise(revenue_usd = sum(price_usd, na.rm = TRUE), .groups = "drop")
readr::write_csv(sales, "sales_cleaned.csv")
readr::write_csv(sales_monthly, "sales_monthly_summary.csv")
# B1: Monthly revenue trend
ggplot(sales_monthly, aes(x = as.Date(paste(year, month, 1, sep = "-")), y = monthly_revenue_usd)) +
geom_line() +
labs(title = "Monthly Revenue (USD)", x = "Month", y = "Revenue (USD)")
ggsave("fig_sales_monthly_trend.png", width = 8, height = 4.5, dpi = 150)
# B2: Revenue by category over time (stacked area)
sales_by_cat$date <- as.Date(paste(sales_by_cat$year, sales_by_cat$month, 1, sep = "-"))
ggplot(sales_by_cat, aes(x = date, y = revenue_usd, fill = category)) +
geom_area() +
labs(title = "Revenue by Category Over Time", x = "Month", y = "Revenue (USD)", fill = "Category")
ggsave("fig_sales_by_category.png", width = 8, height = 5, dpi = 150)
#B3 optional
top_5_items <- sales %>%
group_by(item) %>%
summarise(Total_Revenue = sum(price_usd, na.rm = TRUE)) %>%
arrange(desc(Total_Revenue)) %>%
slice_head(n = 5)
ggplot(top_5_items, aes(x = reorder(item, Total_Revenue), y = Total_Revenue)) +
geom_col() +
coord_flip() +
labs(
title = "Top 5 Items by Total Revenue", x = "Item", y = "Total Revenue (USD)"
)
ggsave("fig_sales_top_items.png", width = 8, height = 4.5, dpi = 150)
#The month of November 2023 is unusually high, showing a spike in revenue, peaking at close to $12,000. The large amount of technology items suggests this is due to holiday shopping, probably a mixture of Black Friday and Cyber Monday. Compared to November, months January 2023 through October 2023, and December 2023, are unusually low. The data for December 2023 and July 2023 drops fairly lower than the other months. This may be due to seasonal trends and changes such as summer coming to an end and the back to school season and the holiday shopping coming to an end with more consumers slowind down in spending.
#The Accessories category drives the majority of the revenue. The share of Accessories is highly unstable over time with a large spike in November. From January 2023 to October 2023, accessories consistently contributed a significant portion of the baseline revenue, leading Gadgets and Widgets. While the other categories remained fairly consistent throughout the year, the solo spike in November 2023, is what causes this category to be considered unstable over time. The “Top 5 Items by Total Revenue” plot confirms this as well. The top item “Cable” is an accessory, and accounts for nearly the entire total revenue for that month, at approximately $12,000.
#I didn’t use AI to improve the plot, but for this question I asked ChatGPT “how do I color code by item?” It shared that I need to map fill by item. So I added map fill = item inside the aes() of my ggplot call. Below is my updated code. I removed the additional legen on the righ too.
ggplot(top_5_items, aes(x = reorder(item, Total_Revenue),
y = Total_Revenue, fill = item)) +
geom_col() +
coord_flip() +
labs(
title = "Top 5 Items by Total Revenue", x = "Item", y = "Total Revenue (USD)"
) + theme_minimal() +
theme(legend.position = "none")