# Load necessary libraries
#tidyverse used for data manipulation, visualization, and analysis.
#readr is used for reading csv files
#readr: A package within tidyverse used for reading CSV files.
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(readr)
# Step 1: Read CSV file, note there are no column names
#we also store the data in "arrival_data"
arrival_data <- read_csv("arrival_delays.csv", col_names = FALSE)
## Rows: 6 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (7): X1, X2, X3, X4, X5, X6, X7
## lgl (1): X8
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Step 2: Assign proper column names
#the datset includes the airline name, the status of the flights (on-time or delayed, the cities that the planes flew out of and an extra column that i need to get rid of later)
colnames(arrival_data) <- c("Airline", "Status", "Los_Angeles", "Phoenix", "San_Diego", "San_Francisco", "Seattle", "Extra_Column")
# Step 3: Remove the extra column ( I kept having an NA column which messed up my result) Keeps only the first 7 columns, discarding the unnecessary Extra_Column.
arrival_data <- arrival_data[, 1:7] # Keep only valid columns
# ✅ Step 4: Remove completely empty rows BEFORE pivoting
#Removes rows where all columns contain NA (missing values).
arrival_data <- arrival_data %>%
filter(!if_all(everything(), is.na)) # Removes rows where ALL columns are NA
# ✅ Step 5: Fix missing Airline names (ensuring delayed flights keep their airline)
#Uses the last known Airline value to fill missing Airline names downward. We had some gaps in the excel with regards to names of the airlines so we have to fill them in.
arrival_data <- arrival_data %>%
fill(Airline, .direction = "down") # Fill missing airline names
# Step 6: Convert to a tidy format (long format)
#Converts the dataset from wide format to long format: Before Pivoting: Columns for each city / After Pivoting: One City column, with corresponding Flights count.
tidy_data <- arrival_data %>%
pivot_longer(cols = c("Los_Angeles", "Phoenix", "San_Diego", "San_Francisco", "Seattle"),
names_to = "City",
values_to = "Flights") %>%
arrange(Airline, Status, City)
# 7: Remove remaining NA rows after pivoting, I kept having issues so I added this, b/c this ensures that no NA values remain in the key columns like Airline, Status and Flights
tidy_data <- tidy_data %>%
filter(!is.na(Airline) & !is.na(Status) & !is.na(Flights))
# Step 8: Convert Status and Flights to the correct format so we are totally consistent
tidy_data <- tidy_data %>%
mutate(Status = tolower(Status),
Flights = as.numeric(Flights)) # Ensures Flights is numeric
# Step 9: Print cleaned dataset just to check that my data is clean and we display the first 30 rows
print(tidy_data, n = 30)
## # A tibble: 20 × 4
## Airline Status City Flights
## <chr> <chr> <chr> <dbl>
## 1 ALASKA delayed Los_Angeles 62
## 2 ALASKA delayed Phoenix 12
## 3 ALASKA delayed San_Diego 20
## 4 ALASKA delayed San_Francisco 102
## 5 ALASKA delayed Seattle 305
## 6 ALASKA on time Los_Angeles 497
## 7 ALASKA on time Phoenix 221
## 8 ALASKA on time San_Diego 212
## 9 ALASKA on time San_Francisco 503
## 10 ALASKA on time Seattle 1841
## 11 AM WEST delayed Los_Angeles 117
## 12 AM WEST delayed Phoenix 415
## 13 AM WEST delayed San_Diego 65
## 14 AM WEST delayed San_Francisco 129
## 15 AM WEST delayed Seattle 61
## 16 AM WEST on time Los_Angeles 694
## 17 AM WEST on time Phoenix 4840
## 18 AM WEST on time San_Diego 383
## 19 AM WEST on time San_Francisco 320
## 20 AM WEST on time Seattle 201
# 1. Summary Statistics: Compare On-time vs Delayed Flights per Airline
#Groups data by airline and status (on-time vs delayeed)
#Calculates total flights per group and uses na.rm to ignore missing values
#
summary_stats <- tidy_data %>%
group_by(Airline, Status) %>%
summarise(Total_Flights = sum(Flights, na.rm = TRUE), .groups = "drop")
# Print summary statistics, this shows us the total number of delayed / on time flights for Alaska and AM West, what really matters though is the delay percentage which we will discuss a little later on.
print(summary_stats)
## # A tibble: 4 × 3
## Airline Status Total_Flights
## <chr> <chr> <dbl>
## 1 ALASKA delayed 501
## 2 ALASKA on time 3274
## 3 AM WEST delayed 787
## 4 AM WEST on time 6438
# 2. Calculate Delay Percentage for Each Airline
delay_percentage <- summary_stats %>%
pivot_wider(names_from = Status, values_from = Total_Flights, values_fill = list(Total_Flights = 0)) %>%
mutate(Delay_Percentage = delayed / (delayed + `on time`) * 100)
# Print delay percentage table
print(delay_percentage)
## # A tibble: 2 × 4
## Airline delayed `on time` Delay_Percentage
## <chr> <dbl> <dbl> <dbl>
## 1 ALASKA 501 3274 13.3
## 2 AM WEST 787 6438 10.9
#We can see that Alaska has a 13% delay percentage vs 11% for AM West. Clearly AM West is the better airline as it's delay percentage is lower.