assignment--4-daniel-H.knit

# Load necessary libraries
#tidyverse used for data manipulation, visualization, and analysis.
#readr is used for reading csv files
#readr: A package within tidyverse used for reading CSV files.

library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.4     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(readr)

# Step 1: Read CSV file, note there are no column names
#we also store the data in "arrival_data"
arrival_data <- read_csv("arrival_delays.csv", col_names = FALSE)

## Rows: 6 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (7): X1, X2, X3, X4, X5, X6, X7
## lgl (1): X8
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# Step 2: Assign proper column names
#the datset includes the airline name, the status of the flights (on-time or delayed, the cities that the planes flew out of and an extra column that i need to get rid of later)

colnames(arrival_data) <- c("Airline", "Status", "Los_Angeles", "Phoenix", "San_Diego", "San_Francisco", "Seattle", "Extra_Column")

# Step 3: Remove the extra column ( I kept having an NA column which messed up my result) Keeps only the first 7 columns, discarding the unnecessary Extra_Column.
arrival_data <- arrival_data[, 1:7]  # Keep only valid columns

# ✅ Step 4: Remove completely empty rows BEFORE pivoting
#Removes rows where all columns contain NA (missing values).

arrival_data <- arrival_data %>%
  filter(!if_all(everything(), is.na))  # Removes rows where ALL columns are NA

# ✅ Step 5: Fix missing Airline names (ensuring delayed flights keep their airline)
#Uses the last known Airline value to fill missing Airline names downward. We had some gaps in the excel with regards to names of the airlines so we have to fill them in.
arrival_data <- arrival_data %>%
  fill(Airline, .direction = "down")  # Fill missing airline names

# Step 6: Convert to a tidy format (long format)

#Converts the dataset from wide format to long format: Before Pivoting: Columns for each city / After Pivoting: One City column, with corresponding Flights count.

tidy_data <- arrival_data %>%
  pivot_longer(cols = c("Los_Angeles", "Phoenix", "San_Diego", "San_Francisco", "Seattle"),
               names_to = "City",
               values_to = "Flights") %>%
  arrange(Airline, Status, City)

# 7: Remove remaining NA rows after pivoting, I kept having issues so I added this, b/c this ensures that no NA values remain in the key columns like Airline, Status and Flights
tidy_data <- tidy_data %>%
  filter(!is.na(Airline) & !is.na(Status) & !is.na(Flights))

# Step 8: Convert Status and Flights to the correct format so we are totally consistent
tidy_data <- tidy_data %>%
  mutate(Status = tolower(Status),
         Flights = as.numeric(Flights))  # Ensures Flights is numeric

# Step 9: Print cleaned dataset just to check that my data is clean and we display the first 30 rows
print(tidy_data, n = 30)

## # A tibble: 20 × 4
##    Airline Status  City          Flights
##    <chr>   <chr>   <chr>           <dbl>
##  1 ALASKA  delayed Los_Angeles        62
##  2 ALASKA  delayed Phoenix            12
##  3 ALASKA  delayed San_Diego          20
##  4 ALASKA  delayed San_Francisco     102
##  5 ALASKA  delayed Seattle           305
##  6 ALASKA  on time Los_Angeles       497
##  7 ALASKA  on time Phoenix           221
##  8 ALASKA  on time San_Diego         212
##  9 ALASKA  on time San_Francisco     503
## 10 ALASKA  on time Seattle          1841
## 11 AM WEST delayed Los_Angeles       117
## 12 AM WEST delayed Phoenix           415
## 13 AM WEST delayed San_Diego          65
## 14 AM WEST delayed San_Francisco     129
## 15 AM WEST delayed Seattle            61
## 16 AM WEST on time Los_Angeles       694
## 17 AM WEST on time Phoenix          4840
## 18 AM WEST on time San_Diego         383
## 19 AM WEST on time San_Francisco     320
## 20 AM WEST on time Seattle           201

#  1. Summary Statistics: Compare On-time vs Delayed Flights per Airline

#Groups data by airline and status (on-time vs delayeed)
#Calculates total flights per group and uses na.rm to ignore missing values
#
summary_stats <- tidy_data %>%
  group_by(Airline, Status) %>%
  summarise(Total_Flights = sum(Flights, na.rm = TRUE), .groups = "drop")

# Print summary statistics, this shows us the total number of delayed / on time flights for Alaska and AM West, what really matters though is the delay percentage which we will discuss a little later on.
print(summary_stats)

## # A tibble: 4 × 3
##   Airline Status  Total_Flights
##   <chr>   <chr>           <dbl>
## 1 ALASKA  delayed           501
## 2 ALASKA  on time          3274
## 3 AM WEST delayed           787
## 4 AM WEST on time          6438

#  2. Calculate Delay Percentage for Each Airline
delay_percentage <- summary_stats %>%
  pivot_wider(names_from = Status, values_from = Total_Flights, values_fill = list(Total_Flights = 0)) %>%
  mutate(Delay_Percentage = delayed / (delayed + `on time`) * 100)

# Print delay percentage table
print(delay_percentage)

## # A tibble: 2 × 4
##   Airline delayed `on time` Delay_Percentage
##   <chr>     <dbl>     <dbl>            <dbl>
## 1 ALASKA      501      3274             13.3
## 2 AM WEST     787      6438             10.9

#We can see that Alaska has a 13% delay percentage vs 11% for AM West. Clearly AM West is the better airline as it's delay percentage is lower.