Loading the libraries:

library(dplyr)
## Warning: package 'dplyr' was built under R version 4.4.2
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)
library(ggplot2)

Question 1:

Firstly, I enter the data in a excel file and save it as CSV file.

Questions 2:

Read the the raw data:

Untidy_data <- read.csv("C:/Users/zahid/OneDrive/Desktop/Assignment 5/Tidy and transforming dataset.csv")

# Assign column names to the matrix
colnames(Untidy_data) <- c("Airline", "Status", "Los Angeles", "Phoenix", "San Diego", "San Francisco", "Seattle")

Convert the matrix to a data frame:

Tidy_data <- as.data.frame(Untidy_data)

# View the data frame
print(Tidy_data)
##   Airline  Status Los Angeles Phoenix San Diego San Francisco Seattle
## 1  ALASKA On Time         497     221       212           503    1841
## 2  ALASKA Delayed          62      12        20           102     305
## 3 AM WEST On Time         694    4840       383           320     201
## 4 AM WEST Delayed         117     415        65           129      61

Question 3:

# Step 1: Filter the data to include only delayed flights
delayed_data <- Tidy_data %>%
  filter(Status == "Delayed")

print(delayed_data)
##   Airline  Status Los Angeles Phoenix San Diego San Francisco Seattle
## 1  ALASKA Delayed          62      12        20           102     305
## 2 AM WEST Delayed         117     415        65           129      61

Summarize the delayed flights for each airline

summary_delayed <- delayed_data %>%
  group_by(Airline) %>%
  summarise(
    Total_Delayed = sum(`Los Angeles`, `Phoenix`, `San Diego`, `San Francisco`, `Seattle`, na.rm = TRUE),
    Mean_Delay = mean(c(`Los Angeles`, `Phoenix`, `San Diego`, `San Francisco`, `Seattle`), na.rm = TRUE),
    Median_Delay = median(c(`Los Angeles`, `Phoenix`, `San Diego`, `San Francisco`, `Seattle`), na.rm = TRUE),
    .groups = "drop"
  )

# Print the summary
print(summary_delayed)
## # A tibble: 2 × 4
##   Airline Total_Delayed Mean_Delay Median_Delay
##   <chr>           <int>      <dbl>        <int>
## 1 ALASKA            501       100.           62
## 2 AM WEST           787       157.          117

Step 3: Visualize the comparison of delayed flights by airline

ggplot(delayed_data, aes(x = Airline, y = `Los Angeles` + `Phoenix` + `San Diego` + `San Francisco` + `Seattle`, fill = Airline)) +
  geom_bar(stat = "identity", position = "dodge") +
  labs(
    title = "Comparison of Arrival Delays for Airlines",
    x = "Airline",
    y = "Total Number of Delayed Flights"
  ) +
  theme_minimal()

ggplot(summary_delayed, aes(x = Airline, y = Mean_Delay, fill = Airline)) +
  geom_bar(stat = "identity", position = "dodge") +
  labs(
    title = "Mean Arrival Delay for Airlines",
    x = "Airline",
    y = "Mean Delay (Minutes)"
  ) +
  theme_minimal()