Loading the libraries:
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.4.2
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
library(ggplot2)
Question 1:
Firstly, I enter the data in a excel file and save it as CSV
file.
Questions 2:
Read the the raw data:
Untidy_data <- read.csv("C:/Users/zahid/OneDrive/Desktop/Assignment 5/Tidy and transforming dataset.csv")
# Assign column names to the matrix
colnames(Untidy_data) <- c("Airline", "Status", "Los Angeles", "Phoenix", "San Diego", "San Francisco", "Seattle")
Convert the matrix to a data frame:
Tidy_data <- as.data.frame(Untidy_data)
# View the data frame
print(Tidy_data)
## Airline Status Los Angeles Phoenix San Diego San Francisco Seattle
## 1 ALASKA On Time 497 221 212 503 1841
## 2 ALASKA Delayed 62 12 20 102 305
## 3 AM WEST On Time 694 4840 383 320 201
## 4 AM WEST Delayed 117 415 65 129 61
Question 3:
# Step 1: Filter the data to include only delayed flights
delayed_data <- Tidy_data %>%
filter(Status == "Delayed")
print(delayed_data)
## Airline Status Los Angeles Phoenix San Diego San Francisco Seattle
## 1 ALASKA Delayed 62 12 20 102 305
## 2 AM WEST Delayed 117 415 65 129 61
Summarize the delayed flights for each airline
summary_delayed <- delayed_data %>%
group_by(Airline) %>%
summarise(
Total_Delayed = sum(`Los Angeles`, `Phoenix`, `San Diego`, `San Francisco`, `Seattle`, na.rm = TRUE),
Mean_Delay = mean(c(`Los Angeles`, `Phoenix`, `San Diego`, `San Francisco`, `Seattle`), na.rm = TRUE),
Median_Delay = median(c(`Los Angeles`, `Phoenix`, `San Diego`, `San Francisco`, `Seattle`), na.rm = TRUE),
.groups = "drop"
)
# Print the summary
print(summary_delayed)
## # A tibble: 2 × 4
## Airline Total_Delayed Mean_Delay Median_Delay
## <chr> <int> <dbl> <int>
## 1 ALASKA 501 100. 62
## 2 AM WEST 787 157. 117
Step 3: Visualize the comparison of delayed flights by airline
ggplot(delayed_data, aes(x = Airline, y = `Los Angeles` + `Phoenix` + `San Diego` + `San Francisco` + `Seattle`, fill = Airline)) +
geom_bar(stat = "identity", position = "dodge") +
labs(
title = "Comparison of Arrival Delays for Airlines",
x = "Airline",
y = "Total Number of Delayed Flights"
) +
theme_minimal()

ggplot(summary_delayed, aes(x = Airline, y = Mean_Delay, fill = Airline)) +
geom_bar(stat = "identity", position = "dodge") +
labs(
title = "Mean Arrival Delay for Airlines",
x = "Airline",
y = "Mean Delay (Minutes)"
) +
theme_minimal()
