#Assignment 4
#By Jose Fuentes
# 1st and 2nd Part
# Load the necessary library that contains tidyr and dplyr
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Load the raw airline CSV from its absolute location on disk
data <- read.csv(file = "C:/Users/Dell/Downloads/assignment4.csv")
# Inspect the raw import: column names first, then the column types
print("Initial Column Names:")
## [1] "Initial Column Names:"
print(names(data))
## [1] "X" "X.1" "Los.Angeles" "Phoenix"
## [5] "San.Diego" "San.Francisco" "Seattle" "X.2"
## [9] "X.3" "X.4"
print("Structure of the Data:")
## [1] "Structure of the Data:"
str(object = data)
## 'data.frame': 13 obs. of 10 variables:
## $ X : chr "ALASKA" "ALASKA" "AM WEST" "AM WEST" ...
## $ X.1 : chr "on time" "delayed" "on time" "delayed" ...
## $ Los.Angeles : int 497 62 694 117 NA NA NA NA NA NA ...
## $ Phoenix : int 221 12 4840 415 NA NA NA NA NA NA ...
## $ San.Diego : int 212 20 385 65 NA NA NA NA NA NA ...
## $ San.Francisco: int 503 102 320 129 NA NA NA NA NA NA ...
## $ Seattle : int 1841 305 201 61 NA NA NA NA NA NA ...
## $ X.2 : logi NA NA NA NA NA NA ...
## $ X.3 : logi NA NA NA NA NA NA ...
## $ X.4 : logi NA NA NA NA NA NA ...
# Preview the top six rows of the untouched import
print("First Few Rows of the Original Data:")
## [1] "First Few Rows of the Original Data:"
print(head(data, n = 6L))
## X X.1 Los.Angeles Phoenix San.Diego San.Francisco Seattle X.2 X.3
## 1 ALASKA on time 497 221 212 503 1841 NA NA
## 2 ALASKA delayed 62 12 20 102 305 NA NA
## 3 AM WEST on time 694 4840 385 320 201 NA NA
## 4 AM WEST delayed 117 415 65 129 61 NA NA
## 5 NA NA NA NA NA NA NA
## 6 NA NA NA NA NA NA NA
## X.4
## 1 NA
## 2 NA
## 3 NA
## 4 NA
## 5 NA
## 6 NA
# Per-column summary; the "NA's" entries show how many values are missing
print("Summary of NA Values:")
## [1] "Summary of NA Values:"
print(summary(object = data))
## X X.1 Los.Angeles Phoenix
## Length:13 Length:13 Min. : 62.0 Min. : 12.0
## Class :character Class :character 1st Qu.:103.2 1st Qu.: 168.8
## Mode :character Mode :character Median :307.0 Median : 318.0
## Mean :342.5 Mean :1372.0
## 3rd Qu.:546.2 3rd Qu.:1521.2
## Max. :694.0 Max. :4840.0
## NA's :9 NA's :9
## San.Diego San.Francisco Seattle X.2 X.3
## Min. : 20.00 Min. :102.0 Min. : 61 Mode:logical Mode:logical
## 1st Qu.: 53.75 1st Qu.:122.2 1st Qu.: 166 NA's:13 NA's:13
## Median :138.50 Median :224.5 Median : 253
## Mean :170.50 Mean :263.5 Mean : 602
## 3rd Qu.:255.25 3rd Qu.:365.8 3rd Qu.: 689
## Max. :385.00 Max. :503.0 Max. :1841
## NA's :9 NA's :9 NA's :9
## X.4
## Mode:logical
## NA's:13
##
##
##
##
##
# Report the size of the import as (rows, columns)
print("Dimensions of the Data:")
## [1] "Dimensions of the Data:"
print(c(nrow(data), ncol(data)))
## [1] 13 10
# Clean and tidy the dataset (Parts 1 and 2).
# Result: one row per Airline x City holding the on-time flight count.
data <- data %>%
  # Keep only the labelled columns under tidy names; this also drops the
  # all-NA padding columns X.2, X.3 and X.4.
  select(
    Airline = X,
    Status = X.1,
    Los_Angeles = Los.Angeles,
    Phoenix,
    San_Diego = San.Diego,
    San_Francisco = San.Francisco,
    Seattle
  ) %>%
  # Drop the padding rows. Their text cells are read as "" rather than NA
  # (they print as blanks, not <NA>), so an "every column is NA" test never
  # matches them; test the airline label directly instead.
  filter(!is.na(Airline), Airline != "") %>%
  # Convert the city columns to numeric values
  mutate(across(Los_Angeles:Seattle, as.numeric)) %>%
  # Filter for only on-time flights
  filter(Status == "on time") %>%
  # Reshape data into a longer format: one row per airline/city pair
  pivot_longer(
    cols = Los_Angeles:Seattle,
    names_to = "City",
    values_to = "Count"
  )
# Display the cleaned data
print("Cleaned Data:")
## [1] "Cleaned Data:"
print(head(data))
## # A tibble: 6 × 4
## Airline Status City Count
## <chr> <chr> <chr> <dbl>
## 1 ALASKA on time Los_Angeles 497
## 2 ALASKA on time Phoenix 221
## 3 ALASKA on time San_Diego 212
## 4 ALASKA on time San_Francisco 503
## 5 ALASKA on time Seattle 1841
## 6 AM WEST on time Los_Angeles 694
#Part3
# Loading the necessary libraries
library(dplyr)
library(tidyr)
library(ggplot2)
# Reading the CSV file again so Part 3 starts from the raw import
data <- read.csv("C:/Users/Dell/Downloads/assignment4.csv")

# Clean the dataset.
# Column X holds the airline name and X.1 the flight status, so both are kept
# under descriptive names. Dropping X and labelling X.1 as "Airline" would make
# every later step group by status instead of by airline.
data <- data %>%
  # Explicit selection also discards the all-NA padding columns X.2 - X.4
  select(
    Airline = X,
    Status = X.1,
    Los_Angeles = Los.Angeles,
    Phoenix,
    San_Diego = San.Diego,
    San_Francisco = San.Francisco,
    Seattle
  ) %>%
  # Remove padding rows, whose airline label is NA or an empty string
  filter(!is.na(Airline), Airline != "") %>%
  mutate(across(Los_Angeles:Seattle, as.numeric))

# Verify the cleaned data and column names
print("Cleaned Data:")
print(head(data))
print("Column Names After Cleaning:")
print(colnames(data))

# Keep only the rows that hold delayed-flight counts (one row per airline)
delayed_flights <- data %>%
  filter(grepl("delayed", Status, fixed = TRUE))

# Debugging: check the delayed rows. nrow() counts table rows, not flights,
# so the message is worded accordingly.
print("Delayed Flights Data:")
print(head(delayed_flights))
print(paste(
  "Number of delayed-flight rows (one per airline):",
  nrow(delayed_flights)
))
print("Column Names in Delayed Flights Data:")
print(colnames(delayed_flights))

# Summarize the arrival delays for each airline: stack the city columns into
# a single Count column, then total Count within each airline.
delay_summary <- delayed_flights %>%
  pivot_longer(
    cols = Los_Angeles:Seattle,
    names_to = "City",
    values_to = "Count"
  ) %>%
  group_by(Airline) %>%
  summarize(Total_Delays = sum(Count, na.rm = TRUE), .groups = "drop")

# Debugging: check the delay summary.
# For the two delayed rows in assignment4.csv this should give
# ALASKA = 501 (62 + 12 + 20 + 102 + 305) and
# AM WEST = 787 (117 + 415 + 65 + 129 + 61).
print("Arrival Delays Summary:")
print(delay_summary)
# Bar plot of Total_Delays for each Airline value in delay_summary.
# The plot is drawn only when delay_summary has at least one row and every
# total is a positive, non-NA number; otherwise a message is printed instead.
has_valid_delays <- nrow(delay_summary) > 0 &&
  all(!is.na(delay_summary$Total_Delays) & delay_summary$Total_Delays > 0)

if (has_valid_delays) {
  delay_plot <- ggplot(
    delay_summary,
    aes(x = Airline, y = Total_Delays, fill = Airline)
  ) +
    # geom_col() draws bar heights straight from the y values
    geom_col() +
    labs(
      title = "Comparison of Arrival Delays for Airlines",
      x = "Airline",
      y = "Total Delays"
    ) +
    theme_minimal()
  # Print explicitly: a ggplot object is only rendered when printed, and
  # auto-printing does not happen when the script is run through source().
  print(delay_plot)
} else {
  print("No valid delays to display in the plot.")
}
