#Assignment 4 
#By Jose Fuentes

# 1st and 2nd Part
# Load the necessary library that contains tidyr and dplyr
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Read the CSV file. The path defaults to the original download location and
# can be overridden with the ASSIGNMENT4_CSV environment variable, so the
# script also runs on machines where that absolute path does not exist.
csv_path <- Sys.getenv(
  "ASSIGNMENT4_CSV",
  unset = "C:/Users/Dell/Downloads/assignment4.csv"
)
data <- read.csv(csv_path)

# Inspect the raw import: column names first, then the per-column structure
print("Initial Column Names:")
## [1] "Initial Column Names:"
print(names(data))
##  [1] "X"             "X.1"           "Los.Angeles"   "Phoenix"      
##  [5] "San.Diego"     "San.Francisco" "Seattle"       "X.2"          
##  [9] "X.3"           "X.4"
print("Structure of the Data:")
## [1] "Structure of the Data:"
utils::str(data)
## 'data.frame':    13 obs. of  10 variables:
##  $ X            : chr  "ALASKA" "ALASKA" "AM WEST" "AM WEST" ...
##  $ X.1          : chr  "on time" "delayed" "on time" "delayed" ...
##  $ Los.Angeles  : int  497 62 694 117 NA NA NA NA NA NA ...
##  $ Phoenix      : int  221 12 4840 415 NA NA NA NA NA NA ...
##  $ San.Diego    : int  212 20 385 65 NA NA NA NA NA NA ...
##  $ San.Francisco: int  503 102 320 129 NA NA NA NA NA NA ...
##  $ Seattle      : int  1841 305 201 61 NA NA NA NA NA NA ...
##  $ X.2          : logi  NA NA NA NA NA NA ...
##  $ X.3          : logi  NA NA NA NA NA NA ...
##  $ X.4          : logi  NA NA NA NA NA NA ...
# Preview the top six rows of the raw data frame
print("First Few Rows of the Original Data:")
## [1] "First Few Rows of the Original Data:"
print(head(data, n = 6L))
##         X     X.1 Los.Angeles Phoenix San.Diego San.Francisco Seattle X.2 X.3
## 1  ALASKA on time         497     221       212           503    1841  NA  NA
## 2  ALASKA delayed          62      12        20           102     305  NA  NA
## 3 AM WEST on time         694    4840       385           320     201  NA  NA
## 4 AM WEST delayed         117     415        65           129      61  NA  NA
## 5                          NA      NA        NA            NA      NA  NA  NA
## 6                          NA      NA        NA            NA      NA  NA  NA
##   X.4
## 1  NA
## 2  NA
## 3  NA
## 4  NA
## 5  NA
## 6  NA
# Summarise every column; the summary reports an NA count for each column
# that contains missing values
print("Summary of NA Values:")
## [1] "Summary of NA Values:"
print(summary(object = data))
##       X                 X.1             Los.Angeles       Phoenix      
##  Length:13          Length:13          Min.   : 62.0   Min.   :  12.0  
##  Class :character   Class :character   1st Qu.:103.2   1st Qu.: 168.8  
##  Mode  :character   Mode  :character   Median :307.0   Median : 318.0  
##                                        Mean   :342.5   Mean   :1372.0  
##                                        3rd Qu.:546.2   3rd Qu.:1521.2  
##                                        Max.   :694.0   Max.   :4840.0  
##                                        NA's   :9       NA's   :9       
##    San.Diego      San.Francisco      Seattle       X.2            X.3         
##  Min.   : 20.00   Min.   :102.0   Min.   :  61   Mode:logical   Mode:logical  
##  1st Qu.: 53.75   1st Qu.:122.2   1st Qu.: 166   NA's:13        NA's:13       
##  Median :138.50   Median :224.5   Median : 253                                
##  Mean   :170.50   Mean   :263.5   Mean   : 602                                
##  3rd Qu.:255.25   3rd Qu.:365.8   3rd Qu.: 689                                
##  Max.   :385.00   Max.   :503.0   Max.   :1841                                
##  NA's   :9        NA's   :9       NA's   :9                                   
##    X.4         
##  Mode:logical  
##  NA's:13       
##                
##                
##                
##                
## 
# Report the size of the data frame: number of rows, then number of columns
print("Dimensions of the Data:")
## [1] "Dimensions of the Data:"
print(c(nrow(data), ncol(data)))
## [1] 13 10
# Clean and tidy the dataset:
data <- data %>%
  # Keep the label and city columns under descriptive names. The trailing
  # X.2-X.4 columns contain only NA and are dropped by not selecting them.
  select(
    Airline = X,
    Status = X.1,
    Los_Angeles = Los.Angeles,
    Phoenix,
    San_Diego = San.Diego,
    San_Francisco = San.Francisco,
    Seattle
  ) %>%
  # Remove the spacer rows, i.e. rows without a single city count. An
  # "every cell is NA" test on the raw import never matches them because
  # their text cells are read as "" rather than NA.
  filter(if_any(Los_Angeles:Seattle, ~ !is.na(.x))) %>%
  # Convert the city columns to numeric values
  mutate(across(Los_Angeles:Seattle, as.numeric)) %>%
  # Filter for only on-time flights
  filter(Status == "on time") %>%
  # Reshape to one row per airline/city pair
  pivot_longer(
    cols = Los_Angeles:Seattle,
    names_to = "City",
    values_to = "Count"
  )

# Display the cleaned data
print("Cleaned Data:")
## [1] "Cleaned Data:"
print(head(data))
## # A tibble: 6 × 4
##   Airline Status  City          Count
##   <chr>   <chr>   <chr>         <dbl>
## 1 ALASKA  on time Los_Angeles     497
## 2 ALASKA  on time Phoenix         221
## 3 ALASKA  on time San_Diego       212
## 4 ALASKA  on time San_Francisco   503
## 5 ALASKA  on time Seattle        1841
## 6 AM WEST on time Los_Angeles     694
# Part 3
# Loading the necessary libraries
library(dplyr)
library(tidyr)
library(ggplot2)

# Reading the CSV file. The path defaults to the original download location
# and can be overridden with the ASSIGNMENT4_CSV environment variable.
csv_path <- Sys.getenv(
  "ASSIGNMENT4_CSV",
  unset = "C:/Users/Dell/Downloads/assignment4.csv"
)
data <- read.csv(csv_path)

# Clean and tidy the dataset:
# X holds the airline name and X.1 the arrival status, so both are kept under
# descriptive names. Dropping X and labelling the status column "Airline"
# would make every later per-airline step group by status instead. The empty
# X.2-X.4 columns are dropped by not selecting them.
data <- data %>%
  select(
    Airline = X,
    Status = X.1,
    Los_Angeles = Los.Angeles,
    Phoenix,
    San_Diego = San.Diego,
    San_Francisco = San.Francisco,
    Seattle
  ) %>%
  # Remove rows with NA or empty Airlines (the blank spacer rows)
  filter(!is.na(Airline), trimws(Airline) != "") %>%
  mutate(across(Los_Angeles:Seattle, as.numeric))

# Verify the cleaned data and column names
print("Cleaned Data:")
## [1] "Cleaned Data:"
print(head(data))
##   Airline  Status Los_Angeles Phoenix San_Diego San_Francisco Seattle
## 1  ALASKA on time         497     221       212           503    1841
## 2  ALASKA delayed          62      12        20           102     305
## 3 AM WEST on time         694    4840       385           320     201
## 4 AM WEST delayed         117     415        65           129      61
print("Column Names After Cleaning:")
## [1] "Column Names After Cleaning:"
print(colnames(data))
## [1] "Airline"       "Status"        "Los_Angeles"   "Phoenix"
## [5] "San_Diego"     "San_Francisco" "Seattle"

# Filter for delayed flights. An exact match on Status is used so that only
# the "delayed" rows are kept.
delayed_flights <- data %>%
  filter(Status == "delayed")

# Debugging: Check delayed flights
print("Delayed Flights Data:")
## [1] "Delayed Flights Data:"
print(head(delayed_flights))
##   Airline  Status Los_Angeles Phoenix San_Diego San_Francisco Seattle
## 1  ALASKA delayed          62      12        20           102     305
## 2 AM WEST delayed         117     415        65           129      61
# nrow() counts rows (one per airline here), not individual flights, so the
# message says "rows" rather than "flights".
print(paste("Number of delayed-flight rows:", nrow(delayed_flights)))
## [1] "Number of delayed-flight rows: 2"
# Check the column names again for delayed flights
print("Column Names in Delayed Flights Data:")
## [1] "Column Names in Delayed Flights Data:"
print(colnames(delayed_flights))
## [1] "Airline"       "Status"        "Los_Angeles"   "Phoenix"
## [5] "San_Diego"     "San_Francisco" "Seattle"

# Summarize the arrival delays for each airline: reshape to one row per
# airline/city pair, then add up the delayed counts per airline.
delay_summary <- delayed_flights %>%
  pivot_longer(
    cols = Los_Angeles:Seattle,
    names_to = "City",
    values_to = "Count"
  ) %>%
  group_by(Airline) %>%
  summarize(Total_Delays = sum(Count, na.rm = TRUE), .groups = "drop")

# Debugging: Check the delay summary
print("Arrival Delays Summary:")
## [1] "Arrival Delays Summary:"
print(delay_summary)
## # A tibble: 2 × 2
##   Airline Total_Delays
##   <chr>          <dbl>
## 1 ALASKA           501
## 2 AM WEST          787

# Create a bar plot comparing the airlines when there is at least one delay to
# show. An airline with zero delays is still a valid bar, so any() is used
# rather than all().
if (nrow(delay_summary) > 0 && any(delay_summary$Total_Delays > 0)) {
  delay_plot <- ggplot(
    delay_summary,
    aes(x = Airline, y = Total_Delays, fill = Airline)
  ) +
    geom_col() +
    labs(
      title = "Comparison of Arrival Delays for Airlines",
      x = "Airline",
      y = "Total Delays"
    ) +
    theme_minimal()
  # Print explicitly so the plot is also drawn when the script is run through
  # source(), where values inside an if block are not auto-printed.
  print(delay_plot)
} else {
  print("No valid delays to display in the plot.")
}