Open csv file from github

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.4.4     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
library(ggplot2)


Flights <- read.csv("https://raw.githubusercontent.com/Angelogallardo05/assignment-05/main/DATA605_05.csv")
head(Flights)
##         X     X.1 Los.Angeles Phenix San.Diego San.Francisco Seattle
## 1  ALASKA on time         497    221       212           503    1841
## 2         delayed          62     12        20           102     305
## 3                          NA     NA        NA            NA      NA
## 4 AM WEST on time         694   4840       383           320     201
## 5         delayed         117    415        65           129      61
Flights_clean <- na.omit(Flights)
head(Flights_clean)
##         X     X.1 Los.Angeles Phenix San.Diego San.Francisco Seattle
## 1  ALASKA on time         497    221       212           503    1841
## 2         delayed          62     12        20           102     305
## 4 AM WEST on time         694   4840       383           320     201
## 5         delayed         117    415        65           129      61

Add the Airlines name to the missing rows

# Duplicate Alaska into the second row
Flights_clean[2, "X"] <- Flights_clean[1, "X"]

# Duplicate AM WEST into the fourth row
Flights_clean[4, "X"] <- Flights_clean[3, "X"]

# Display the modified dataset
Flights_clean
##         X     X.1 Los.Angeles Phenix San.Diego San.Francisco Seattle
## 1  ALASKA on time         497    221       212           503    1841
## 2  ALASKA delayed          62     12        20           102     305
## 4 AM WEST on time         694   4840       383           320     201
## 5 AM WEST delayed         117    415        65           129      61

COnvert the coloumns of cities into rows and values into one column

# Convert columns 3-7 into rows
Flights_data <- Flights_clean %>%
  pivot_longer(cols = c(Los.Angeles:Seattle),
               names_to = "City",
               values_to = "Value")

# Display the modified dataset
Flights_data
## # A tibble: 20 × 4
##    X       X.1     City          Value
##    <chr>   <chr>   <chr>         <int>
##  1 ALASKA  on time Los.Angeles     497
##  2 ALASKA  on time Phenix          221
##  3 ALASKA  on time San.Diego       212
##  4 ALASKA  on time San.Francisco   503
##  5 ALASKA  on time Seattle        1841
##  6 ALASKA  delayed Los.Angeles      62
##  7 ALASKA  delayed Phenix           12
##  8 ALASKA  delayed San.Diego        20
##  9 ALASKA  delayed San.Francisco   102
## 10 ALASKA  delayed Seattle         305
## 11 AM WEST on time Los.Angeles     694
## 12 AM WEST on time Phenix         4840
## 13 AM WEST on time San.Diego       383
## 14 AM WEST on time San.Francisco   320
## 15 AM WEST on time Seattle         201
## 16 AM WEST delayed Los.Angeles     117
## 17 AM WEST delayed Phenix          415
## 18 AM WEST delayed San.Diego        65
## 19 AM WEST delayed San.Francisco   129
## 20 AM WEST delayed Seattle          61

Rename first two columns

Flights_data <- Flights_data %>%
  rename(
   Airline = X,
   Status = X.1,
   Amount = Value)

Flights_data
## # A tibble: 20 × 4
##    Airline Status  City          Amount
##    <chr>   <chr>   <chr>          <int>
##  1 ALASKA  on time Los.Angeles      497
##  2 ALASKA  on time Phenix           221
##  3 ALASKA  on time San.Diego        212
##  4 ALASKA  on time San.Francisco    503
##  5 ALASKA  on time Seattle         1841
##  6 ALASKA  delayed Los.Angeles       62
##  7 ALASKA  delayed Phenix            12
##  8 ALASKA  delayed San.Diego         20
##  9 ALASKA  delayed San.Francisco    102
## 10 ALASKA  delayed Seattle          305
## 11 AM WEST on time Los.Angeles      694
## 12 AM WEST on time Phenix          4840
## 13 AM WEST on time San.Diego        383
## 14 AM WEST on time San.Francisco    320
## 15 AM WEST on time Seattle          201
## 16 AM WEST delayed Los.Angeles      117
## 17 AM WEST delayed Phenix           415
## 18 AM WEST delayed San.Diego         65
## 19 AM WEST delayed San.Francisco    129
## 20 AM WEST delayed Seattle           61

Create a bar chart of delayed to total by airline in %

Flights_data <- Flights_data %>%
  group_by( Airline) %>%
  mutate(Percentage = Amount / sum(Amount) * 100)

ggplot(data= Flights_data, aes(x= Airline, y = Percentage, fill = Status))+
  geom_bar(stat = "identity")

# Find the city with the most delays for each airline
Most_delayed_city <- Flights_data %>%
  filter(Status == "delayed") %>%
  group_by(Airline, City) %>%
  summarise(Total_delays = Amount) %>%
  top_n(3, Total_delays)  # Select the city with the most delays for each airline
## `summarise()` has grouped output by 'Airline'. You can override using the
## `.groups` argument.
Most_delayed_city
## # A tibble: 6 × 3
## # Groups:   Airline [2]
##   Airline City          Total_delays
##   <chr>   <chr>                <int>
## 1 ALASKA  Los.Angeles             62
## 2 ALASKA  San.Francisco          102
## 3 ALASKA  Seattle                305
## 4 AM WEST Los.Angeles            117
## 5 AM WEST Phenix                 415
## 6 AM WEST San.Francisco          129
# Create the bar chart
ggplot(data = Most_delayed_city, aes(x = City, y = Total_delays, fill = Airline)) +
  geom_bar(stat = "identity") +
  labs(title = "Most Delayed City by Airline",
       x = "City",
       y = "Amount") +
  theme_minimal()