Loading the necessary packages

library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.0     ✔ purrr   1.0.1
## ✔ tibble  3.1.8     ✔ dplyr   1.1.0
## ✔ tidyr   1.3.0     ✔ stringr 1.5.0
## ✔ readr   2.1.3     ✔ forcats 1.0.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(readr)
library(gt)

Creating the data frame:

I tried to create the data in the R markdown just to see if I can do it:

delays <- data.frame(airlines = c("ALASKA", "", "", "AM_WEST", ""),
                     status = c("On_time", "Delayed", "", "On_time", "Delayed"),
                     Los_Angeles = c(497, 62, NA, 694, 117), 
                     Phoenix = c(221, 12, NA, 4840, 415), 
                     San_Diego =c(212, 20, NA, 383, 65), 
                     San_Francisco = c(503, 102, NA, 320, 129), 
                     Seattle = c(1841, 305, NA, 201, 61))
delays
##   airlines  status Los_Angeles Phoenix San_Diego San_Francisco Seattle
## 1   ALASKA On_time         497     221       212           503    1841
## 2          Delayed          62      12        20           102     305
## 3                           NA      NA        NA            NA      NA
## 4  AM_WEST On_time         694    4840       383           320     201
## 5          Delayed         117     415        65           129      61
delays1 <- gt(delays)
delays1
airlines status Los_Angeles Phoenix San_Diego San_Francisco Seattle
ALASKA On_time 497 221 212 503 1841
Delayed 62 12 20 102 305
NA NA NA NA NA
AM_WEST On_time 694 4840 383 320 201
Delayed 117 415 65 129 61
gt_tbl <- 
  delays |>
  gt(rowname_col = "airlines")
gt_tbl
status Los_Angeles Phoenix San_Diego San_Francisco Seattle
ALASKA On_time 497 221 212 503 1841
Delayed 62 12 20 102 305
NA NA NA NA NA
AM_WEST On_time 694 4840 383 320 201
Delayed 117 415 65 129 61
write.csv(delays, file = "delays.csv")

Loading the data as .csv file:

url_de <- "https://raw.githubusercontent.com/SalouaDaouki/Data607/main/airlinesDelays.csv"
Air_delays <- read.csv(file = url_de )
Air_delays
##         X     X.1 Los.Angeles Phoenix San.Diego San.Frnacisco Seattle
## 1  ALASKA on time         497     221       212           503    1840
## 2         delayed          62      12        20           102     305
## 3                          NA      NA        NA            NA      NA
## 4 AM WEST on time         694    4840       383           320     201
## 5         delayed         117     415        65           129      61

Transforming the data:

colnames(Air_delays)[1] <- "airlines"
colnames(Air_delays)[2] <- "Status"
Air_delays
##   airlines  Status Los.Angeles Phoenix San.Diego San.Frnacisco Seattle
## 1   ALASKA on time         497     221       212           503    1840
## 2          delayed          62      12        20           102     305
## 3                           NA      NA        NA            NA      NA
## 4  AM WEST on time         694    4840       383           320     201
## 5          delayed         117     415        65           129      61
Air_delays <- drop_na(Air_delays)
Air_delays
##   airlines  Status Los.Angeles Phoenix San.Diego San.Frnacisco Seattle
## 1   ALASKA on time         497     221       212           503    1840
## 2          delayed          62      12        20           102     305
## 3  AM WEST on time         694    4840       383           320     201
## 4          delayed         117     415        65           129      61
Air_delays[Air_delays==""]<-NA
Air_delays <- Air_delays %>% fill(airlines, .direction = 'down')
Air_delays
##   airlines  Status Los.Angeles Phoenix San.Diego San.Frnacisco Seattle
## 1   ALASKA on time         497     221       212           503    1840
## 2   ALASKA delayed          62      12        20           102     305
## 3  AM WEST on time         694    4840       383           320     201
## 4  AM WEST delayed         117     415        65           129      61

Tidying the data:

pivot_longer(data = data.frame, cols = columns.to.pivot, names_to = “New Column Name”, values_to = “New Column Name”)

Air_delays_long <- Air_delays %>% 
  pivot_longer(
    cols = !c("airlines","Status"), 
    names_to = "Destination", 
    values_to = "Status_freq",
    values_drop_na = TRUE,
  )
Air_delays_long
## # A tibble: 20 × 4
##    airlines Status  Destination   Status_freq
##    <chr>    <chr>   <chr>               <int>
##  1 ALASKA   on time Los.Angeles           497
##  2 ALASKA   on time Phoenix               221
##  3 ALASKA   on time San.Diego             212
##  4 ALASKA   on time San.Frnacisco         503
##  5 ALASKA   on time Seattle              1840
##  6 ALASKA   delayed Los.Angeles            62
##  7 ALASKA   delayed Phoenix                12
##  8 ALASKA   delayed San.Diego              20
##  9 ALASKA   delayed San.Frnacisco         102
## 10 ALASKA   delayed Seattle               305
## 11 AM WEST  on time Los.Angeles           694
## 12 AM WEST  on time Phoenix              4840
## 13 AM WEST  on time San.Diego             383
## 14 AM WEST  on time San.Frnacisco         320
## 15 AM WEST  on time Seattle               201
## 16 AM WEST  delayed Los.Angeles           117
## 17 AM WEST  delayed Phoenix               415
## 18 AM WEST  delayed San.Diego              65
## 19 AM WEST  delayed San.Frnacisco         129
## 20 AM WEST  delayed Seattle                61
Air_delays_long$Destination <- str_replace_all(Air_delays_long$Destination, "\\.", " ")
Air_delays_long
## # A tibble: 20 × 4
##    airlines Status  Destination   Status_freq
##    <chr>    <chr>   <chr>               <int>
##  1 ALASKA   on time Los Angeles           497
##  2 ALASKA   on time Phoenix               221
##  3 ALASKA   on time San Diego             212
##  4 ALASKA   on time San Frnacisco         503
##  5 ALASKA   on time Seattle              1840
##  6 ALASKA   delayed Los Angeles            62
##  7 ALASKA   delayed Phoenix                12
##  8 ALASKA   delayed San Diego              20
##  9 ALASKA   delayed San Frnacisco         102
## 10 ALASKA   delayed Seattle               305
## 11 AM WEST  on time Los Angeles           694
## 12 AM WEST  on time Phoenix              4840
## 13 AM WEST  on time San Diego             383
## 14 AM WEST  on time San Frnacisco         320
## 15 AM WEST  on time Seattle               201
## 16 AM WEST  delayed Los Angeles           117
## 17 AM WEST  delayed Phoenix               415
## 18 AM WEST  delayed San Diego              65
## 19 AM WEST  delayed San Frnacisco         129
## 20 AM WEST  delayed Seattle                61

Analyzing the data:

ALASKA_status <- Air_delays_long |> 
  filter(airlines == 'ALASKA')
ALASKA_status
## # A tibble: 10 × 4
##    airlines Status  Destination   Status_freq
##    <chr>    <chr>   <chr>               <int>
##  1 ALASKA   on time Los Angeles           497
##  2 ALASKA   on time Phoenix               221
##  3 ALASKA   on time San Diego             212
##  4 ALASKA   on time San Frnacisco         503
##  5 ALASKA   on time Seattle              1840
##  6 ALASKA   delayed Los Angeles            62
##  7 ALASKA   delayed Phoenix                12
##  8 ALASKA   delayed San Diego              20
##  9 ALASKA   delayed San Frnacisco         102
## 10 ALASKA   delayed Seattle               305
ggplot(ALASKA_status, aes(x = Status_freq, color = Status)) +
  geom_density(linewidth = 0.75)

AMWEST_status <- Air_delays_long |> 
  filter(airlines == 'AM WEST')
AMWEST_status
## # A tibble: 10 × 4
##    airlines Status  Destination   Status_freq
##    <chr>    <chr>   <chr>               <int>
##  1 AM WEST  on time Los Angeles           694
##  2 AM WEST  on time Phoenix              4840
##  3 AM WEST  on time San Diego             383
##  4 AM WEST  on time San Frnacisco         320
##  5 AM WEST  on time Seattle               201
##  6 AM WEST  delayed Los Angeles           117
##  7 AM WEST  delayed Phoenix               415
##  8 AM WEST  delayed San Diego              65
##  9 AM WEST  delayed San Frnacisco         129
## 10 AM WEST  delayed Seattle                61
ggplot(AMWEST_status, aes(x = Status_freq, color = Status)) +
  geom_density(linewidth = 0.75)

Airlines_df <-
  Air_delays_long %>% 
    group_by(airlines) %>%
      summarise(
        Total_flights = sum(Status_freq),
        Total_delays = sum(ifelse(Status=='delayed', Status_freq, 0))
      )

Airlines_df <- 
  Airlines_df %>%
    mutate(
        Total_on_time = Total_flights - Total_delays,
        delay_percent = round((Total_delays / Total_flights) *100,2),
        on_time_percent = round(((Total_flights - Total_delays) / Total_flights)*100,2)
    ) %>%
      arrange(delay_percent)
Airlines_df
## # A tibble: 2 × 6
##   airlines Total_flights Total_delays Total_on_time delay_percent on_time_perc…¹
##   <chr>            <int>        <dbl>         <dbl>         <dbl>          <dbl>
## 1 AM WEST           7225          787          6438          10.9           89.1
## 2 ALASKA            3774          501          3273          13.3           86.7
## # … with abbreviated variable name ¹​on_time_percent

Both airlines have less than 15% of delayed flights; Alaska has 3% difference of delays compared to AM West even thoug Am West airlines have more flights and more number of delayed flights. Meaning that AM West has more on time flights than Alaska.

Now let’s see destination wise:

Destination_df <-
  Air_delays_long %>% 
    group_by(Destination) %>%
      summarise(
        Total_flights = sum(Status_freq),
        Total_delays = sum(ifelse(Status=='delayed', Status_freq, 0))
      )

Destination_df <- 
  Destination_df %>%
    mutate(
        Total_on_time = Total_flights - Total_delays,
        delay_percent = round((Total_delays / Total_flights) *100,2),
        on_time_percent = round(((Total_flights - Total_delays) / Total_flights)*100,2)
    ) %>%
      arrange(delay_percent)
Destination_df
## # A tibble: 5 × 6
##   Destination   Total_flights Total_delays Total_on_time delay_percent on_time…¹
##   <chr>                 <int>        <dbl>         <dbl>         <dbl>     <dbl>
## 1 Phoenix                5488          427          5061          7.78      92.2
## 2 San Diego               680           85           595         12.5       87.5
## 3 Los Angeles            1370          179          1191         13.1       86.9
## 4 Seattle                2407          366          2041         15.2       84.8
## 5 San Frnacisco          1054          231           823         21.9       78.1
## # … with abbreviated variable name ¹​on_time_percent

Based on the table above, Phoenix tends to have more percentage (92%) of on_time flights, followed by San Diego. On the other hand, San Francisco seems to have more percentage of delayed flights.

Converting the data into .CSV files

write.csv(Air_delays_long, file = "Air_delays_long.csv")
write.csv(Airlines_df, file = "Airlines_df.csv")
write.csv(Destination_df, file = "Destination_df.csv")