title: “Assignment5” author: “Anna Moy” date: “2024-02-25” output: html_document

library(readr)
library(dplyr)
library(tidyr)
library(ggplot2)

Read the csv file for airlines

airline <- read_csv("https://raw.githubusercontent.com/AnnaMoy/Data-607/main/airline.csv")

airline
## # A tibble: 5 × 7
##   ...1    ...2    `Los Angeles` Phoenix `San Diego` `San Francisco` Seattle
##   <chr>   <chr>           <dbl>   <dbl>       <dbl>           <dbl>   <dbl>
## 1 ALASKA  on time           497     221         212             503    1841
## 2 <NA>    delayed            62      12          20             102     305
## 3 <NA>    <NA>               NA      NA          NA              NA      NA
## 4 AM WEST on time           694    4840         383             320     201
## 5 <NA>    delayed           117     415          65             129      61

Remove the blank line on data

airline <- airline[-3,]

Fill in blank data for airline

airline %>%
  fill('...1', .direction = "down")
## # A tibble: 4 × 7
##   ...1    ...2    `Los Angeles` Phoenix `San Diego` `San Francisco` Seattle
##   <chr>   <chr>           <dbl>   <dbl>       <dbl>           <dbl>   <dbl>
## 1 ALASKA  on time           497     221         212             503    1841
## 2 ALASKA  delayed            62      12          20             102     305
## 3 AM WEST on time           694    4840         383             320     201
## 4 AM WEST delayed           117     415          65             129      61

Transpose the data and rename column title

airline <- airline %>%
  fill('...1', .direction = "down") %>%
  pivot_longer(-c('...1', '...2'), names_to = "location") %>%
  pivot_wider(names_from ='...2', values_from = value) %>%
  rename("airline"= '...1', "on_time" = 'on time')
airline
## # A tibble: 10 × 4
##    airline location      on_time delayed
##    <chr>   <chr>           <dbl>   <dbl>
##  1 ALASKA  Los Angeles       497      62
##  2 ALASKA  Phoenix           221      12
##  3 ALASKA  San Diego         212      20
##  4 ALASKA  San Francisco     503     102
##  5 ALASKA  Seattle          1841     305
##  6 AM WEST Los Angeles       694     117
##  7 AM WEST Phoenix          4840     415
##  8 AM WEST San Diego         383      65
##  9 AM WEST San Francisco     320     129
## 10 AM WEST Seattle           201      61

Find the average and median for each airline delays and on time

airline %>%
  group_by(airline) %>%
  select (airline, on_time, delayed) %>%
  summarize(meandelay = mean(delayed), mediandelay = median(delayed), meanontime = mean(on_time), medianontime = median(on_time))
## # A tibble: 2 × 5
##   airline meandelay mediandelay meanontime medianontime
##   <chr>       <dbl>       <dbl>      <dbl>        <dbl>
## 1 ALASKA       100.          62       655.          497
## 2 AM WEST      157.         117      1288.          383

Find the percent delay for each airline by location in descending order

flight <- airline %>%
  group_by(airline) %>%
  mutate(total_delay = sum(delayed), total_ontime = sum(on_time),all_flight = total_delay + total_ontime, percent_rate = delayed/all_flight, percent_delay = (delayed/all_flight)) %>%
  arrange(airline, desc(percent_delay))
flight
## # A tibble: 10 × 9
## # Groups:   airline [2]
##    airline location      on_time delayed total_delay total_ontime all_flight
##    <chr>   <chr>           <dbl>   <dbl>       <dbl>        <dbl>      <dbl>
##  1 ALASKA  Seattle          1841     305         501         3274       3775
##  2 ALASKA  San Francisco     503     102         501         3274       3775
##  3 ALASKA  Los Angeles       497      62         501         3274       3775
##  4 ALASKA  San Diego         212      20         501         3274       3775
##  5 ALASKA  Phoenix           221      12         501         3274       3775
##  6 AM WEST Phoenix          4840     415         787         6438       7225
##  7 AM WEST San Francisco     320     129         787         6438       7225
##  8 AM WEST Los Angeles       694     117         787         6438       7225
##  9 AM WEST San Diego         383      65         787         6438       7225
## 10 AM WEST Seattle           201      61         787         6438       7225
## # ℹ 2 more variables: percent_rate <dbl>, percent_delay <dbl>

Total flights for Alaska and AM West

total <- airline %>%
  group_by(airline) %>%
  summarize(sum = sum(delayed)) 

total
## # A tibble: 2 × 2
##   airline   sum
##   <chr>   <dbl>
## 1 ALASKA    501
## 2 AM WEST   787

flights by location

Summary of each airline on time and delay data

airline %>%
  filter(airline == "ALASKA") %>%
  summary(airline)
##    airline            location            on_time          delayed     
##  Length:5           Length:5           Min.   : 212.0   Min.   : 12.0  
##  Class :character   Class :character   1st Qu.: 221.0   1st Qu.: 20.0  
##  Mode  :character   Mode  :character   Median : 497.0   Median : 62.0  
##                                        Mean   : 654.8   Mean   :100.2  
##                                        3rd Qu.: 503.0   3rd Qu.:102.0  
##                                        Max.   :1841.0   Max.   :305.0
airline %>%
  filter(airline == "AM WEST") %>%
  summary(airline)
##    airline            location            on_time        delayed     
##  Length:5           Length:5           Min.   : 201   Min.   : 61.0  
##  Class :character   Class :character   1st Qu.: 320   1st Qu.: 65.0  
##  Mode  :character   Mode  :character   Median : 383   Median :117.0  
##                                        Mean   :1288   Mean   :157.4  
##                                        3rd Qu.: 694   3rd Qu.:129.0  
##                                        Max.   :4840   Max.   :415.0

Plot delay by location for each airline

ggplot(flight, aes(x = location, y = percent_delay, fill = airline)) +
  geom_bar(stat = "identity", position = "dodge")

ggplot(total, aes(x = airline, sum, fill = airline)) +
  geom_bar(stat = "identity", position = "dodge")

ggplot(flight, aes(x = airline, percent_rate, fill = airline)) +
  geom_bar(stat = "identity", position = "dodge")

Conclusion

AM West has more delays than Alaska as the average and median are higher than Alaska. But if we look at the overall flights compared to the percentage of delay we will notice that Alaska has more delays then AM West. There was a bias if we were jsut looking at over all delayed in total without looking overall picture.