Read the file in from github using read.csv

csvFile <- "https://raw.githubusercontent.com/sadia-perveen/DATA607-Assignment-5/master/ArrivalDelays.csv"
aD <- readr::read_csv(csvFile)
## Warning: Missing column names filled in: 'X1' [1], 'X2' [2]
## Parsed with column specification:
## cols(
##   X1 = col_character(),
##   X2 = col_character(),
##   `Los Angeles` = col_double(),
##   Phoenix = col_number(),
##   `San Diego` = col_double(),
##   `San Francisco` = col_double(),
##   Seattle = col_number()
## )
aD
## # A tibble: 5 x 7
##   X1      X2      `Los Angeles` Phoenix `San Diego` `San Francisco` Seattle
##   <chr>   <chr>           <dbl>   <dbl>       <dbl>           <dbl>   <dbl>
## 1 ALASKA  on time           497     221         212             503    1841
## 2 <NA>    delayed            62      12          20             102     305
## 3 <NA>    <NA>               NA      NA          NA              NA      NA
## 4 AM WEST on time           694    4840         383             320     201
## 5 <NA>    delayed           117     415          65             129      61

Clean up the data: 1. Assign proper columns names from column 1 and 2. 2. Get rid of the empty rows. 3. Populate the proper airlines in row 2 and 4.

names(aD)[1] <- "Airline"
names(aD)[2] <- "Status"
aD
## # A tibble: 5 x 7
##   Airline Status  `Los Angeles` Phoenix `San Diego` `San Francisco` Seattle
##   <chr>   <chr>           <dbl>   <dbl>       <dbl>           <dbl>   <dbl>
## 1 ALASKA  on time           497     221         212             503    1841
## 2 <NA>    delayed            62      12          20             102     305
## 3 <NA>    <NA>               NA      NA          NA              NA      NA
## 4 AM WEST on time           694    4840         383             320     201
## 5 <NA>    delayed           117     415          65             129      61
aD <- aD[rowSums(is.na(aD)) != ncol(aD), ]
aD
## # A tibble: 4 x 7
##   Airline Status  `Los Angeles` Phoenix `San Diego` `San Francisco` Seattle
##   <chr>   <chr>           <dbl>   <dbl>       <dbl>           <dbl>   <dbl>
## 1 ALASKA  on time           497     221         212             503    1841
## 2 <NA>    delayed            62      12          20             102     305
## 3 AM WEST on time           694    4840         383             320     201
## 4 <NA>    delayed           117     415          65             129      61
aD$Airline[2] <- "ALASKA"
aD$Airline[4] <- "AM WEST"
aD
## # A tibble: 4 x 7
##   Airline Status  `Los Angeles` Phoenix `San Diego` `San Francisco` Seattle
##   <chr>   <chr>           <dbl>   <dbl>       <dbl>           <dbl>   <dbl>
## 1 ALASKA  on time           497     221         212             503    1841
## 2 ALASKA  delayed            62      12          20             102     305
## 3 AM WEST on time           694    4840         383             320     201
## 4 AM WEST delayed           117     415          65             129      61

Use the tidyr pivot_longer function to conver data from wide to long. The follow code will take our data aD, and convert all variables except Airline and Status to rows.

aD <- pivot_longer(aD, -c("Airline", "Status"), names_to = "Location", values_to = "NumFlights")
aD
## # A tibble: 20 x 4
##    Airline Status  Location      NumFlights
##    <chr>   <chr>   <chr>              <dbl>
##  1 ALASKA  on time Los Angeles          497
##  2 ALASKA  on time Phoenix              221
##  3 ALASKA  on time San Diego            212
##  4 ALASKA  on time San Francisco        503
##  5 ALASKA  on time Seattle             1841
##  6 ALASKA  delayed Los Angeles           62
##  7 ALASKA  delayed Phoenix               12
##  8 ALASKA  delayed San Diego             20
##  9 ALASKA  delayed San Francisco        102
## 10 ALASKA  delayed Seattle              305
## 11 AM WEST on time Los Angeles          694
## 12 AM WEST on time Phoenix             4840
## 13 AM WEST on time San Diego            383
## 14 AM WEST on time San Francisco        320
## 15 AM WEST on time Seattle              201
## 16 AM WEST delayed Los Angeles          117
## 17 AM WEST delayed Phoenix              415
## 18 AM WEST delayed San Diego             65
## 19 AM WEST delayed San Francisco        129
## 20 AM WEST delayed Seattle               61

Now we can start doing some data comparison.

  1. We group our data aD by Airline and Status, summarising the total number of flight, by summing the NumFlights column based on the grouping.
  2. Then we spread out our data, based on Status and total flights for a more asthetic look, this will also allow for us to easily get the total delayed and total on time percentatges for our airlines. 3.We convert our delayed and on time columns as numericals to conduct arithmetic operations.
  3. Finally we get the total onTimePercantage and delayedPercentage.
  4. The final result shows us total onTime and Delayed percantage for both our Airline so we can easily compare the two.
statusTotals <- data.frame(aD %>% group_by(Airline, Status) %>% summarise(totalFlights = sum(NumFlights)))

statusTotals <- statusTotals %>% spread(Status, totalFlights)

statusTotals$delayed <- as.numeric(statusTotals$delayed)

statusTotals$`on time` <- as.numeric(statusTotals$`on time`)

statusTotals$onTimePercentage <- statusTotals$`on time`/(statusTotals$`on time` + statusTotals$delayed) * 100
statusTotals$delayedPercentage <- statusTotals$delayed/(statusTotals$`on time` + statusTotals$delayed) * 100

statusTotals
##   Airline delayed on time onTimePercentage delayedPercentage
## 1  ALASKA     501    3274         86.72848          13.27152
## 2 AM WEST     787    6438         89.10727          10.89273