Read the file in from github using read.csv
csvFile <- "https://raw.githubusercontent.com/sadia-perveen/DATA607-Assignment-5/master/ArrivalDelays.csv"
aD <- readr::read_csv(csvFile)
## Warning: Missing column names filled in: 'X1' [1], 'X2' [2]
## Parsed with column specification:
## cols(
## X1 = col_character(),
## X2 = col_character(),
## `Los Angeles` = col_double(),
## Phoenix = col_number(),
## `San Diego` = col_double(),
## `San Francisco` = col_double(),
## Seattle = col_number()
## )
aD
## # A tibble: 5 x 7
## X1 X2 `Los Angeles` Phoenix `San Diego` `San Francisco` Seattle
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 ALASKA on time 497 221 212 503 1841
## 2 <NA> delayed 62 12 20 102 305
## 3 <NA> <NA> NA NA NA NA NA
## 4 AM WEST on time 694 4840 383 320 201
## 5 <NA> delayed 117 415 65 129 61
Clean up the data: 1. Assign proper columns names from column 1 and 2. 2. Get rid of the empty rows. 3. Populate the proper airlines in row 2 and 4.
names(aD)[1] <- "Airline"
names(aD)[2] <- "Status"
aD
## # A tibble: 5 x 7
## Airline Status `Los Angeles` Phoenix `San Diego` `San Francisco` Seattle
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 ALASKA on time 497 221 212 503 1841
## 2 <NA> delayed 62 12 20 102 305
## 3 <NA> <NA> NA NA NA NA NA
## 4 AM WEST on time 694 4840 383 320 201
## 5 <NA> delayed 117 415 65 129 61
aD <- aD[rowSums(is.na(aD)) != ncol(aD), ]
aD
## # A tibble: 4 x 7
## Airline Status `Los Angeles` Phoenix `San Diego` `San Francisco` Seattle
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 ALASKA on time 497 221 212 503 1841
## 2 <NA> delayed 62 12 20 102 305
## 3 AM WEST on time 694 4840 383 320 201
## 4 <NA> delayed 117 415 65 129 61
aD$Airline[2] <- "ALASKA"
aD$Airline[4] <- "AM WEST"
aD
## # A tibble: 4 x 7
## Airline Status `Los Angeles` Phoenix `San Diego` `San Francisco` Seattle
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 ALASKA on time 497 221 212 503 1841
## 2 ALASKA delayed 62 12 20 102 305
## 3 AM WEST on time 694 4840 383 320 201
## 4 AM WEST delayed 117 415 65 129 61
Use the tidyr pivot_longer function to conver data from wide to long. The follow code will take our data aD, and convert all variables except Airline and Status to rows.
aD <- pivot_longer(aD, -c("Airline", "Status"), names_to = "Location", values_to = "NumFlights")
aD
## # A tibble: 20 x 4
## Airline Status Location NumFlights
## <chr> <chr> <chr> <dbl>
## 1 ALASKA on time Los Angeles 497
## 2 ALASKA on time Phoenix 221
## 3 ALASKA on time San Diego 212
## 4 ALASKA on time San Francisco 503
## 5 ALASKA on time Seattle 1841
## 6 ALASKA delayed Los Angeles 62
## 7 ALASKA delayed Phoenix 12
## 8 ALASKA delayed San Diego 20
## 9 ALASKA delayed San Francisco 102
## 10 ALASKA delayed Seattle 305
## 11 AM WEST on time Los Angeles 694
## 12 AM WEST on time Phoenix 4840
## 13 AM WEST on time San Diego 383
## 14 AM WEST on time San Francisco 320
## 15 AM WEST on time Seattle 201
## 16 AM WEST delayed Los Angeles 117
## 17 AM WEST delayed Phoenix 415
## 18 AM WEST delayed San Diego 65
## 19 AM WEST delayed San Francisco 129
## 20 AM WEST delayed Seattle 61
Now we can start doing some data comparison.
statusTotals <- data.frame(aD %>% group_by(Airline, Status) %>% summarise(totalFlights = sum(NumFlights)))
statusTotals <- statusTotals %>% spread(Status, totalFlights)
statusTotals$delayed <- as.numeric(statusTotals$delayed)
statusTotals$`on time` <- as.numeric(statusTotals$`on time`)
statusTotals$onTimePercentage <- statusTotals$`on time`/(statusTotals$`on time` + statusTotals$delayed) * 100
statusTotals$delayedPercentage <- statusTotals$delayed/(statusTotals$`on time` + statusTotals$delayed) * 100
statusTotals
## Airline delayed on time onTimePercentage delayedPercentage
## 1 ALASKA 501 3274 86.72848 13.27152
## 2 AM WEST 787 6438 89.10727 10.89273