##install.packages("tidyverse")
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 3.4.2
## Loading tidyverse: ggplot2
## Loading tidyverse: tibble
## Loading tidyverse: tidyr
## Loading tidyverse: readr
## Loading tidyverse: purrr
## Loading tidyverse: dplyr
## Warning: package 'tidyr' was built under R version 3.4.2
## Warning: package 'purrr' was built under R version 3.4.2
## Warning: package 'dplyr' was built under R version 3.4.2
## Conflicts with tidy packages ----------------------------------------------
## filter(): dplyr, stats
## lag(): dplyr, stats
library(dplyr)
library(ggplot2)
library(stringr)
##Data structure
arrival.delays <- read.csv("https://raw.githubusercontent.com/doradu8030/Data607/master/Arrival%20delays%20by%20Airline.csv", stringsAsFactor = FALSE)
arrival.delays
## X X.1 Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1 ALASKA on time 497 221 212 503 1841
## 2 delayed 62 12 20 102 305
## 3 NA NA NA NA NA
## 4 AM WEST on time 694 4840 383 320 201
## 5 delayed 117 415 65 129 61
This chunk is replacing the character " " to “_" into the word “on time” Adding the airline name to the two rows whitout it removing the empty row
arrival.delays[2,1] <- arrival.delays[1,1]
arrival.delays[5,1] <- arrival.delays[4,1]
arrival.delays[,2] <- sapply(arrival.delays[,2], str_replace, " ", "_")
## *removing the empty row*
arrival.delays <- na.omit(arrival.delays)
arrival.delays
## X X.1 Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1 ALASKA on_time 497 221 212 503 1841
## 2 ALASKA delayed 62 12 20 102 305
## 4 AM WEST on_time 694 4840 383 320 201
## 5 AM WEST delayed 117 415 65 129 61
colnames(arrival.delays)[1] <- "Airline"
colnames(arrival.delays)[2] <- "Status"
arrival.delays
## Airline Status Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1 ALASKA on_time 497 221 212 503 1841
## 2 ALASKA delayed 62 12 20 102 305
## 4 AM WEST on_time 694 4840 383 320 201
## 5 AM WEST delayed 117 415 65 129 61
draft.table <- gather(arrival.delays, "City", "Count", 3:7)
draft.table$City <- str_replace(draft.table$City, "[.]", " ")
draft.table
## Airline Status City Count
## 1 ALASKA on_time Los Angeles 497
## 2 ALASKA delayed Los Angeles 62
## 3 AM WEST on_time Los Angeles 694
## 4 AM WEST delayed Los Angeles 117
## 5 ALASKA on_time Phoenix 221
## 6 ALASKA delayed Phoenix 12
## 7 AM WEST on_time Phoenix 4840
## 8 AM WEST delayed Phoenix 415
## 9 ALASKA on_time San Diego 212
## 10 ALASKA delayed San Diego 20
## 11 AM WEST on_time San Diego 383
## 12 AM WEST delayed San Diego 65
## 13 ALASKA on_time San Francisco 503
## 14 ALASKA delayed San Francisco 102
## 15 AM WEST on_time San Francisco 320
## 16 AM WEST delayed San Francisco 129
## 17 ALASKA on_time Seattle 1841
## 18 ALASKA delayed Seattle 305
## 19 AM WEST on_time Seattle 201
## 20 AM WEST delayed Seattle 61
Total flights delayed group by Airline and City
f.arrival.delayed <- spread(draft.table, Status, Count)
totaldelayed <- f.arrival.delayed %>%
group_by(Airline, City) %>%
summarise(fligh.delay = sum(delayed))
totaldelayed
## # A tibble: 10 x 3
## # Groups: Airline [?]
## Airline City fligh.delay
## <chr> <chr> <int>
## 1 ALASKA Los Angeles 62
## 2 ALASKA Phoenix 12
## 3 ALASKA San Diego 20
## 4 ALASKA San Francisco 102
## 5 ALASKA Seattle 305
## 6 AM WEST Los Angeles 117
## 7 AM WEST Phoenix 415
## 8 AM WEST San Diego 65
## 9 AM WEST San Francisco 129
## 10 AM WEST Seattle 61
Total flights on-time group by Airline and City
f.arrival.ontime <- spread(draft.table, Status, Count)
totalontime <- f.arrival.delayed %>%
group_by(Airline, City) %>%
summarise(flight.ontime = sum(on_time))
totalontime
## # A tibble: 10 x 3
## # Groups: Airline [?]
## Airline City flight.ontime
## <chr> <chr> <int>
## 1 ALASKA Los Angeles 497
## 2 ALASKA Phoenix 221
## 3 ALASKA San Diego 212
## 4 ALASKA San Francisco 503
## 5 ALASKA Seattle 1841
## 6 AM WEST Los Angeles 694
## 7 AM WEST Phoenix 4840
## 8 AM WEST San Diego 383
## 9 AM WEST San Francisco 320
## 10 AM WEST Seattle 201
Total of flights delayed and on-time by Airline & shows What percentage of flights are delayed by Airline What percentage of flights are on-time by Airline
totalstatus <- f.arrival.delayed %>%
group_by(Airline) %>%
summarise(tot_delayed=sum(delayed),tot_on_time=sum(on_time),Perc_delay=round((tot_delayed/(tot_delayed+tot_on_time))*100, 2),Perc_on_time=round((tot_on_time/(tot_delayed+tot_on_time))*100, 2))
totalstatus
## # A tibble: 2 x 5
## Airline tot_delayed tot_on_time Perc_delay Perc_on_time
## <chr> <int> <int> <dbl> <dbl>
## 1 ALASKA 501 3274 13.27 86.73
## 2 AM WEST 787 6438 10.89 89.11