##install.packages("tidyverse")
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 3.4.2
## Loading tidyverse: ggplot2
## Loading tidyverse: tibble
## Loading tidyverse: tidyr
## Loading tidyverse: readr
## Loading tidyverse: purrr
## Loading tidyverse: dplyr
## Warning: package 'tidyr' was built under R version 3.4.2
## Warning: package 'purrr' was built under R version 3.4.2
## Warning: package 'dplyr' was built under R version 3.4.2
## Conflicts with tidy packages ----------------------------------------------
## filter(): dplyr, stats
## lag():    dplyr, stats
library(dplyr)
library(ggplot2)
library(stringr)
##Data structure
arrival.delays <- read.csv("https://raw.githubusercontent.com/doradu8030/Data607/master/Arrival%20delays%20by%20Airline.csv", stringsAsFactor = FALSE)
arrival.delays
##         X     X.1 Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1  ALASKA on time         497     221       212           503    1841
## 2         delayed          62      12        20           102     305
## 3                          NA      NA        NA            NA      NA
## 4 AM WEST on time         694    4840       383           320     201
## 5         delayed         117     415        65           129      61

Tyding the Data and Manipulating string

This chunk is replacing the character " " to “_" into the word “on time” Adding the airline name to the two rows whitout it removing the empty row

arrival.delays[2,1] <- arrival.delays[1,1]
arrival.delays[5,1] <- arrival.delays[4,1]
arrival.delays[,2] <- sapply(arrival.delays[,2], str_replace, " ", "_")

## *removing the empty row*
arrival.delays <- na.omit(arrival.delays) 
arrival.delays
##         X     X.1 Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1  ALASKA on_time         497     221       212           503    1841
## 2  ALASKA delayed          62      12        20           102     305
## 4 AM WEST on_time         694    4840       383           320     201
## 5 AM WEST delayed         117     415        65           129      61

Naming Columns

colnames(arrival.delays)[1] <- "Airline"
colnames(arrival.delays)[2] <- "Status"
arrival.delays
##   Airline  Status Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1  ALASKA on_time         497     221       212           503    1841
## 2  ALASKA delayed          62      12        20           102     305
## 4 AM WEST on_time         694    4840       383           320     201
## 5 AM WEST delayed         117     415        65           129      61

Melting the data : List of columns from 3 to 7 that fall into a variable “City”

draft.table <- gather(arrival.delays, "City", "Count", 3:7)
draft.table$City <- str_replace(draft.table$City, "[.]", " ")
draft.table
##    Airline  Status          City Count
## 1   ALASKA on_time   Los Angeles   497
## 2   ALASKA delayed   Los Angeles    62
## 3  AM WEST on_time   Los Angeles   694
## 4  AM WEST delayed   Los Angeles   117
## 5   ALASKA on_time       Phoenix   221
## 6   ALASKA delayed       Phoenix    12
## 7  AM WEST on_time       Phoenix  4840
## 8  AM WEST delayed       Phoenix   415
## 9   ALASKA on_time     San Diego   212
## 10  ALASKA delayed     San Diego    20
## 11 AM WEST on_time     San Diego   383
## 12 AM WEST delayed     San Diego    65
## 13  ALASKA on_time San Francisco   503
## 14  ALASKA delayed San Francisco   102
## 15 AM WEST on_time San Francisco   320
## 16 AM WEST delayed San Francisco   129
## 17  ALASKA on_time       Seattle  1841
## 18  ALASKA delayed       Seattle   305
## 19 AM WEST on_time       Seattle   201
## 20 AM WEST delayed       Seattle    61

Analysis

Total flights delayed group by Airline and City

f.arrival.delayed <- spread(draft.table, Status, Count)
totaldelayed <- f.arrival.delayed %>%
  group_by(Airline, City) %>% 
  summarise(fligh.delay = sum(delayed))
totaldelayed
## # A tibble: 10 x 3
## # Groups:   Airline [?]
##    Airline          City fligh.delay
##      <chr>         <chr>       <int>
##  1  ALASKA   Los Angeles          62
##  2  ALASKA       Phoenix          12
##  3  ALASKA     San Diego          20
##  4  ALASKA San Francisco         102
##  5  ALASKA       Seattle         305
##  6 AM WEST   Los Angeles         117
##  7 AM WEST       Phoenix         415
##  8 AM WEST     San Diego          65
##  9 AM WEST San Francisco         129
## 10 AM WEST       Seattle          61

Total flights on-time group by Airline and City

f.arrival.ontime <- spread(draft.table, Status, Count)
totalontime <- f.arrival.delayed %>%
  group_by(Airline, City) %>% 
  summarise(flight.ontime = sum(on_time))
totalontime
## # A tibble: 10 x 3
## # Groups:   Airline [?]
##    Airline          City flight.ontime
##      <chr>         <chr>         <int>
##  1  ALASKA   Los Angeles           497
##  2  ALASKA       Phoenix           221
##  3  ALASKA     San Diego           212
##  4  ALASKA San Francisco           503
##  5  ALASKA       Seattle          1841
##  6 AM WEST   Los Angeles           694
##  7 AM WEST       Phoenix          4840
##  8 AM WEST     San Diego           383
##  9 AM WEST San Francisco           320
## 10 AM WEST       Seattle           201

Total of flights delayed and on-time by Airline & shows What percentage of flights are delayed by Airline What percentage of flights are on-time by Airline

totalstatus <- f.arrival.delayed %>% 
  group_by(Airline) %>%
  summarise(tot_delayed=sum(delayed),tot_on_time=sum(on_time),Perc_delay=round((tot_delayed/(tot_delayed+tot_on_time))*100, 2),Perc_on_time=round((tot_on_time/(tot_delayed+tot_on_time))*100, 2))
totalstatus
## # A tibble: 2 x 5
##   Airline tot_delayed tot_on_time Perc_delay Perc_on_time
##     <chr>       <int>       <int>      <dbl>        <dbl>
## 1  ALASKA         501        3274      13.27        86.73
## 2 AM WEST         787        6438      10.89        89.11