Import Data

Import data from CSV

path <- 'https://raw.githubusercontent.com/dhairavc/DATA607/master/Flight_Delay.csv'
Flight_Delays <- read.csv(path)
Flight_Delays
##         X     X.1 Los.Angeles Phonex San.Diego San.Francisco Seattle
## 1  ALASKA on time         497    221       212           503    1841
## 2         delayed          62     12        20           102     305
## 3                          NA     NA        NA            NA      NA
## 4 AM WEST on time         694   4840       383           320     201
## 5         delayed         117    415        65           129      61

Tidy Data

Tidy data to a usable point, but not in completely tidy format

library(tidyr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(stringr)


Flight_Delays[2,1] <- "ALASKA"
Flight_Delays[5,1] <- "AM WEST"

tidy_flights <- Flight_Delays %>% na.omit()
tidy_flights <- tidy_flights %>% gather("City", "Count", 3:7)
colnames(tidy_flights) <- c("Airline", "Status", "City", "Count")
tidy_flights$City <- str_replace(tidy_flights$City, "\\.", " ")

tidy_flights %>% arrange(Airline, City)
##    Airline  Status          City Count
## 1   ALASKA on time   Los Angeles   497
## 2   ALASKA delayed   Los Angeles    62
## 3   ALASKA on time        Phonex   221
## 4   ALASKA delayed        Phonex    12
## 5   ALASKA on time     San Diego   212
## 6   ALASKA delayed     San Diego    20
## 7   ALASKA on time San Francisco   503
## 8   ALASKA delayed San Francisco   102
## 9   ALASKA on time       Seattle  1841
## 10  ALASKA delayed       Seattle   305
## 11 AM WEST on time   Los Angeles   694
## 12 AM WEST delayed   Los Angeles   117
## 13 AM WEST on time        Phonex  4840
## 14 AM WEST delayed        Phonex   415
## 15 AM WEST on time     San Diego   383
## 16 AM WEST delayed     San Diego    65
## 17 AM WEST on time San Francisco   320
## 18 AM WEST delayed San Francisco   129
## 19 AM WEST on time       Seattle   201
## 20 AM WEST delayed       Seattle    61

Data by City

Looking at the data just by the city we notice

library(ggplot2)
library(tibble)

Total_OnTime <- tidy_flights %>% filter(Status == "on time") %>% select("City", "Count") %>% group_by(City) %>% summarise(Total_OnTime = sum(Count))

Total_Delayed <- tidy_flights %>% filter(Status == "delayed") %>% select("City", "Count") %>% group_by(City) %>% summarise(Total_Delayed = sum(Count))

City_Totals <- inner_join(Total_OnTime, Total_Delayed, by = "City")
City_Totals$Ontime_Rate <- City_Totals$Total_OnTime/(City_Totals$Total_OnTime + City_Totals$Total_Delayed)
City_Totals
## # A tibble: 5 x 4
##   City          Total_OnTime Total_Delayed Ontime_Rate
##   <chr>                <int>         <int>       <dbl>
## 1 Los Angeles           1191           179       0.869
## 2 Phonex                5061           427       0.922
## 3 San Diego              595            85       0.875
## 4 San Francisco          823           231       0.781
## 5 Seattle               2042           366       0.848
City_Totals_Sum <- City_Totals %>% gather("Status", "Count", 2:3)

ggplot(City_Totals_Sum, aes(x = City_Totals_Sum$City, y = City_Totals_Sum$Count)) + 
  geom_bar(aes(fill = City_Totals_Sum$Status), position = "dodge", stat="identity") +
  xlab("Airports") +
  ylab("Count") +
  labs(fill = "Legend")

Data By Airline

Looking at the data just by Airline, we learn

library(tibble)

tidy_flights2 <- tidy_flights %>% spread(Status, Count)
colnames(tidy_flights2) <- c("Airline", "City", "Delayed", "OnTime")


alaska <- tidy_flights2 %>% filter(Airline == "ALASKA")
amwest <- tidy_flights2 %>% filter(Airline == "AM WEST")


Airline_Totals <- rbind(alaska %>% select("Airline", "Delayed", "OnTime") %>% summarise(Airline = unique(Airline), Delayed = sum(Delayed), OnTime = sum(OnTime)), 
                        amwest %>% select("Airline", "Delayed", "OnTime") %>% summarise(Airline = unique(Airline), Delayed = sum(Delayed), OnTime = sum(OnTime))
                       )
Airline_Totals %>% gather(Status, Count, -Airline) %>%
  ggplot(aes(Airline, Count, fill = Status)) + 
           geom_col(position = "dodge", color = "black") + #adding black outline to bar
           facet_wrap(~Airline, scales = "free_x") + 
           scale_fill_manual(values = c("plum1", "slategray1")) + #adding custom colors
           theme_bw() + #plot background theme
           geom_text(aes(label = Count), position = position_dodge(.9), vjust = 1.5) #bar counts

add_column(Airline_Totals, Delay_Percent = Airline_Totals$Delayed/(rowSums(Airline_Totals[2:3])))
##   Airline Delayed OnTime Delay_Percent
## 1  ALASKA     501   3274     0.1327152
## 2 AM WEST     787   6438     0.1089273

Data by City and Airline

When comparing both Airline and City we learn that

library(dplyr)
library(tibble)
library(RColorBrewer)


tidy_flights3 <- add_column(bind_rows(alaska,amwest), Delay_Percent = tidy_flights2$Delayed/rowSums(tidy_flights2[3:4])) %>% arrange(Delay_Percent)
tidy_flights3
##    Airline          City Delayed OnTime Delay_Percent
## 1   ALASKA        Phonex      12    221    0.05150215
## 2  AM WEST        Phonex     415   4840    0.07897241
## 3   ALASKA     San Diego      20    212    0.08620690
## 4   ALASKA   Los Angeles      62    497    0.11091234
## 5   ALASKA       Seattle     305   1841    0.14212488
## 6  AM WEST   Los Angeles     117    694    0.14426634
## 7  AM WEST     San Diego      65    383    0.14508929
## 8   ALASKA San Francisco     102    503    0.16859504
## 9  AM WEST       Seattle      61    201    0.23282443
## 10 AM WEST San Francisco     129    320    0.28730512
tidy_flights3 %>% ggplot(aes(x = City, y = Delay_Percent, color=Airline)) + 
  geom_point(aes(fill = Airline), size=5) + 
  scale_color_brewer(palette = "Accent")

Conclusions

Based on the limited analysis