library(tidyr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyverse)
## -- Attaching packages --------------------------------------------------------- tidyverse 1.2.1 --
## v ggplot2 3.0.0     v purrr   0.2.5
## v tibble  1.4.2     v stringr 1.3.1
## v readr   1.1.1     v forcats 0.3.0
## -- Conflicts ------------------------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(knitr)
airlines <- read.csv("https://raw.githubusercontent.com/mandiemannz/Data-607--Fall-18/master/flights.csv", stringsAsFactors = FALSE)

airlines
##   Carrier  Status Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1  Alaska on_time         497     221       212           503    1841
## 2  Alaska delayed          62      12        20           102     305
## 3 am west on_time         694    4840       383           320     201
## 4 am west delayed         117     415        65           129      61

Data wrangling added new columns: total_flights, percent_ontime, and percent_delayed

airline_dt <-airlines
airline_df <-  airline_dt %>%
  gather("loc","flights ", 3:7) %>%
  spread(Status, "flights ") %>%
  mutate(total_flights = delayed + on_time) %>%
  mutate(percent_ontime = on_time / total_flights ) %>%
  mutate(percent_delayed = delayed / total_flights) 

airline_df
##    Carrier           loc delayed on_time total_flights percent_ontime
## 1   Alaska   Los.Angeles      62     497           559      0.8890877
## 2   Alaska       Phoenix      12     221           233      0.9484979
## 3   Alaska     San.Diego      20     212           232      0.9137931
## 4   Alaska San.Francisco     102     503           605      0.8314050
## 5   Alaska       Seattle     305    1841          2146      0.8578751
## 6  am west   Los.Angeles     117     694           811      0.8557337
## 7  am west       Phoenix     415    4840          5255      0.9210276
## 8  am west     San.Diego      65     383           448      0.8549107
## 9  am west San.Francisco     129     320           449      0.7126949
## 10 am west       Seattle      61     201           262      0.7671756
##    percent_delayed
## 1       0.11091234
## 2       0.05150215
## 3       0.08620690
## 4       0.16859504
## 5       0.14212488
## 6       0.14426634
## 7       0.07897241
## 8       0.14508929
## 9       0.28730512
## 10      0.23282443
airline_df %>%
  group_by(Carrier)%>%
  summarize(total_flights = sum(total_flights))
## # A tibble: 2 x 2
##   Carrier total_flights
##   <chr>           <int>
## 1 Alaska           3775
## 2 am west          7225

Now let us see which airline and city has the highest percentage of on-time flights

airline_df %>%
  filter(percent_ontime== max(percent_ontime))
##   Carrier     loc delayed on_time total_flights percent_ontime
## 1  Alaska Phoenix      12     221           233      0.9484979
##   percent_delayed
## 1      0.05150215

Alaska airline with flights from Phoenix had the highest on-time percentage from looking at the entire dataset. Percent of 0.95

Now we are going to check which airline and city has the worst percentage of on-time flights

airline_df %>%
  filter(percent_delayed== max(percent_delayed))
##   Carrier           loc delayed on_time total_flights percent_ontime
## 1 am west San.Francisco     129     320           449      0.7126949
##   percent_delayed
## 1       0.2873051

AM West airlines with flights from San Francisco had the most delayed percentage from looking at the entire dataset. Percent of 0.28 and are on-time 0.71 percent.

Visualization

ggplot(airline_df, aes(airline_df$on_time, airline_df$delayed)) +
  geom_point(aes(color = airline_df$Carrier)) + 
  ggtitle("Num of flights on-time vs. delayed for both Airlines") +
  xlab("On-time flights") +
  ylab("Delayed flights") +
  theme_dark()