library(tidyr)
library(dplyr)##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyverse)## -- Attaching packages --------------------------------------------------------- tidyverse 1.2.1 --
## v ggplot2 3.0.0 v purrr 0.2.5
## v tibble 1.4.2 v stringr 1.3.1
## v readr 1.1.1 v forcats 0.3.0
## -- Conflicts ------------------------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(knitr)airlines <- read.csv("https://raw.githubusercontent.com/mandiemannz/Data-607--Fall-18/master/flights.csv", stringsAsFactors = FALSE)
airlines## Carrier Status Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1 Alaska on_time 497 221 212 503 1841
## 2 Alaska delayed 62 12 20 102 305
## 3 am west on_time 694 4840 383 320 201
## 4 am west delayed 117 415 65 129 61
Data wrangling added new columns: total_flights, percent_ontime, and percent_delayed
airline_dt <-airlines
airline_df <- airline_dt %>%
gather("loc","flights ", 3:7) %>%
spread(Status, "flights ") %>%
mutate(total_flights = delayed + on_time) %>%
mutate(percent_ontime = on_time / total_flights ) %>%
mutate(percent_delayed = delayed / total_flights)
airline_df## Carrier loc delayed on_time total_flights percent_ontime
## 1 Alaska Los.Angeles 62 497 559 0.8890877
## 2 Alaska Phoenix 12 221 233 0.9484979
## 3 Alaska San.Diego 20 212 232 0.9137931
## 4 Alaska San.Francisco 102 503 605 0.8314050
## 5 Alaska Seattle 305 1841 2146 0.8578751
## 6 am west Los.Angeles 117 694 811 0.8557337
## 7 am west Phoenix 415 4840 5255 0.9210276
## 8 am west San.Diego 65 383 448 0.8549107
## 9 am west San.Francisco 129 320 449 0.7126949
## 10 am west Seattle 61 201 262 0.7671756
## percent_delayed
## 1 0.11091234
## 2 0.05150215
## 3 0.08620690
## 4 0.16859504
## 5 0.14212488
## 6 0.14426634
## 7 0.07897241
## 8 0.14508929
## 9 0.28730512
## 10 0.23282443
airline_df %>%
group_by(Carrier)%>%
summarize(total_flights = sum(total_flights))## # A tibble: 2 x 2
## Carrier total_flights
## <chr> <int>
## 1 Alaska 3775
## 2 am west 7225
Now let us see which airline and city has the highest percentage of on-time flights
airline_df %>%
filter(percent_ontime== max(percent_ontime))## Carrier loc delayed on_time total_flights percent_ontime
## 1 Alaska Phoenix 12 221 233 0.9484979
## percent_delayed
## 1 0.05150215
Alaska airline with flights from Phoenix had the highest on-time percentage from looking at the entire dataset. Percent of 0.95
Now we are going to check which airline and city has the worst percentage of on-time flights
airline_df %>%
filter(percent_delayed== max(percent_delayed))## Carrier loc delayed on_time total_flights percent_ontime
## 1 am west San.Francisco 129 320 449 0.7126949
## percent_delayed
## 1 0.2873051
AM West airlines with flights from San Francisco had the most delayed percentage from looking at the entire dataset. Percent of 0.28 and are on-time 0.71 percent.
Visualization
ggplot(airline_df, aes(airline_df$on_time, airline_df$delayed)) +
geom_point(aes(color = airline_df$Carrier)) +
ggtitle("Num of flights on-time vs. delayed for both Airlines") +
xlab("On-time flights") +
ylab("Delayed flights") +
theme_dark()