# install.packages("nycflights13")
library(nycflights13)
# Copy the packaged flights table into the workspace and preview the first rows.
flights <- nycflights13::flights
head(x = flights, n = 6L)
## # A tibble: 6 × 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 517 515 2 830 819
## 2 2013 1 1 533 529 4 850 830
## 3 2013 1 1 542 540 2 923 850
## 4 2013 1 1 544 545 -1 1004 1022
## 5 2013 1 1 554 600 -6 812 837
## 6 2013 1 1 554 558 -4 740 728
## # … with 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## # hour <dbl>, minute <dbl>, time_hour <dttm>
# Size of the flights table as (rows, columns).
c(nrow(flights), ncol(flights))
## [1] 336776 19
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✔ ggplot2 3.3.6 ✔ purrr 0.3.4
## ✔ tibble 3.1.7 ✔ dplyr 1.0.9
## ✔ tidyr 1.2.0 ✔ stringr 1.4.0
## ✔ readr 2.1.2 ✔ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
# Build the summary table: one row per flight with a binary `delay` flag.
flight_sum <- flights %>%
  # Group flight cancellation and flight delay into one level:
  #   delay = 1  departed 15 or more minutes late, or cancelled (dep_delay is NA)
  #   delay = 0  departed less than 15 minutes late
  # For a cancelled flight the comparison is NA, but TRUE | NA is TRUE, so the
  # flag is 1 rather than NA and `delay` never contains missing values.
  mutate(
    delay = ifelse(is.na(dep_delay) | dep_delay >= 15, 1, 0),
    carrier = factor(carrier)
  ) %>%
  # Select relevant variables and save to a new data table
  select(delay, year, month, day, carrier, distance, hour, time_hour)
head(flight_sum)
## # A tibble: 6 × 8
## delay year month day carrier distance hour time_hour
## <dbl> <int> <int> <int> <fct> <dbl> <dbl> <dttm>
## 1 0 2013 1 1 UA 1400 5 2013-01-01 05:00:00
## 2 0 2013 1 1 UA 1416 5 2013-01-01 05:00:00
## 3 0 2013 1 1 AA 1089 5 2013-01-01 05:00:00
## 4 0 2013 1 1 B6 1576 5 2013-01-01 05:00:00
## 5 0 2013 1 1 DL 762 6 2013-01-01 06:00:00
## 6 0 2013 1 1 UA 719 5 2013-01-01 05:00:00
# Correlation between departure delay and arrival delay. Only pairwise-complete
# observations are used, so flights with a missing delay (e.g. cancelled
# flights) are left out.
flights %>%
  select(dep_delay, arr_delay) %>%
  cor(use = "pairwise.complete.obs")
## dep_delay arr_delay
## dep_delay 1.0000000 0.9148028
## arr_delay 0.9148028 1.0000000
# Scatterplot matrix of departure delay against arrival delay.
flights %>%
  select(dep_delay, arr_delay) %>%
  pairs()
# Proportion of flight delays and cancellations in this dataset: the count of
# each `delay` level divided by the total number of rows, to three decimals.
round(table(flight_sum[["delay"]]) / nrow(flight_sum), digits = 3)
##
## 1
## 0.975
# Proportion of flight delays by airline, sorted from highest to lowest, with
# the full airline name joined on by carrier code.
# mean() returns NA for a carrier if any of its `delay` values is NA.
flight_sum %>%
  group_by(carrier) %>%
  summarise(prop.delay = mean(delay == 1)) %>%
  arrange(desc(prop.delay)) %>%
  left_join(airlines, by = "carrier")
## # A tibble: 16 × 3
## carrier prop.delay name
## <chr> <dbl> <chr>
## 1 HA 1 Hawaiian Airlines Inc.
## 2 9E NA Endeavor Air Inc.
## 3 AA NA American Airlines Inc.
## 4 AS NA Alaska Airlines Inc.
## 5 B6 NA JetBlue Airways
## 6 DL NA Delta Air Lines Inc.
## 7 EV NA ExpressJet Airlines Inc.
## 8 F9 NA Frontier Airlines Inc.
## 9 FL NA AirTran Airways Corporation
## 10 MQ NA Envoy Air
## 11 OO NA SkyWest Airlines Inc.
## 12 UA NA United Air Lines Inc.
## 13 US NA US Airways Inc.
## 14 VX NA Virgin America
## 15 WN NA Southwest Airlines Co.
## 16 YV NA Mesa Airlines Inc.
# Attach the airline lookup columns (full airline name) to every flight row.
flight_airline <- flights %>%
  left_join(airlines, by = "carrier")
# Number of flights by airline, split by departure status.
# Status is derived from dep_delay: under 15 minutes is "on-time", 15 minutes
# or more is "delayed", and a missing value is treated as "cancelled".
flight_airline %>%
  mutate(
    delay_group = case_when(
      is.na(dep_delay) ~ "cancelled",
      dep_delay >= 15 ~ "delayed",
      dep_delay < 15 ~ "on-time"
    )
  ) %>%
  ggplot(aes(x = name, fill = delay_group)) +
  geom_bar(stat = "count", position = "dodge") +
  # Flip the axes so the long airline names are readable
  coord_flip() +
  theme(legend.position = "top") +
  scale_fill_manual(
    values = c("on-time" = "green", "delayed" = "yellow", "cancelled" = "red")
  ) +
  xlab("Airline Names") +
  ylab("Flight Count") +
  # The groups are built from dep_delay, so the legend is titled accordingly
  guides(fill = guide_legend(title = "Departure Status"))
For this New York City Flights 2013 data visualization, head() displays the first 6 rows of the data, and dim() shows that the data set has 336,776 observations of 19 variables. With the tidyverse and lubridate packages loaded, the flights were summarised with a dplyr pipeline that groups flight cancellations and flight delays into one level. In mutate(), ifelse() was used to set the delay status: a departure delay of 15 minutes or more counts as delayed, and a missing (NA) departure delay is taken to mean a cancelled flight. Carrier was converted to a factor in the script. I used stoplight colors for cancelled, delayed, and on-time flights because they are widely understood, although they are hard to tell apart for color-blind readers. The finding is that United Air Lines had the most on-time flights and the highest flight count, while AirTran Airways Corporation had the lowest flight count, on-time or otherwise.
Yen Tran https://rpubs.com/tranyen/496061