NYC Flights Homework

Load the libraries and view the “flights” dataset

# One-time setup (uncomment to run): install.packages("nycflights13")
library(nycflights13)
# Bind the packaged `flights` table to a variable in the working environment
flights <- nycflights13::flights
# Preview the first six rows
head(flights, n = 6)
## # A tibble: 6 × 19
##    year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##   <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
## 1  2013     1     1      517            515         2      830            819
## 2  2013     1     1      533            529         4      850            830
## 3  2013     1     1      542            540         2      923            850
## 4  2013     1     1      544            545        -1     1004           1022
## 5  2013     1     1      554            600        -6      812            837
## 6  2013     1     1      554            558        -4      740            728
## # … with 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## #   hour <dbl>, minute <dbl>, time_hour <dttm>
# Size of the table as (rows, columns)
c(nrow(flights), ncol(flights))
## [1] 336776     19
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✔ ggplot2 3.3.6     ✔ purrr   0.3.4
## ✔ tibble  3.1.7     ✔ dplyr   1.0.9
## ✔ tidyr   1.2.0     ✔ stringr 1.4.0
## ✔ readr   2.1.2     ✔ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
# Build a reduced table with a binary `delay` indicator per flight.
flight_sum <- flights %>%
  # Group flight cancellation and flight delay into one level:
  #   delay = 1 when the departure was at least 15 minutes late OR the flight
  #           has no recorded departure delay (NA, treated as cancelled);
  #   delay = 0 otherwise.
  # `NA >= 15` is NA, but `NA | TRUE` is TRUE, so missing delays land in the
  # "1" branch instead of propagating NA into the indicator.
  mutate(
    delay = ifelse(dep_delay >= 15 | is.na(dep_delay), 1, 0),
    carrier = factor(carrier)
  ) %>%
  # Select relevant variables and save to a new data table
  select(delay, year, month, day, carrier, distance, hour, time_hour)
head(flight_sum)
## # A tibble: 6 × 8
##   delay  year month   day carrier distance  hour time_hour          
##   <dbl> <int> <int> <int> <fct>      <dbl> <dbl> <dttm>             
## 1     1  2013     1     1 UA          1400     5 2013-01-01 05:00:00
## 2     1  2013     1     1 UA          1416     5 2013-01-01 05:00:00
## 3     1  2013     1     1 AA          1089     5 2013-01-01 05:00:00
## 4     1  2013     1     1 B6          1576     5 2013-01-01 05:00:00
## 5     1  2013     1     1 DL           762     6 2013-01-01 06:00:00
## 6     1  2013     1     1 UA           719     5 2013-01-01 05:00:00
# Correlation between departure delay and arrival delay. Pairwise-complete
# observations are used, so flights missing either value (e.g. cancelled
# flights) are left out of the calculation.
flights %>%
  select(dep_delay, arr_delay) %>%
  cor(use = "pairwise.complete.obs")
##           dep_delay arr_delay
## dep_delay 1.0000000 0.9148028
## arr_delay 0.9148028 1.0000000
# Scatterplot matrix of departure delay against arrival delay
flights %>%
  select(dep_delay, arr_delay) %>%
  pairs()

# Proportion of flight delays and cancellations in this dataset.
# The denominator is the full row count of flight_sum; table() does not count
# NA values of `delay`, so such rows appear in the denominator only.
round(
  table(flight_sum$delay) / nrow(flight_sum),
  digits = 3
)
## 
##     1 
## 0.975
# Proportion of flight delays by airline, sorted from highest to lowest,
# with the full airline name attached from the `airlines` lookup table
flight_sum %>%
  group_by(carrier) %>%
  summarise(prop.delay = mean(delay == 1)) %>%
  arrange(desc(prop.delay)) %>%
  left_join(airlines, by = "carrier")
## # A tibble: 16 × 3
##    carrier prop.delay name                       
##    <chr>        <dbl> <chr>                      
##  1 HA               1 Hawaiian Airlines Inc.     
##  2 9E              NA Endeavor Air Inc.          
##  3 AA              NA American Airlines Inc.     
##  4 AS              NA Alaska Airlines Inc.       
##  5 B6              NA JetBlue Airways            
##  6 DL              NA Delta Air Lines Inc.       
##  7 EV              NA ExpressJet Airlines Inc.   
##  8 F9              NA Frontier Airlines Inc.     
##  9 FL              NA AirTran Airways Corporation
## 10 MQ              NA Envoy Air                  
## 11 OO              NA SkyWest Airlines Inc.      
## 12 UA              NA United Air Lines Inc.      
## 13 US              NA US Airways Inc.            
## 14 VX              NA Virgin America             
## 15 WN              NA Southwest Airlines Co.     
## 16 YV              NA Mesa Airlines Inc.
# Attach the full airline name to every flight
flight_airline <- left_join(flights, airlines, by = "carrier")
# Number of flights by airline, split by departure status.
# Status is derived from dep_delay: under 15 minutes is "on-time", 15 minutes
# or more is "delayed", and a missing value is treated as "cancelled".
flight_airline %>%
  mutate(
    delay_group = case_when(
      dep_delay < 15 ~ "on-time",
      dep_delay >= 15 ~ "delayed",
      is.na(dep_delay) ~ "cancelled"
    )
  ) %>%
  ggplot(aes(x = name, fill = delay_group)) +
  geom_bar(stat = "count", position = "dodge") +
  # Flip so the long airline names are readable on the vertical axis
  coord_flip() +
  theme(legend.position = "top") +
  scale_fill_manual(
    values = c("on-time" = "green", "delayed" = "yellow", "cancelled" = "red")
  ) +
  # The fill groups come from departure delay, so the legend is titled
  # accordingly (it previously read "Arrival Time")
  labs(x = "Airline Names", y = "Flight Count", fill = "Departure Status")

Write a brief paragraph that describes the visualization you have created and at least one aspect of the plot that you would like to highlight.

This visualization uses the 2013 New York City flights data. The call to head() displays the first 6 rows of the data, and dim() shows that the single data set contains 336,776 observations of 19 variables. With the tidyverse and lubridate packages loaded, the flights were summarised by piping the data into a step that groups flight cancellations and flight delays into one level: mutate() with ifelse() creates a delay status that flags any departure delay of 15 minutes or more, along with any flight whose departure delay is NA, meaning the flight was cancelled. Carrier was converted to a factor in the script. I used stoplight colors for cancelled, delayed, and on-time flights because they are widely understood, although they are difficult for color-blind readers to tell apart. The plot shows that United Air Lines had the most on-time flights and the highest flight count. AirTran Airways Corporation had the fewest flights, on-time or otherwise.