NYCFlights

Load the library:

library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.1     ✔ tibble    3.2.1
✔ lubridate 1.9.4     ✔ tidyr     1.3.1
✔ purrr     1.0.4     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(nycflights23)

Load the dataset:

# Load the flights table shipped with nycflights23 and print per-column
# summary statistics (ranges, quartiles, and NA counts).
data("flights", package = "nycflights23")
summary(flights)
      year          month             day           dep_time     sched_dep_time
 Min.   :2023   Min.   : 1.000   Min.   : 1.00   Min.   :   1    Min.   : 500  
 1st Qu.:2023   1st Qu.: 3.000   1st Qu.: 8.00   1st Qu.: 931    1st Qu.: 930  
 Median :2023   Median : 6.000   Median :16.00   Median :1357    Median :1359  
 Mean   :2023   Mean   : 6.423   Mean   :15.74   Mean   :1366    Mean   :1364  
 3rd Qu.:2023   3rd Qu.: 9.000   3rd Qu.:23.00   3rd Qu.:1804    3rd Qu.:1759  
 Max.   :2023   Max.   :12.000   Max.   :31.00   Max.   :2400    Max.   :2359  
                                                 NA's   :10738                 
   dep_delay          arr_time     sched_arr_time   arr_delay       
 Min.   : -50.00   Min.   :   1    Min.   :   1   Min.   : -97.000  
 1st Qu.:  -6.00   1st Qu.:1105    1st Qu.:1135   1st Qu.: -22.000  
 Median :  -2.00   Median :1519    Median :1551   Median : -10.000  
 Mean   :  13.84   Mean   :1497    Mean   :1552   Mean   :   4.345  
 3rd Qu.:  10.00   3rd Qu.:1946    3rd Qu.:2007   3rd Qu.:   9.000  
 Max.   :1813.00   Max.   :2400    Max.   :2359   Max.   :1812.000  
 NA's   :10738     NA's   :11453                  NA's   :12534     
   carrier              flight         tailnum             origin         
 Length:435352      Min.   :   1.0   Length:435352      Length:435352     
 Class :character   1st Qu.: 364.0   Class :character   Class :character  
 Mode  :character   Median : 734.0   Mode  :character   Mode  :character  
                    Mean   : 785.2                                        
                    3rd Qu.:1188.0                                        
                    Max.   :1972.0                                        
                                                                          
     dest              air_time        distance           hour      
 Length:435352      Min.   : 18.0   Min.   :  80.0   Min.   : 5.00  
 Class :character   1st Qu.: 77.0   1st Qu.: 479.0   1st Qu.: 9.00  
 Mode  :character   Median :121.0   Median : 762.0   Median :13.00  
                    Mean   :141.8   Mean   : 977.5   Mean   :13.35  
                    3rd Qu.:177.0   3rd Qu.:1182.0   3rd Qu.:17.00  
                    Max.   :701.0   Max.   :4983.0   Max.   :23.00  
                    NA's   :12534                                   
     minute        time_hour                     
 Min.   : 0.00   Min.   :2023-01-01 05:00:00.00  
 1st Qu.:10.00   1st Qu.:2023-03-30 20:00:00.00  
 Median :29.00   Median :2023-06-27 08:00:00.00  
 Mean   :28.53   Mean   :2023-06-29 10:02:22.39  
 3rd Qu.:45.00   3rd Qu.:2023-09-27 11:00:00.00  
 Max.   :59.00   Max.   :2023-12-31 23:00:00.00  
                                                 

Select the variables of interest and keep only flights departing from JFK:

# Keep flights that departed from JFK and have a recorded distance, then
# narrow the table to the columns used by the plots that follow.
flights1 <- flights |>
  filter(origin == "JFK", !is.na(distance)) |>
  select(dep_delay, month, flight, carrier, distance, origin, dest)
# Preview the first rows of the reduced table.
head(flights1)
# A tibble: 6 × 7
  dep_delay month flight carrier distance origin dest 
      <dbl> <int>  <int> <chr>      <dbl> <chr>  <chr>
1        78     1    393 DL           760 JFK    ATL  
2        47     1    371 B6          1576 JFK    BQN  
3       173     1   1053 B6           636 JFK    CHS  
4        10     1    996 B6          1576 JFK    BQN  
5       -10     1    800 B6          1028 JFK    PBI  
6        -6     1    165 AA          1089 JFK    MIA  
# Scatter plot of departure delay against flight distance for the JFK
# departures in flights1, with one colour per carrier.
# Rows whose dep_delay is missing are dropped by geom_point() with a warning.
# NOTE(review): the caption credits the FAA Aircraft registry; confirm this is
# the documented source of the nycflights23 flights table.
plot1 <- flights1 |>
  ggplot() +
  # alpha is a fixed transparency, so it is set outside aes(); inside aes() it
  # is treated as a data mapping and adds a spurious "alpha" legend entry.
  geom_point(aes(x = distance, y = dep_delay, color = carrier), alpha = 0.2) +
  labs(
    color = "Airline",
    x = "Flight Distance",
    y = "Departure Delay",
    title = "Comparison of Flight Distance to Departure Delay by Carrier",
    caption = "Source: FAA Aircraft registry"
  )
plot1
Warning: Removed 2848 rows containing missing values or values outside the scale range
(`geom_point()`).

# Replace numeric months and two-letter carrier codes with display labels.

# Month labels in calendar order: element i is the label for month number i.
month_labels <- c(
  "Jan", "Feb", "March", "April", "May", "June",
  "July", "Aug", "Sept", "Oct", "Nov", "Dec"
)

# Carrier code -> airline display name.
carrier_names <- c(
  "YX" = "Republic Airlines",
  "OO" = "Sky West Airlines",
  "HA" = "Hawaiian Airline",
  "DL" = "Delta Airlines",
  "B6" = "Jet Blue",
  "AS" = "Alaska Airlines",
  "AA" = "American Airlines",
  "9E" = "Endeavor Air"
)

flights1 <- flights1 |>
  mutate(
    # A factor with explicit levels keeps the months in calendar order on plot
    # axes; plain character labels would be sorted alphabetically.
    month = factor(month_labels[month], levels = month_labels),
    # Codes without an entry in carrier_names are left unchanged.
    carrier = coalesce(unname(carrier_names[carrier]), carrier)
  )
# Heat map of flight distance by month and airline carrier.
# geom_tile() draws one tile per input row, so the data are first reduced to a
# single mean distance per month/carrier cell; plotting the raw flights would
# stack many tiles per cell and show only the colour of the last one drawn.
flights1 |>
  summarise(mean_distance = mean(distance), .by = c(month, carrier)) |>
  ggplot(aes(x = month, y = carrier, fill = mean_distance)) +
  geom_tile() +
  scale_fill_distiller(palette = "BuPu") +
  theme_bw() +
  # Rotate the month labels so they do not overlap.
  theme(axis.text.x = element_text(angle = 90)) +
  labs(
    title = "Flight Distance by Airline Carrier",
    caption = "Source: FAA Aircraft registry",
    x = "Month",
    y = "Airline Carrier",
    fill = "Mean Distance"
  )

Essay:

I created a heat map that compares the distances each airline carrier flies by month. There are only eight airlines because I narrowed the data to airlines operating out of JFK airport, which made it easier to focus on the patterns without too many categories. The heat map shows that flight distance does not vary significantly from month to month; however, the airlines are consistently different from one another in their flight distances. I would like to highlight that Hawaiian Airlines flies the greatest distances by far, while Endeavor Air is one of the airlines with the shortest flights. This makes sense because a flight from New York to Hawaii must cover a great distance just to reach the islands. I began the visualization by comparing departure delay with distance, but that comparison did not show a strong relationship, so I decided to take a different direction.