NYC Flights HW

Author

Nandini Srinivasan

NYC Flights HW

library(tidyverse)
library(nycflights13)
library(RColorBrewer)
data(flights)

Remove observations with NA values from the following variables: sched_dep_time, dep_delay, and dest.

# Keep only flights that have a scheduled departure time, a departure delay,
# and a destination recorded.
flights_nona <- drop_na(flights, sched_dep_time, dep_delay, dest)

Group by destination and find the number of flights to each destination.

# One row per destination; `count` is the number of flights to it.
dest_nona <- count(flights_nona, dest, name = "count")

Find the top 10 destinations.

# Print the ten destinations with the most flights, busiest first.
dest_nona |>
  arrange(desc(count)) |>
  head(n = 10)
# A tibble: 10 × 2
   dest  count
   <chr> <int>
 1 ATL   16898
 2 ORD   16642
 3 LAX   16076
 4 BOS   15049
 5 MCO   13982
 6 CLT   13698
 7 SFO   13230
 8 FLL   11934
 9 MIA   11633
10 DCA    9157

Filter flights_nona to include only the top 10 destinations.

# Airport codes of the ten busiest destinations.
top10_dests <- c(
  "ATL", "ORD", "LAX", "BOS", "MCO",
  "CLT", "SFO", "FLL", "MIA", "DCA"
)

# Keep only flights to those destinations. `%in%` tests membership in the set
# in one step and never returns NA.
top10_nona <- flights_nona |>
  filter(dest %in% top10_dests)

Create a bar graph of departure delay by scheduled departure time for the top 10 destinations.

# Dodged bars of departure delay against scheduled departure time, filled by
# destination. geom_col() takes bar heights straight from `y`, which is what
# geom_bar(stat = "identity") does.
plot1 <- top10_nona |>
  ggplot(aes(x = sched_dep_time, y = dep_delay, fill = dest)) +
  geom_col(position = "dodge") +
  labs(
    fill = "Destination",
    # Label the x axis so it does not show the raw column name.
    x = "Scheduled Departure Time",
    y = "Delay of Departure",
    title = "NYC Departure Delays by Scheduled Departure Time",
    caption = "Source: NYCFlights13 Dataset"
  )
plot1

This graph is hard to read since it is too cluttered. I redid the bar graph to include only the top 3 destinations: ATL, ORD, and LAX.

# Keep flights to the three busiest destinations and drop extreme delays.
# The destination test and the delay cap are separate filter() arguments
# (combined with AND) so the cap applies to every destination. Written as one
# `a | b | c & d` expression, `&` binds tighter than `|` and the cap would
# only restrict the last destination.
top3_nona <- flights_nona |>
  filter(
    dest %in% c("ATL", "ORD", "LAX"),
    dep_delay <= 700
  )

# Dodged bars of departure delay against scheduled departure time for the top
# 3 destinations, filled by destination.
plot4 <- top3_nona |>
  ggplot(aes(x = sched_dep_time, y = dep_delay, fill = dest)) +
  geom_col(position = "dodge") +
  labs(
    fill = "Destination",
    x = "Scheduled Departure Time",
    y = "Delay of Departure",
    title = "Delays to Top 3 Destinations from NYC by Scheduled Departure Time",
    caption = "Source: NYCFlights13 Dataset"
  )

# Zoom both axes in a single coord_cartesian() call. A plot has only one
# coordinate system, so adding a second coord_cartesian() would replace the
# first and discard its xlim.
plot4 + coord_cartesian(xlim = c(500, 2250), ylim = c(0, 650))

This visualization shows the departure delays of flights from NYC to the top 3 destinations, plotted against their scheduled departure times. In general, midday and afternoon flights appear to be more delayed upon departure. This might be because there are more incoming flights as the day progresses, so planes might have to wait longer before being cleared for takeoff. There are probably fewer delays in the evening since there are fewer flights then. It is hard to tell whether delays differ significantly by time of departure among the different destinations; they all generally seem to follow the pattern described above.

The graph above is clearer than the first one, but still cluttered since there is a lot of data. If I had more time I would have liked to group each set of flights by 30-minute or 60-minute periods, or take the mean delay to each destination within these time periods. This would have made the visualization easier to interpret.

Other visualizations/data manipulation I attempted but was not successful with:

I started a heatmap to look at all flights and their departure times but realized that it would not work with the data I had intended to use.

# Abandoned attempt (kept commented out): base-R heatmap of all flights.
# NOTE(review): heatmap() below is given the data frame `flights_nona`, not
# the `flights_nona_matrix` built on the line before it.
# NOTE(review): sched_dep_time is used for row.names; row names must be
# unique — confirm the column has no repeated values before reviving this.
#flights_nona <- flights_nona[order(flights_nona$dest),]
#row.names(flights_nona) <- flights_nona$sched_dep_time
#flights_nona_matrix <- data.matrix(flights_nona)
#flights_nona_heatmap <- heatmap(flights_nona, 
                       #Rowv=NA, 
                       #Colv=NA, 
                       #col = cm.colors(20), 
                       #scale="column", 
                       #margins=c(5,10),
                       #xlab = "Scheduled Time of Departure",
                       #ylab = "Destination",
                       #main = "NYC Flight Delays by Time of Departure in 2013")

I was also unable to create a heatmap using the top 10 destinations for the same reason.

# Abandoned attempt (kept commented out): base-R heatmap of the top 10
# destinations.
# NOTE(review): `top10_nona[6,14]` indexes row 6, column 14 (a single cell).
# If columns 6 and 14 were intended, that would be `top10_nona[, c(6, 14)]`.
#top10_nona <- top10_nona[order(top10_nona$sched_dep_time),]
#top10_nona <- top10_nona[6,14]
#top10_nona_matrix <- data.matrix(top10_nona)
#top10_nona_heatmap <- heatmap(top10_nona_matrix, 
                       #Rowv=NA, 
                       #Colv=NA, 
                      #col = cm.colors(20), 
                       #scale="column", 
                       #margins=c(5,10),
                       #xlab = "Destination",
                       #ylab = "Scheduled Time of Departure",
                       #main = "NYC Flight Delays by Time of Departure in 2013")

I made a treemap but it is confusing to me, even though I was the one who made it. It would probably not make sense to anyone else.

library(treemap)

# Treemap of the top 10 destinations: tiles sized by scheduled departure time
# and coloured by departure delay.
top10_nona |>
  treemap(
    index = "dest",
    vSize = "sched_dep_time",
    vColor = "dep_delay",
    type = "manual",
    palette = "RdYlBu"
  )

# Same treemap with the roles swapped: tiles sized by departure delay and
# coloured by scheduled departure time.
top10_nona |>
  treemap(
    index = "dest",
    vSize = "dep_delay",
    vColor = "sched_dep_time",
    type = "manual",
    palette = "RdYlBu"
  )

I tried to make a streamgraph but couldn’t get it to display properly.

# Install streamgraph from GitHub only when it is not already available, so
# rendering this document does not contact GitHub on every run.
if (!requireNamespace("streamgraph", quietly = TRUE)) {
  devtools::install_github("hrbrmstr/streamgraph")
}
library(streamgraph)
# Streamgraph of total departure delay per scheduled departure time, one
# stream per destination.
# - Group by time and destination only. Grouping by dep_delay as well would
#   collapse rows only where the delay value itself repeats.
# - Plot the aggregated total rather than the raw dep_delay column.
# - sched_dep_time is an integer clock time (e.g. 540), not a date, so the
#   x axis uses scale = "continuous" instead of the default date scale.
# NOTE(review): totals can be negative when early departures dominate a time
# slot — confirm the stream layout copes with negative values.
plot2 <- top10_nona |>
  group_by(sched_dep_time, dest) |>
  summarise(total_delay = sum(dep_delay), .groups = "drop") |>
  streamgraph("dest", "total_delay", "sched_dep_time", scale = "continuous")

plot2
Warning in widget_html(name, package, id = x$id, style = css(width =
validateCssUnit(sizeInfo$width), : streamgraph_html returned an object of class
`list` instead of a `shiny.tag`.
Warning: `bindFillRole()` only works on htmltools::tag() objects (e.g., div(),
p(), etc.), not objects of type 'list'.

I also wanted to plot an alluvial diagram, but I was not able to do so.

library(alluvial)
library(ggalluvial)

Check if the data is in alluvia form

# Convert the tibble to a plain data frame and preview its first six rows.
top10_nona_all <- top10_nona |>
  as.data.frame()
head(top10_nona_all, n = 6)
  year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
1 2013     1   1      542            540         2      923            850
2 2013     1   1      554            600        -6      812            837
3 2013     1   1      554            558        -4      740            728
4 2013     1   1      555            600        -5      913            854
5 2013     1   1      557            600        -3      838            846
6 2013     1   1      558            600        -2      753            745
  arr_delay carrier flight tailnum origin dest air_time distance hour minute
1        33      AA   1141  N619AA    JFK  MIA      160     1089    5     40
2       -25      DL    461  N668DN    LGA  ATL      116      762    6      0
3        12      UA   1696  N39463    EWR  ORD      150      719    5     58
4        19      B6    507  N516JB    EWR  FLL      158     1065    6      0
5        -8      B6     79  N593JB    JFK  MCO      140      944    6      0
6         8      AA    301  N3ALAA    LGA  ORD      138      733    6      0
            time_hour
1 2013-01-01 05:00:00
2 2013-01-01 06:00:00
3 2013-01-01 05:00:00
4 2013-01-01 06:00:00
5 2013-01-01 06:00:00
6 2013-01-01 06:00:00
# Check whether the data frame is in alluvia (wide) form, using dep_delay as
# the weight column.
top10_nona_all |>
  is_alluvia_form(weight = "dep_delay")
Missing alluvia for some stratum combinations.
[1] TRUE

Convert the data to lodes form

# Reshape the data frame into lodes (long) form.
# NOTE(review): in to_lodes_form(), `key`, `value`, and `id` name the NEW
# axis, stratum, and alluvium columns to create; here they are set to the
# names of existing columns and no axes are selected — confirm this is what
# was intended.
top10_nona_lodes <- to_lodes_form(
  top10_nona_all,
  key = "sched_dep_time",
  value = "dep_delay",
  id = "dest"
)

Plot the alluvial diagram

# Alluvial plot of the top 10 destinations, one alluvium per destination,
# filled by destination.
# NOTE(review): the aesthetics passed to geom_alluvium() swap x and y relative
# to those in ggplot(), and the layer's aesthetics take precedence. So x is
# dep_delay, while the x limits (540-2255) look like scheduled departure
# times — confirm which mapping is intended.
ggalluv <- top10_nona |>
  ggplot(aes(x = sched_dep_time, y = dep_delay, alluvium = dest)) +
  theme_bw() +
  geom_alluvium(
    aes(x = dep_delay, y = sched_dep_time, fill = dest),
    color = "white",
    width = 0.1,
    alpha = 0.8,
    decreasing = FALSE
  ) +
  scale_fill_brewer(palette = "Spectral") +
  # Spell out `limits` in full rather than relying on partial argument
  # matching of an abbreviated name.
  scale_x_continuous(limits = c(540, 2255)) +
  labs(
    title = "Flight Delays by Time of Departure\n (Top 10 Destinations from NYC Airports)",
    y = "Scheduled Departure Time",
    fill = "Destination",
    caption = "Source: NYCFlights13 Dataset"
  )

# ggalluv

(Tried this earlier when exploring data but did not use this dataframe)

Create a dataframe with summary statistics. I want to look at whether scheduled departure time, destination, and the mean delay of departure might be related.

# timedelays <- top10_nona |> select(sched_dep_time, dest, dep_delay) |> group_by(sched_dep_time) |> summarise (meandelay = mean(dep_delay))