Load the libraries and view the “flights” dataset

library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5     v purrr   0.3.4
## v tibble  3.1.4     v dplyr   1.0.7
## v tidyr   1.1.3     v stringr 1.4.0
## v readr   2.0.1     v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(nycflights13)
library(RColorBrewer)
library(tidyr)
library(treemap)
# Copy the flights table shipped with nycflights13 into the workspace,
# then print a preview of it.
flights <- nycflights13::flights
flights
## # A tibble: 336,776 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1      517            515         2      830            819
##  2  2013     1     1      533            529         4      850            830
##  3  2013     1     1      542            540         2      923            850
##  4  2013     1     1      544            545        -1     1004           1022
##  5  2013     1     1      554            600        -6      812            837
##  6  2013     1     1      554            558        -4      740            728
##  7  2013     1     1      555            600        -5      913            854
##  8  2013     1     1      557            600        -3      709            723
##  9  2013     1     1      557            600        -3      838            846
## 10  2013     1     1      558            600        -2      753            745
## # ... with 336,766 more rows, and 11 more variables: arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>

Remove observations with NA values

# Drop every flight whose distance or arrival delay is missing
# (the row count goes from 336,776 to 327,346).
flights_nona <- filter(flights, !(is.na(distance) | is.na(arr_delay)))

Grouping data by carrier

# Per-carrier summary: number of flights plus mean air time, mean departure
# delay and mean distance, ordered from the carrier with the most flights down.
flight2 <- flights_nona %>%
  group_by(carrier) %>%
  summarise(
    count = n(),
    # na.rm = TRUE: the NA filter on flights_nona only checks distance and
    # arr_delay, so guard the means explicitly instead of relying on the
    # other columns happening to be complete.
    airtime = mean(air_time, na.rm = TRUE),
    depdelay = mean(dep_delay, na.rm = TRUE),
    dist = mean(distance, na.rm = TRUE)
  ) %>%
  arrange(desc(count))
flight2
## # A tibble: 16 x 5
##    carrier count airtime depdelay  dist
##    <chr>   <int>   <dbl>    <dbl> <dbl>
##  1 UA      57782   212.     12.0  1531.
##  2 B6      54049   151.     13.0  1070.
##  3 EV      51108    90.1    19.8   563.
##  4 DL      47658   174.      9.22 1238.
##  5 AA      31947   189.      8.57 1343.
##  6 MQ      25037    91.2    10.4   570.
##  7 US      19831    88.6     3.74  561.
##  8 9E      17294    86.8    16.4   530.
##  9 WN      12044   148.     17.7   997.
## 10 VX       5116   337.     12.8  2499.
## 11 FL       3175   101.     18.6   665.
## 12 AS        709   326.      5.83 2402 
## 13 F9        681   230.     20.2  1620 
## 14 YV        544    65.7    18.9   376.
## 15 HA        342   623.      4.90 4983 
## 16 OO         29    83.5    12.6   509.

Flipping our data

# Re-order the carriers by ascending flight count so the heatmap displays
# them in descending order of flights.
toptobottom <- flight2 %>%
  head(100) %>%        # keep at most the first 100 carriers
  arrange(count) %>%
  as.data.frame()      # row names on a tibble are deprecated; use a data.frame
rownames(toptobottom) <- toptobottom$carrier   # name the rows after the carriers

Create a matrix from the dataframe

# Name the rows after the carriers. Setting row names on a tibble is
# deprecated, so make sure we are working with a plain data.frame first
# (as.data.frame() is a no-op if it already is one).
toptobottom <- as.data.frame(toptobottom)
rownames(toptobottom) <- toptobottom$carrier

Put our data into a matrix

# data.matrix() turns every column into numbers (the character carrier column
# becomes integer codes), so keep only the four summary columns for plotting.
flight2_matrix <- data.matrix(toptobottom)
flight2_matrix2 <- flight2_matrix[, c("count", "airtime", "depdelay", "dist")]

Create a color vector (save it for our legend later)

# One color per carrier, named by carrier; used for RowSideColors.
n_carriers <- nrow(flight2_matrix2)
# brewer.pal() only provides 3 to 9 colors for "BuPu": clamp the request to
# that range and let colorRampPalette() interpolate to one color per carrier.
varcols <- setNames(
  colorRampPalette(brewer.pal(max(3, min(n_carriers, 9)), "BuPu"))(n_carriers),
  rownames(flight2_matrix2)
)

It is time to create the heatmap now

# Heatmap of the per-carrier summary: one row per carrier, one column per
# measure, with each column scaled on its own (scale = "column").
n_carriers <- nrow(flight2_matrix2)
# brewer.pal() only provides 3 to 9 colors for "BuPu": clamp the request to
# that range and let colorRampPalette() interpolate to one color per carrier.
heat_cols <- colorRampPalette(
  brewer.pal(max(3, min(n_carriers, 9)), "BuPu")
)(n_carriers)
flight2_heatmap <- heatmap(
  flight2_matrix2,
  Rowv = NA, Colv = NA,        # keep the given row/column order, no dendrograms
  col = heat_cols,
  verbose = TRUE,              # prints the plot layout shown below
  scale = "column", margins = c(8, 5),
  xlab = "Flight Characteristics", ylab = "Name of Airline",
  main = "Exploration of each Airline",
  labCol = c("Flights", "AirTime", "Delay", "Distance"),
  cexCol = 1, cexRow = 1,
  RowSideColors = varcols      # color strip identifying each carrier
)
## layout: widths =  0.05 0.2 4 , heights =  0.25 4 ; lmat=
##      [,1] [,2] [,3]
## [1,]    0    0    4
## [2,]    3    1    2

I picked four variables to explore for each airline — the number of flights, average air time, average departure delay, and average distance — and visualized them with a heatmap. I wanted to know which airlines have the highest number of flights, and I chose air time, delay, and distance to see how they relate to one another. The heatmap shows that the airlines flying the longest distances seem to have the smallest delays. I assume there is a chance that longer flights can make up time for their delays.