library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.4 v dplyr 1.0.7
## v tidyr 1.1.3 v stringr 1.4.0
## v readr 2.0.1 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(nycflights13)
library(RColorBrewer)
library(tidyr)
library(treemap)
# Copy the packaged `flights` dataset into the workspace under the same name,
# then show it to preview its columns.
flights <- nycflights13::flights
flights
## # A tibble: 336,776 x 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 517 515 2 830 819
## 2 2013 1 1 533 529 4 850 830
## 3 2013 1 1 542 540 2 923 850
## 4 2013 1 1 544 545 -1 1004 1022
## 5 2013 1 1 554 600 -6 812 837
## 6 2013 1 1 554 558 -4 740 728
## 7 2013 1 1 555 600 -5 913 854
## 8 2013 1 1 557 600 -3 709 723
## 9 2013 1 1 557 600 -3 838 846
## 10 2013 1 1 558 600 -2 753 745
## # ... with 336,766 more rows, and 11 more variables: arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
# Keep only the flights with a recorded distance and arrival delay.
# Dropping the incomplete rows takes the table from 336,776 to 327,346 rows.
flights_nona <- drop_na(flights, distance, arr_delay)
# Per-carrier summary: number of flights plus the mean air time, mean
# departure delay and mean distance, listed from most to fewest flights.
flight2 <- flights_nona %>%
  group_by(carrier) %>%
  summarise(
    count = n(),
    airtime = mean(air_time),
    depdelay = mean(dep_delay),
    dist = mean(distance)
  ) %>%
  arrange(desc(count))
flight2
## # A tibble: 16 x 5
## carrier count airtime depdelay dist
## <chr> <int> <dbl> <dbl> <dbl>
## 1 UA 57782 212. 12.0 1531.
## 2 B6 54049 151. 13.0 1070.
## 3 EV 51108 90.1 19.8 563.
## 4 DL 47658 174. 9.22 1238.
## 5 AA 31947 189. 8.57 1343.
## 6 MQ 25037 91.2 10.4 570.
## 7 US 19831 88.6 3.74 561.
## 8 9E 17294 86.8 16.4 530.
## 9 WN 12044 148. 17.7 997.
## 10 VX 5116 337. 12.8 2499.
## 11 FL 3175 101. 18.6 665.
## 12 AS 709 326. 5.83 2402
## 13 F9 681 230. 20.2 1620
## 14 YV 544 65.7 18.9 376.
## 15 HA 342 623. 4.90 4983
## 16 OO 29 83.5 12.6 509.
# Sort ascending by flight count so the heatmap displays the carriers in
# descending order of flights.
toptobottom <- flight2 %>%
  head(100) %>%
  arrange(count)

# Convert the summary to a numeric matrix and label its rows with the carrier
# codes. The names are set on the matrix, not on the tibble: setting row names
# on a tibble is deprecated (and was previously done twice).
flight2_matrix <- data.matrix(toptobottom)
rownames(flight2_matrix) <- toptobottom$carrier

# Keep only the four numeric summary columns, selected by name rather than by
# position so a change in column order cannot silently pick the wrong ones.
flight2_matrix2 <- flight2_matrix[, c("count", "airtime", "depdelay", "dist")]

# One colour per carrier for the RowSideColors strip of the heatmap.
# The "BuPu" palette has at most 9 colours, so request exactly 9 and let
# colorRampPalette() interpolate up to the number of carriers; asking
# brewer.pal() for more than 9 only triggers an "n too large" warning.
varcols <- setNames(
  colorRampPalette(brewer.pal(9, "BuPu"))(nrow(flight2_matrix2)),
  rownames(flight2_matrix2)
)
# Heatmap of the per-carrier summaries: one row per airline, one column per
# metric. Rowv = NA / Colv = NA keep the rows and columns in their given order
# (no dendrograms), and scale = "column" standardises each metric separately so
# that columns measured in different units can share one colour scale.
#
# Notes on the arguments:
# - heatmap() has no `s` or `v` arguments. The former `s = 0.6, v = 1` were
#   picked up through partial argument matching as `symm` and `verbose`; the
#   latter only caused the layout matrix to be printed, so both are removed.
# - "BuPu" offers at most 9 colours, so 9 are requested and colorRampPalette()
#   interpolates to one shade per carrier, avoiding the "n too large" warning.
# - The last column holds the mean distance, so it is labelled "Distance".
flight2_heatmap <- heatmap(
  flight2_matrix2,
  Rowv = NA,
  Colv = NA,
  col = colorRampPalette(brewer.pal(9, "BuPu"))(nrow(flight2_matrix2)),
  scale = "column",
  margins = c(8, 5),
  xlab = "Flight Characteristics",
  ylab = "Name of Airline",
  main = "Exploration of each Airline",
  labCol = c("Flights", "AirTime", "Delay", "Distance"),
  cexCol = 1,
  cexRow = 1,
  RowSideColors = varcols
)
I picked four variables to visualize for each airline using a heatmap: the number of flights, air time, delay, and distance. I wanted to know which airlines have the highest number of flights. I also chose air time, delay, and distance to see how they relate to one another. The heatmap shows that the airlines with the longest distances tend to have the smallest delays. I assume there is a chance that longer flights are able to make up time for their delays.