library(nycflights23)Warning: package 'nycflights23' was built under R version 4.4.2
data(flights)library(nycflights23)Warning: package 'nycflights23' was built under R version 4.4.2
data(flights)library(tidyverse)Warning: package 'ggplot2' was built under R version 4.4.2
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr 1.1.4 ✔ readr 2.1.5
✔ forcats 1.0.0 ✔ stringr 1.5.1
✔ ggplot2 3.5.1 ✔ tibble 3.2.1
✔ lubridate 1.9.3 ✔ tidyr 1.3.1
✔ purrr 1.0.2
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag() masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Preview the first rows of the flights data; the trailing comment on the
# call is the header of the printed tibble (6 rows, 19 columns).
head(flights)# A tibble: 6 × 19
year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
<int> <int> <int> <int> <int> <dbl> <int> <int>
1 2023 1 1 1 2038 203 328 3
2 2023 1 1 18 2300 78 228 135
3 2023 1 1 31 2344 47 500 426
4 2023 1 1 33 2140 173 238 2352
5 2023 1 1 36 2048 228 223 2252
6 2023 1 1 503 500 3 808 815
# ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
# tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
# hour <dbl>, minute <dbl>, time_hour <dttm>
library(tidyverse)
library(nycflights23)
library(ggplot2)

# View() opens a GUI data viewer, so only call it in an interactive session;
# when the script is rendered or run in batch there is nothing to show.
if (interactive()) {
  View(flights)
}

## Display structure of the dataset
str(flights)
#> tibble [435,352 × 19] (S3: tbl_df/tbl/data.frame)
$ year : int [1:435352] 2023 2023 2023 2023 2023 2023 2023 2023 2023 2023 ...
$ month : int [1:435352] 1 1 1 1 1 1 1 1 1 1 ...
$ day : int [1:435352] 1 1 1 1 1 1 1 1 1 1 ...
$ dep_time : int [1:435352] 1 18 31 33 36 503 520 524 537 547 ...
$ sched_dep_time: int [1:435352] 2038 2300 2344 2140 2048 500 510 530 520 545 ...
$ dep_delay : num [1:435352] 203 78 47 173 228 3 10 -6 17 2 ...
$ arr_time : int [1:435352] 328 228 500 238 223 808 948 645 926 845 ...
$ sched_arr_time: int [1:435352] 3 135 426 2352 2252 815 949 710 818 852 ...
$ arr_delay : num [1:435352] 205 53 34 166 211 -7 -1 -25 68 -7 ...
$ carrier : chr [1:435352] "UA" "DL" "B6" "B6" ...
$ flight : int [1:435352] 628 393 371 1053 219 499 996 981 206 225 ...
$ tailnum : chr [1:435352] "N25201" "N830DN" "N807JB" "N265JB" ...
$ origin : chr [1:435352] "EWR" "JFK" "JFK" "JFK" ...
$ dest : chr [1:435352] "SMF" "ATL" "BQN" "CHS" ...
$ air_time : num [1:435352] 367 108 190 108 80 154 192 119 258 157 ...
$ distance : num [1:435352] 2500 760 1576 636 488 ...
$ hour : num [1:435352] 20 23 23 21 20 5 5 5 5 5 ...
$ minute : num [1:435352] 38 0 44 40 48 0 10 30 20 45 ...
$ time_hour : POSIXct[1:435352], format: "2023-01-01 20:00:00" "2023-01-01 23:00:00" ...
## Get summary statistics for every column of the flights data
summary(flights)
#> year month day dep_time sched_dep_time
Min. :2023 Min. : 1.000 Min. : 1.00 Min. : 1 Min. : 500
1st Qu.:2023 1st Qu.: 3.000 1st Qu.: 8.00 1st Qu.: 931 1st Qu.: 930
Median :2023 Median : 6.000 Median :16.00 Median :1357 Median :1359
Mean :2023 Mean : 6.423 Mean :15.74 Mean :1366 Mean :1364
3rd Qu.:2023 3rd Qu.: 9.000 3rd Qu.:23.00 3rd Qu.:1804 3rd Qu.:1759
Max. :2023 Max. :12.000 Max. :31.00 Max. :2400 Max. :2359
NA's :10738
dep_delay arr_time sched_arr_time arr_delay
Min. : -50.00 Min. : 1 Min. : 1 Min. : -97.000
1st Qu.: -6.00 1st Qu.:1105 1st Qu.:1135 1st Qu.: -22.000
Median : -2.00 Median :1519 Median :1551 Median : -10.000
Mean : 13.84 Mean :1497 Mean :1552 Mean : 4.345
3rd Qu.: 10.00 3rd Qu.:1946 3rd Qu.:2007 3rd Qu.: 9.000
Max. :1813.00 Max. :2400 Max. :2359 Max. :1812.000
NA's :10738 NA's :11453 NA's :12534
carrier flight tailnum origin
Length:435352 Min. : 1.0 Length:435352 Length:435352
Class :character 1st Qu.: 364.0 Class :character Class :character
Mode :character Median : 734.0 Mode :character Mode :character
Mean : 785.2
3rd Qu.:1188.0
Max. :1972.0
dest air_time distance hour
Length:435352 Min. : 18.0 Min. : 80.0 Min. : 5.00
Class :character 1st Qu.: 77.0 1st Qu.: 479.0 1st Qu.: 9.00
Mode :character Median :121.0 Median : 762.0 Median :13.00
Mean :141.8 Mean : 977.5 Mean :13.35
3rd Qu.:177.0 3rd Qu.:1182.0 3rd Qu.:17.00
Max. :701.0 Max. :4983.0 Max. :23.00
NA's :12534
minute time_hour
Min. : 0.00 Min. :2023-01-01 05:00:00.00
1st Qu.:10.00 1st Qu.:2023-03-30 20:00:00.00
Median :29.00 Median :2023-06-27 08:00:00.00
Mean :28.53 Mean :2023-06-29 10:02:22.39
3rd Qu.:45.00 3rd Qu.:2023-09-27 11:00:00.00
Max. :59.00 Max. :2023-12-31 23:00:00.00
## Check for NA values in the departure and arrival delay columns
# Count the rows with a missing departure delay.
sum(is.na(flights$dep_delay))
#> [1] 10738
# Count the rows with a missing arrival delay.
sum(is.na(flights$arr_delay))
#> [1] 12534
## Mean departure delay per airline carrier
# Missing delays are dropped (na.rm = TRUE) so that a single NA does not turn
# a carrier's mean into NA.
avg_dep_delay <- summarise(
  group_by(flights, carrier),
  mean_dep_delay = mean(dep_delay, na.rm = TRUE)
)
## Create a lookup table for carrier names
# Named character vector mapping carrier codes to full airline names.
# Names are the two-character codes found in flights$carrier; values are the
# display names. Codes with no entry here are left unchanged by recode(), so
# the table includes G4 and NK, which appear in the 2023 data. Legacy codes
# (EV, FL, US, VX) are kept for backward compatibility; unused entries are
# harmless.
carrier_names <- c(
  "9E" = "Endeavor Air",
  "AA" = "American Airlines",
  "AS" = "Alaska Airlines",
  "B6" = "JetBlue Airways",
  "DL" = "Delta Air Lines",
  "EV" = "ExpressJet Airlines",
  "F9" = "Frontier Airlines",
  "FL" = "AirTran Airways",
  "G4" = "Allegiant Air",
  "HA" = "Hawaiian Airlines",
  "MQ" = "Envoy Air",
  "NK" = "Spirit Airlines",
  "OO" = "SkyWest Airlines",
  "UA" = "United Airlines",
  "US" = "US Airways",
  "VX" = "Virgin America",
  "WN" = "Southwest Airlines",
  "YV" = "Mesa Airlines"
)
## Rename the carrier column using mutate and recode
# Add a carrier_full column holding the airline's full name. recode() looks
# each carrier code up in carrier_names (spliced in with !!!); codes without
# an entry keep their original value.
flights_renamed <- flights %>%
  mutate(carrier_full = recode(carrier, !!!carrier_names))

# Spot-check the mapping on the first rows.
head(flights_renamed %>% select(carrier, carrier_full))
#> # A tibble: 6 × 2
carrier carrier_full
<chr> <chr>
1 UA United Airlines
2 DL Delta Air Lines
3 B6 JetBlue Airways
4 B6 JetBlue Airways
5 UA United Airlines
6 AA American Airlines
## Create a bar chart of Average Departure Delay by Airline Carrier
# Carriers are ordered on the x-axis from the highest to the lowest mean
# delay (reorder() on the negated mean). geom_col() draws bars whose heights
# are the values already present in the data.
ggplot(
  avg_dep_delay,
  aes(
    x = reorder(carrier, -mean_dep_delay),
    y = mean_dep_delay,
    fill = carrier
  )
) +
  geom_col() +
  scale_fill_viridis_d() +
  labs(
    title = "Average Departure Delay by Airline Carrier",
    x = "Airline Carrier",
    y = "Average Departure Delay (minutes)",
    caption = "Data Source: nycflights23"
  ) +
  theme_minimal() +
  theme(legend.title = element_blank())

## Mean departure delay for every carrier/month combination
# .groups = "drop" returns an ungrouped tibble so later verbs are not
# silently applied per group.
heatmap_data <- flights %>%
  group_by(carrier, month) %>%
  summarise(
    mean_dep_delay = mean(dep_delay, na.rm = TRUE),
    .groups = "drop"
  )
## Create the heatmap
# Heatmap of mean departure delay: one tile per carrier (y) and month (x),
# filled by the mean delay using the viridis "plasma" palette.
# Every layer must be chained with `+`; without it the labels and themes form
# a separate expression and are never attached to the plot.
ggplot(heatmap_data, aes(x = month, y = carrier, fill = mean_dep_delay)) +
  geom_tile() +
  # month is an integer 1-12, so place a tick on each month instead of the
  # default fractional breaks.
  scale_x_continuous(breaks = 1:12) +
  scale_fill_viridis_c(option = "plasma") +
  labs(
    title = "Average Departure Delay by Airline and Month",
    x = "Month",
    y = "Airline Carrier",
    caption = "Data Source: nycflights23"
  ) +
  theme_bw() +
  theme(
    axis.text.x = element_text(angle = 45)
  )
## Bar graph visualization description

This plot visualizes the average departure delay by airline carrier, based on data from the nycflights23 dataset. The x-axis represents the different airline carriers, while the y-axis shows the average departure delay in minutes. Each bar is filled with a different color, chosen from the Viridis color palette, to make the carriers visually distinct. One important aspect to highlight is that the bars are ordered from the highest to the lowest average departure delay, making it easy to compare how each carrier performs. This allows us to quickly see which airlines have the longest delays and which ones have the shortest. The chart’s minimalist design helps focus on the key data points, making the information clear and easy to understand. The absence of a legend title further simplifies the plot, as it’s already clear which carrier corresponds to which bar based on the x-axis labels.
## Heatmap visualization description

This heatmap shows the average departure delay for each airline carrier by month, based on data from the nycflights23 dataset. The x-axis represents the months of the year, while the y-axis shows the different airline carriers. The color of each tile indicates the average departure delay, with brighter (yellow) colors representing longer delays and darker (purple) colors indicating shorter delays. I used the “plasma” option from the Viridis color palette to make the colors easy to interpret. One key aspect to highlight is how the delays vary by both carrier and month. For example, you can see whether certain airlines have more consistent delays throughout the year or whether delays are more significant in certain months. The plot’s clean design (using the theme_bw() function) makes the data easy to read, and the x-axis labels are rotated 45 degrees to prevent overlap and ensure readability. This heatmap allows us to quickly identify patterns in departure delays across different airlines and times of the year.