NYCFlights HW

Author

Nathaniel Nguyen

library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.4.4     ✔ tibble    3.2.1
✔ lubridate 1.9.3     ✔ tidyr     1.3.1
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(nycflights13)
library(ggplot2)
# Print the structure of `flights`: its dimensions plus each column's name,
# type, and first few values.
str(flights)
tibble [336,776 × 19] (S3: tbl_df/tbl/data.frame)
 $ year          : int [1:336776] 2013 2013 2013 2013 2013 2013 2013 2013 2013 2013 ...
 $ month         : int [1:336776] 1 1 1 1 1 1 1 1 1 1 ...
 $ day           : int [1:336776] 1 1 1 1 1 1 1 1 1 1 ...
 $ dep_time      : int [1:336776] 517 533 542 544 554 554 555 557 557 558 ...
 $ sched_dep_time: int [1:336776] 515 529 540 545 600 558 600 600 600 600 ...
 $ dep_delay     : num [1:336776] 2 4 2 -1 -6 -4 -5 -3 -3 -2 ...
 $ arr_time      : int [1:336776] 830 850 923 1004 812 740 913 709 838 753 ...
 $ sched_arr_time: int [1:336776] 819 830 850 1022 837 728 854 723 846 745 ...
 $ arr_delay     : num [1:336776] 11 20 33 -18 -25 12 19 -14 -8 8 ...
 $ carrier       : chr [1:336776] "UA" "UA" "AA" "B6" ...
 $ flight        : int [1:336776] 1545 1714 1141 725 461 1696 507 5708 79 301 ...
 $ tailnum       : chr [1:336776] "N14228" "N24211" "N619AA" "N804JB" ...
 $ origin        : chr [1:336776] "EWR" "LGA" "JFK" "JFK" ...
 $ dest          : chr [1:336776] "IAH" "IAH" "MIA" "BQN" ...
 $ air_time      : num [1:336776] 227 227 160 183 116 150 158 53 140 138 ...
 $ distance      : num [1:336776] 1400 1416 1089 1576 762 ...
 $ hour          : num [1:336776] 5 5 5 5 6 5 6 6 6 6 ...
 $ minute        : num [1:336776] 15 29 40 45 0 58 0 0 0 0 ...
 $ time_hour     : POSIXct[1:336776], format: "2013-01-01 05:00:00" "2013-01-01 05:00:00" ...
# Preview the first six rows of `flights`.
head(flights)
# A tibble: 6 × 19
   year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
  <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
1  2013     1     1      517            515         2      830            819
2  2013     1     1      533            529         4      850            830
3  2013     1     1      542            540         2      923            850
4  2013     1     1      544            545        -1     1004           1022
5  2013     1     1      554            600        -6      812            837
6  2013     1     1      554            558        -4      740            728
# ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
#   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
#   hour <dbl>, minute <dbl>, time_hour <dttm>
# Keep only American Airlines ("AA") flights from 2013 that departed at least
# one minute late. Comma-separated conditions in filter() are combined with
# AND, and rows where any condition evaluates to NA are dropped.
AA_Delays_In2013 <- flights |>
  filter(
    carrier == "AA",
    year == 2013,
    dep_delay >= 1
  )
AA_Delays_In2013
# A tibble: 10,162 × 19
    year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
   <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
 1  2013     1     1      542            540         2      923            850
 2  2013     1     1      623            610        13      920            915
 3  2013     1     1      743            730        13     1107           1100
 4  2013     1     1      826            715        71     1136           1045
 5  2013     1     1      909            810        59     1331           1315
 6  2013     1     1      912            900        12     1241           1220
 7  2013     1     1     1025           1020         5     1356           1330
 8  2013     1     1     1037           1030         7     1221           1210
 9  2013     1     1     1130           1125         5     1301           1305
10  2013     1     1     1252           1245         7     1624           1550
# ℹ 10,152 more rows
# ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
#   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
#   hour <dbl>, minute <dbl>, time_hour <dttm>
# Convert the numeric month (1-12) into its English name. `month.name` is
# base R's built-in vector of month names, so no label has to be typed by
# hand (the previous hand-typed labels misspelled "February" and "December").
# Storing the result as a factor whose levels are in calendar order makes
# plots and summaries list the months chronologically rather than
# alphabetically.
AA_Delays_In2013$month <- factor(
  month.name[AA_Delays_In2013$month],
  levels = month.name
)
# Bar chart of how many delayed American Airlines departures occurred in each
# month. geom_bar() counts the rows per month itself, so no y aesthetic is
# needed. The earlier geom_histogram(stat = "identity") call overplotted one
# bar per flight at its dep_delay value instead of showing a frequency, and
# triggered an "Ignoring unknown parameters" warning.
Delay_Histogram <- AA_Delays_In2013 |>
  ggplot() +
  geom_bar(
    mapping = aes(x = month, fill = month),
    # A constant outline colour belongs outside aes(); inside aes() it is
    # treated as a data value and adds a spurious "colour" legend entry.
    color = "red"
  ) +
  labs(
    x = "Month",
    fill = "Month",
    y = "Frequency of Delays",
    title = "Histogram of American Airlines Delays",
    caption = "All data comes from the NYCFlights dataset"
  )
Delay_Histogram

My data visualization is a histogram that highlights the frequency of delays and the months in which they occur. All data comes from the flights dataset in the nycflights13 package. I used filter() to keep only the American Airlines flights in 2013 that had a departure delay; the dep_delay, year, and carrier columns made this possible. I then used ggplot to create a histogram that displays the months and the number of flights per month that had delays. I also included a legend with labels to show clearly which color goes with which month. I used the labs() function to label the two axes and to set the visualization’s title and caption. Finally, I used fill to color in the bars and color to create the colored outlines around the bars.