nycflights

library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.1     ✔ tibble    3.2.1
✔ lubridate 1.9.4     ✔ tidyr     1.3.1
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(nycflights23)
library(RColorBrewer)
summary(flights)
      year          month             day           dep_time     sched_dep_time
 Min.   :2023   Min.   : 1.000   Min.   : 1.00   Min.   :   1    Min.   : 500  
 1st Qu.:2023   1st Qu.: 3.000   1st Qu.: 8.00   1st Qu.: 931    1st Qu.: 930  
 Median :2023   Median : 6.000   Median :16.00   Median :1357    Median :1359  
 Mean   :2023   Mean   : 6.423   Mean   :15.74   Mean   :1366    Mean   :1364  
 3rd Qu.:2023   3rd Qu.: 9.000   3rd Qu.:23.00   3rd Qu.:1804    3rd Qu.:1759  
 Max.   :2023   Max.   :12.000   Max.   :31.00   Max.   :2400    Max.   :2359  
                                                 NA's   :10738                 
   dep_delay          arr_time     sched_arr_time   arr_delay       
 Min.   : -50.00   Min.   :   1    Min.   :   1   Min.   : -97.000  
 1st Qu.:  -6.00   1st Qu.:1105    1st Qu.:1135   1st Qu.: -22.000  
 Median :  -2.00   Median :1519    Median :1551   Median : -10.000  
 Mean   :  13.84   Mean   :1497    Mean   :1552   Mean   :   4.345  
 3rd Qu.:  10.00   3rd Qu.:1946    3rd Qu.:2007   3rd Qu.:   9.000  
 Max.   :1813.00   Max.   :2400    Max.   :2359   Max.   :1812.000  
 NA's   :10738     NA's   :11453                  NA's   :12534     
   carrier              flight         tailnum             origin         
 Length:435352      Min.   :   1.0   Length:435352      Length:435352     
 Class :character   1st Qu.: 364.0   Class :character   Class :character  
 Mode  :character   Median : 734.0   Mode  :character   Mode  :character  
                    Mean   : 785.2                                        
                    3rd Qu.:1188.0                                        
                    Max.   :1972.0                                        
                                                                          
     dest              air_time        distance           hour      
 Length:435352      Min.   : 18.0   Min.   :  80.0   Min.   : 5.00  
 Class :character   1st Qu.: 77.0   1st Qu.: 479.0   1st Qu.: 9.00  
 Mode  :character   Median :121.0   Median : 762.0   Median :13.00  
                    Mean   :141.8   Mean   : 977.5   Mean   :13.35  
                    3rd Qu.:177.0   3rd Qu.:1182.0   3rd Qu.:17.00  
                    Max.   :701.0   Max.   :4983.0   Max.   :23.00  
                    NA's   :12534                                   
     minute        time_hour                     
 Min.   : 0.00   Min.   :2023-01-01 05:00:00.00  
 1st Qu.:10.00   1st Qu.:2023-03-30 20:00:00.00  
 Median :29.00   Median :2023-06-27 08:00:00.00  
 Mean   :28.53   Mean   :2023-06-29 10:02:22.39  
 3rd Qu.:45.00   3rd Qu.:2023-09-27 11:00:00.00  
 Max.   :59.00   Max.   :2023-12-31 23:00:00.00  
                                                 
# Keep only the columns needed for the delay analysis: calendar month and
# day, the departure delay, and the carrier code.
flightdelay <- select(flights, month, day, dep_delay, carrier)
flightdelay
# A tibble: 435,352 × 4
   month   day dep_delay carrier
   <int> <int>     <dbl> <chr>  
 1     1     1       203 UA     
 2     1     1        78 DL     
 3     1     1        47 B6     
 4     1     1       173 B6     
 5     1     1       228 UA     
 6     1     1         3 AA     
 7     1     1        10 B6     
 8     1     1        -6 AA     
 9     1     1        17 UA     
10     1     1         2 NK     
# ℹ 435,342 more rows
# American Airlines ("AA") flights in January, each labelled by whether it
# departed early, on time, or late.
AAdelay <- flightdelay |>
  # Rows with a missing departure delay cannot be categorised, so they are
  # dropped explicitly here instead of relying on a blanket na.omit().
  filter(carrier == "AA", month == 1L, !is.na(dep_delay)) |>
  mutate(
    delay_category = case_when(
      dep_delay < 0 ~ "Early",
      dep_delay == 0 ~ "On time",
      dep_delay > 0 ~ "Delayed"
    )
  )

AAdelay
# A tibble: 3,510 × 5
   month   day dep_delay carrier delay_category
   <int> <int>     <dbl> <chr>   <chr>         
 1     1     1         3 AA      Delayed       
 2     1     1        -6 AA      Early         
 3     1     1        -7 AA      Early         
 4     1     1        -6 AA      Early         
 5     1     1         0 AA      On time       
 6     1     1       -10 AA      Early         
 7     1     1        -3 AA      Early         
 8     1     1        -4 AA      Early         
 9     1     1        -4 AA      Early         
10     1     1        -9 AA      Early         
# ℹ 3,500 more rows
summary(AAdelay)
     month        day          dep_delay         carrier         
 Min.   :1   Min.   : 1.00   Min.   : -19.00   Length:3510       
 1st Qu.:1   1st Qu.: 9.00   1st Qu.:  -7.00   Class :character  
 Median :1   Median :16.00   Median :  -3.00   Mode  :character  
 Mean   :1   Mean   :16.19   Mean   :  14.96                     
 3rd Qu.:1   3rd Qu.:24.00   3rd Qu.:   8.00                     
 Max.   :1   Max.   :31.00   Max.   :1201.00                     
 delay_category    
 Length:3510       
 Class :character  
 Mode  :character  
                   
                   
                   
# Scatter plot of departure delay against day of month for the January
# American Airlines flights, coloured by delay category.
Plot1 <- AAdelay |>
  ggplot(aes(x = day, y = dep_delay, color = delay_category)) +
  geom_point(alpha = 0.5) +
  scale_color_manual(
    values = c("Early" = "blue", "On time" = "green", "Delayed" = "red")
  ) +
  labs(
    title = "NYC American Airlines Departure Delays in January 2023",
    x = "Day",
    y = "Departure Delay (Minutes)",
    # The flight on-time data comes from the Bureau of Transportation
    # Statistics; the FAA aircraft registry is the source of the separate
    # `planes` table.
    caption = "Source: Bureau of Transportation Statistics (nycflights23)",
    color = "Delay Category"
  ) +
  # Fixed y range; ylim() removes any point that falls outside these limits.
  ylim(-100, 1250)
Plot1

My visualization is a scatter plot showing the departure delays of American Airlines flights leaving NYC in January 2023. The blue dots represent flights that departed early, the green dots show on-time flights, and the red dots signify delayed flights. A key observation is that most flights actually departed early or on time (the median departure delay is −3 minutes), while a smaller share were delayed, a few of them by many hours (up to 1,201 minutes). In my plot, the x-axis represents the day of the month, while the y-axis shows the departure delay in minutes, with negative values meaning an early departure. This visualization helps illustrate how delays varied throughout the month and how frequently they occurred. One challenge I faced was making the blue dots stand out more to highlight early departures. I wanted them to be more noticeable, but increasing their size or brightness caused them to overlap with the red dots and clutter the plot. Finding a balance between clarity and appearance was difficult, but overall I feel that my scatter plot effectively represents American Airlines’ departure delays in January.