NYC flights homework

Load the libraries

library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.1     ✔ tibble    3.2.1
✔ lubridate 1.9.4     ✔ tidyr     1.3.1
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(nycflights23)
library(RColorBrewer)

Load the data

# Load the `flights` dataset into the workspace (presumably the copy shipped
# with the nycflights23 package attached above).
data(flights)

View some of the data

# Preview the first rows of the flights table.
flights %>% head()
# A tibble: 6 × 19
   year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
  <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
1  2023     1     1        1           2038       203      328              3
2  2023     1     1       18           2300        78      228            135
3  2023     1     1       31           2344        47      500            426
4  2023     1     1       33           2140       173      238           2352
5  2023     1     1       36           2048       228      223           2252
6  2023     1     1      503            500         3      808            815
# ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
#   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
#   hour <dbl>, minute <dbl>, time_hour <dttm>
# Per-column summary statistics (including NA counts) for the full dataset.
flights %>% summary()
      year          month             day           dep_time     sched_dep_time
 Min.   :2023   Min.   : 1.000   Min.   : 1.00   Min.   :   1    Min.   : 500  
 1st Qu.:2023   1st Qu.: 3.000   1st Qu.: 8.00   1st Qu.: 931    1st Qu.: 930  
 Median :2023   Median : 6.000   Median :16.00   Median :1357    Median :1359  
 Mean   :2023   Mean   : 6.423   Mean   :15.74   Mean   :1366    Mean   :1364  
 3rd Qu.:2023   3rd Qu.: 9.000   3rd Qu.:23.00   3rd Qu.:1804    3rd Qu.:1759  
 Max.   :2023   Max.   :12.000   Max.   :31.00   Max.   :2400    Max.   :2359  
                                                 NA's   :10738                 
   dep_delay          arr_time     sched_arr_time   arr_delay       
 Min.   : -50.00   Min.   :   1    Min.   :   1   Min.   : -97.000  
 1st Qu.:  -6.00   1st Qu.:1105    1st Qu.:1135   1st Qu.: -22.000  
 Median :  -2.00   Median :1519    Median :1551   Median : -10.000  
 Mean   :  13.84   Mean   :1497    Mean   :1552   Mean   :   4.345  
 3rd Qu.:  10.00   3rd Qu.:1946    3rd Qu.:2007   3rd Qu.:   9.000  
 Max.   :1813.00   Max.   :2400    Max.   :2359   Max.   :1812.000  
 NA's   :10738     NA's   :11453                  NA's   :12534     
   carrier              flight         tailnum             origin         
 Length:435352      Min.   :   1.0   Length:435352      Length:435352     
 Class :character   1st Qu.: 364.0   Class :character   Class :character  
 Mode  :character   Median : 734.0   Mode  :character   Mode  :character  
                    Mean   : 785.2                                        
                    3rd Qu.:1188.0                                        
                    Max.   :1972.0                                        
                                                                          
     dest              air_time        distance           hour      
 Length:435352      Min.   : 18.0   Min.   :  80.0   Min.   : 5.00  
 Class :character   1st Qu.: 77.0   1st Qu.: 479.0   1st Qu.: 9.00  
 Mode  :character   Median :121.0   Median : 762.0   Median :13.00  
                    Mean   :141.8   Mean   : 977.5   Mean   :13.35  
                    3rd Qu.:177.0   3rd Qu.:1182.0   3rd Qu.:17.00  
                    Max.   :701.0   Max.   :4983.0   Max.   :23.00  
                    NA's   :12534                                   
     minute        time_hour                     
 Min.   : 0.00   Min.   :2023-01-01 05:00:00.00  
 1st Qu.:10.00   1st Qu.:2023-03-30 20:00:00.00  
 Median :29.00   Median :2023-06-27 08:00:00.00  
 Mean   :28.53   Mean   :2023-06-29 10:02:22.39  
 3rd Qu.:45.00   3rd Qu.:2023-09-27 11:00:00.00  
 Max.   :59.00   Max.   :2023-12-31 23:00:00.00  
                                                 

Carrier names

# Lookup table from two-letter carrier code (as stored in `flights$carrier`)
# to a human-readable airline name.
carrier_names <- c(
  "UA" = "United Airlines",
  "AA" = "American Airlines",
  "DL" = "Delta Airlines",
  "B6" = "Jet Blue Airways",
  "F9" = "Frontier Airlines",  # fixed typo: was "Frontier Arilines"
  "HA" = "Hawaiian Airlines",
  "NK" = "Spirit Airlines",
  "AS" = "Alaska Airlines",
  "WN" = "Southwest Airlines"
)

## Filter the dataset

# Build `flights2`: flights shorter than 1000 (distance units of the dataset)
# operated by five selected carriers, with the carrier code replaced by the
# airline's full name.
#
# The `carrier` column stores two-letter codes, so the filter must match on
# the codes and the renaming must happen afterwards. Filtering on the full
# names first (as the previous version did, with misspelled names as well)
# matches nothing and leaves `flights2` with zero rows.
short_haul_carriers <- c(
  "UA" = "United Airlines",
  "AA" = "American Airlines",
  "DL" = "Delta Airlines",
  "B6" = "Jet Blue",
  "F9" = "Frontier Airlines"
)

flights2 <- flights %>%
  filter(distance < 1000, carrier %in% names(short_haul_carriers)) %>%
  # Recode every row in one step on `flights2` itself; the previous
  # per-carrier assignments wrote the Frontier label into `flights` by mistake.
  mutate(carrier = unname(short_haul_carriers[carrier]))

Summarization

# Per-column summary statistics for the filtered dataset.
flights2 %>% summary()
      year         month          day         dep_time   sched_dep_time
 Min.   : NA   Min.   : NA   Min.   : NA   Min.   : NA   Min.   : NA   
 1st Qu.: NA   1st Qu.: NA   1st Qu.: NA   1st Qu.: NA   1st Qu.: NA   
 Median : NA   Median : NA   Median : NA   Median : NA   Median : NA   
 Mean   :NaN   Mean   :NaN   Mean   :NaN   Mean   :NaN   Mean   :NaN   
 3rd Qu.: NA   3rd Qu.: NA   3rd Qu.: NA   3rd Qu.: NA   3rd Qu.: NA   
 Max.   : NA   Max.   : NA   Max.   : NA   Max.   : NA   Max.   : NA   
   dep_delay      arr_time   sched_arr_time   arr_delay     carrier         
 Min.   : NA   Min.   : NA   Min.   : NA    Min.   : NA   Length:0          
 1st Qu.: NA   1st Qu.: NA   1st Qu.: NA    1st Qu.: NA   Class :character  
 Median : NA   Median : NA   Median : NA    Median : NA   Mode  :character  
 Mean   :NaN   Mean   :NaN   Mean   :NaN    Mean   :NaN                     
 3rd Qu.: NA   3rd Qu.: NA   3rd Qu.: NA    3rd Qu.: NA                     
 Max.   : NA   Max.   : NA   Max.   : NA    Max.   : NA                     
     flight      tailnum             origin              dest          
 Min.   : NA   Length:0           Length:0           Length:0          
 1st Qu.: NA   Class :character   Class :character   Class :character  
 Median : NA   Mode  :character   Mode  :character   Mode  :character  
 Mean   :NaN                                                           
 3rd Qu.: NA                                                           
 Max.   : NA                                                           
    air_time      distance        hour         minute      time_hour  
 Min.   : NA   Min.   : NA   Min.   : NA   Min.   : NA   Min.   :NA   
 1st Qu.: NA   1st Qu.: NA   1st Qu.: NA   1st Qu.: NA   1st Qu.:NA   
 Median : NA   Median : NA   Median : NA   Median : NA   Median :NA   
 Mean   :NaN   Mean   :NaN   Mean   :NaN   Mean   :NaN   Mean   :NaN  
 3rd Qu.: NA   3rd Qu.: NA   3rd Qu.: NA   3rd Qu.: NA   3rd Qu.:NA   
 Max.   : NA   Max.   : NA   Max.   : NA   Max.   : NA   Max.   :NA   
# Side-by-side boxplots of departure delay, one box per carrier.
#
# `fill` is mapped to `carrier` rather than `dest` so the legend describes the
# same variable as the x axis. The previous version titled the legend
# "Destinations" but labelled it with five airline names, which mislabels the
# fill scale and does not match the number of destination levels. The legend
# labels now come straight from the values in `flights2$carrier`.
p1 <- ggplot(flights2, aes(x = carrier, y = dep_delay, fill = carrier)) +
  # `dep_delay` contains NAs (see the summary of `flights`); drop them
  # without emitting a warning.
  geom_boxplot(na.rm = TRUE) +
  scale_fill_discrete(name = "Carrier") +
  labs(
    x = "Carrier",
    y = "Departure delay (minutes)",
    title = "Side-by-Side Boxplot of New York Carriers",
    caption = "Source: nycflights23 package"
  )
p1