Install packages (“nycflights13”)

load packages

library(dplyr)
library(ggplot2)
library(tidyverse)
library(nycflights13)
data("flights")
glimpse(flights)
## Rows: 336,776
## Columns: 19
## $ year           <int> 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2…
## $ month          <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ day            <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ dep_time       <int> 517, 533, 542, 544, 554, 554, 555, 557, 557, 558, 558, …
## $ sched_dep_time <int> 515, 529, 540, 545, 600, 558, 600, 600, 600, 600, 600, …
## $ dep_delay      <dbl> 2, 4, 2, -1, -6, -4, -5, -3, -3, -2, -2, -2, -2, -2, -1…
## $ arr_time       <int> 830, 850, 923, 1004, 812, 740, 913, 709, 838, 753, 849,…
## $ sched_arr_time <int> 819, 830, 850, 1022, 837, 728, 854, 723, 846, 745, 851,…
## $ arr_delay      <dbl> 11, 20, 33, -18, -25, 12, 19, -14, -8, 8, -2, -3, 7, -1…
## $ carrier        <chr> "UA", "UA", "AA", "B6", "DL", "UA", "B6", "EV", "B6", "…
## $ flight         <int> 1545, 1714, 1141, 725, 461, 1696, 507, 5708, 79, 301, 4…
## $ tailnum        <chr> "N14228", "N24211", "N619AA", "N804JB", "N668DN", "N394…
## $ origin         <chr> "EWR", "LGA", "JFK", "JFK", "LGA", "EWR", "EWR", "LGA",…
## $ dest           <chr> "IAH", "IAH", "MIA", "BQN", "ATL", "ORD", "FLL", "IAD",…
## $ air_time       <dbl> 227, 227, 160, 183, 116, 150, 158, 53, 140, 138, 149, 1…
## $ distance       <dbl> 1400, 1416, 1089, 1576, 762, 719, 1065, 229, 944, 733, …
## $ hour           <dbl> 5, 5, 5, 5, 6, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 6, 6, 6…
## $ minute         <dbl> 15, 29, 40, 45, 0, 58, 0, 0, 0, 0, 0, 0, 0, 0, 0, 59, 0…
## $ time_hour      <dttm> 2013-01-01 05:00:00, 2013-01-01 05:00:00, 2013-01-01 0…
str(flights)
## tibble [336,776 × 19] (S3: tbl_df/tbl/data.frame)
##  $ year          : int [1:336776] 2013 2013 2013 2013 2013 2013 2013 2013 2013 2013 ...
##  $ month         : int [1:336776] 1 1 1 1 1 1 1 1 1 1 ...
##  $ day           : int [1:336776] 1 1 1 1 1 1 1 1 1 1 ...
##  $ dep_time      : int [1:336776] 517 533 542 544 554 554 555 557 557 558 ...
##  $ sched_dep_time: int [1:336776] 515 529 540 545 600 558 600 600 600 600 ...
##  $ dep_delay     : num [1:336776] 2 4 2 -1 -6 -4 -5 -3 -3 -2 ...
##  $ arr_time      : int [1:336776] 830 850 923 1004 812 740 913 709 838 753 ...
##  $ sched_arr_time: int [1:336776] 819 830 850 1022 837 728 854 723 846 745 ...
##  $ arr_delay     : num [1:336776] 11 20 33 -18 -25 12 19 -14 -8 8 ...
##  $ carrier       : chr [1:336776] "UA" "UA" "AA" "B6" ...
##  $ flight        : int [1:336776] 1545 1714 1141 725 461 1696 507 5708 79 301 ...
##  $ tailnum       : chr [1:336776] "N14228" "N24211" "N619AA" "N804JB" ...
##  $ origin        : chr [1:336776] "EWR" "LGA" "JFK" "JFK" ...
##  $ dest          : chr [1:336776] "IAH" "IAH" "MIA" "BQN" ...
##  $ air_time      : num [1:336776] 227 227 160 183 116 150 158 53 140 138 ...
##  $ distance      : num [1:336776] 1400 1416 1089 1576 762 ...
##  $ hour          : num [1:336776] 5 5 5 5 6 5 6 6 6 6 ...
##  $ minute        : num [1:336776] 15 29 40 45 0 58 0 0 0 0 ...
##  $ time_hour     : POSIXct[1:336776], format: "2013-01-01 05:00:00" "2013-01-01 05:00:00" ...

check missing value

flights %>%
  summarise(across(everything(), ~ sum(is.na(.))))
## # A tibble: 1 × 19
##    year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##   <int> <int> <int>    <int>          <int>     <int>    <int>          <int>
## 1     0     0     0     8255              0      8255     8713              0
## # ℹ 11 more variables: arr_delay <int>, carrier <int>, flight <int>,
## #   tailnum <int>, origin <int>, dest <int>, air_time <int>, distance <int>,
## #   hour <int>, minute <int>, time_hour <int>

check duplicates

flights %>%
  duplicated() %>%
  sum()
## [1] 0

view duplicates

flights %>% filter(duplicated(.))
## # A tibble: 0 × 19
## # ℹ 19 variables: year <int>, month <int>, day <int>, dep_time <int>,
## #   sched_dep_time <int>, dep_delay <dbl>, arr_time <int>,
## #   sched_arr_time <int>, arr_delay <dbl>, carrier <chr>, flight <int>,
## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## #   hour <dbl>, minute <dbl>, time_hour <dttm>

check outlier

#Boxplot
ggplot(flights, aes(x = origin, y = air_time )) +
  geom_boxplot()+
  labs(title = "Boxplot of Air Time by Origin", x = "Origin", y = "Air Time")
## Warning: Removed 9430 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

IQR Method

Q1 <- quantile(flights$air_time, 0.25, na.rm = TRUE)
Q3 <- quantile(flights$air_time, 0.75, na.rm = TRUE)
IQR <- Q3 - Q1
outlier_air_time <- flights %>%
  filter(air_time < (Q1 - 1.5 * IQR) |
           air_time > (Q3 + 1.5 * IQR))
nrow(outlier_air_time)
## [1] 5448

check inconsistent categories

flights %>% count(carrier)
## # A tibble: 16 × 2
##    carrier     n
##    <chr>   <int>
##  1 9E      18460
##  2 AA      32729
##  3 AS        714
##  4 B6      54635
##  5 DL      48110
##  6 EV      54173
##  7 F9        685
##  8 FL       3260
##  9 HA        342
## 10 MQ      26397
## 11 OO         32
## 12 UA      58665
## 13 US      20536
## 14 VX       5162
## 15 WN      12275
## 16 YV        601
flights %>% count(origin)
## # A tibble: 3 × 2
##   origin      n
##   <chr>   <int>
## 1 EWR    120835
## 2 JFK    111279
## 3 LGA    104662
flights %>% count(dest)
## # A tibble: 105 × 2
##    dest      n
##    <chr> <int>
##  1 ABQ     254
##  2 ACK     265
##  3 ALB     439
##  4 ANC       8
##  5 ATL   17215
##  6 AUS    2439
##  7 AVL     275
##  8 BDL     443
##  9 BGR     375
## 10 BHM     297
## # ℹ 95 more rows
flights %>% count(tailnum)
## # A tibble: 4,044 × 2
##    tailnum     n
##    <chr>   <int>
##  1 D942DN      4
##  2 N0EGMQ    371
##  3 N10156    153
##  4 N102UW     48
##  5 N103US     46
##  6 N104UW     47
##  7 N10575    289
##  8 N105UW     45
##  9 N107US     41
## 10 N108UW     60
## # ℹ 4,034 more rows

check incorrect data types

flights%>% summarise(across(everything(), class))
## Warning: Returning more (or less) than 1 row per `summarise()` group was deprecated in
## dplyr 1.1.0.
## ℹ Please use `reframe()` instead.
## ℹ When switching from `summarise()` to `reframe()`, remember that `reframe()`
##   always returns an ungrouped data frame and adjust accordingly.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## # A tibble: 2 × 19
##   year    month  day   dep_time sched_dep_time dep_delay arr_time sched_arr_time
##   <chr>   <chr>  <chr> <chr>    <chr>          <chr>     <chr>    <chr>         
## 1 integer integ… inte… integer  integer        numeric   integer  integer       
## 2 integer integ… inte… integer  integer        numeric   integer  integer       
## # ℹ 11 more variables: arr_delay <chr>, carrier <chr>, flight <chr>,
## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <chr>, distance <chr>,
## #   hour <chr>, minute <chr>, time_hour <chr>

check logical inconsistencies

flights%>%
  filter(air_time < 0 | dep_delay < -100)
## # A tibble: 0 × 19
## # ℹ 19 variables: year <int>, month <int>, day <int>, dep_time <int>,
## #   sched_dep_time <int>, dep_delay <dbl>, arr_time <int>,
## #   sched_arr_time <int>, arr_delay <dbl>, carrier <chr>, flight <int>,
## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## #   hour <dbl>, minute <dbl>, time_hour <dttm>

air_time < 0 : waktu yang dibutuhkan untuk penerbangan tidak mungkin negatif

dep_delay < -100 : pesawat seharusnya tidak berangkat lebih cepat dari 100 menit

tidak ditemukan logical inconsistencies

clean up dataset

flights_cleaned <- flights %>%
  drop_na()

summary(flights_cleaned)
##       year          month             day           dep_time    sched_dep_time
##  Min.   :2013   Min.   : 1.000   Min.   : 1.00   Min.   :   1   Min.   : 500  
##  1st Qu.:2013   1st Qu.: 4.000   1st Qu.: 8.00   1st Qu.: 907   1st Qu.: 905  
##  Median :2013   Median : 7.000   Median :16.00   Median :1400   Median :1355  
##  Mean   :2013   Mean   : 6.565   Mean   :15.74   Mean   :1349   Mean   :1340  
##  3rd Qu.:2013   3rd Qu.:10.000   3rd Qu.:23.00   3rd Qu.:1744   3rd Qu.:1729  
##  Max.   :2013   Max.   :12.000   Max.   :31.00   Max.   :2400   Max.   :2359  
##    dep_delay          arr_time    sched_arr_time   arr_delay       
##  Min.   : -43.00   Min.   :   1   Min.   :   1   Min.   : -86.000  
##  1st Qu.:  -5.00   1st Qu.:1104   1st Qu.:1122   1st Qu.: -17.000  
##  Median :  -2.00   Median :1535   Median :1554   Median :  -5.000  
##  Mean   :  12.56   Mean   :1502   Mean   :1533   Mean   :   6.895  
##  3rd Qu.:  11.00   3rd Qu.:1940   3rd Qu.:1944   3rd Qu.:  14.000  
##  Max.   :1301.00   Max.   :2400   Max.   :2359   Max.   :1272.000  
##    carrier              flight       tailnum             origin         
##  Length:327346      Min.   :   1   Length:327346      Length:327346     
##  Class :character   1st Qu.: 544   Class :character   Class :character  
##  Mode  :character   Median :1467   Mode  :character   Mode  :character  
##                     Mean   :1943                                        
##                     3rd Qu.:3412                                        
##                     Max.   :8500                                        
##      dest              air_time        distance         hour      
##  Length:327346      Min.   : 20.0   Min.   :  80   Min.   : 5.00  
##  Class :character   1st Qu.: 82.0   1st Qu.: 509   1st Qu.: 9.00  
##  Mode  :character   Median :129.0   Median : 888   Median :13.00  
##                     Mean   :150.7   Mean   :1048   Mean   :13.14  
##                     3rd Qu.:192.0   3rd Qu.:1389   3rd Qu.:17.00  
##                     Max.   :695.0   Max.   :4983   Max.   :23.00  
##      minute        time_hour                     
##  Min.   : 0.00   Min.   :2013-01-01 05:00:00.00  
##  1st Qu.: 8.00   1st Qu.:2013-04-05 06:00:00.00  
##  Median :29.00   Median :2013-07-04 09:00:00.00  
##  Mean   :26.23   Mean   :2013-07-03 17:56:45.44  
##  3rd Qu.:44.00   3rd Qu.:2013-10-01 18:00:00.00  
##  Max.   :59.00   Max.   :2013-12-31 23:00:00.00