Anggota Kelompok:

  1. Marvelio Jonathan (5052241017)
  2. Radithya Mahaputra (5052241019)
  3. Najwa Aulia (5052241035)
  4. Annisa Maulida (5052241038)

Data Flights

# Load nycflights13, the package that provides the `flights` data set.
# NOTE(review): glimpse(), %>% and ggplot() are used in this report without a
# visible library() call; presumably dplyr/ggplot2 (or tidyverse) are attached
# in a setup chunk that is not shown here -- confirm.
library(nycflights13)
# Preview every column together with its type and first few values.
glimpse(flights)
## Rows: 336,776
## Columns: 19
## $ year           <int> 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2…
## $ month          <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ day            <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ dep_time       <int> 517, 533, 542, 544, 554, 554, 555, 557, 557, 558, 558, …
## $ sched_dep_time <int> 515, 529, 540, 545, 600, 558, 600, 600, 600, 600, 600, …
## $ dep_delay      <dbl> 2, 4, 2, -1, -6, -4, -5, -3, -3, -2, -2, -2, -2, -2, -1…
## $ arr_time       <int> 830, 850, 923, 1004, 812, 740, 913, 709, 838, 753, 849,…
## $ sched_arr_time <int> 819, 830, 850, 1022, 837, 728, 854, 723, 846, 745, 851,…
## $ arr_delay      <dbl> 11, 20, 33, -18, -25, 12, 19, -14, -8, 8, -2, -3, 7, -1…
## $ carrier        <chr> "UA", "UA", "AA", "B6", "DL", "UA", "B6", "EV", "B6", "…
## $ flight         <int> 1545, 1714, 1141, 725, 461, 1696, 507, 5708, 79, 301, 4…
## $ tailnum        <chr> "N14228", "N24211", "N619AA", "N804JB", "N668DN", "N394…
## $ origin         <chr> "EWR", "LGA", "JFK", "JFK", "LGA", "EWR", "EWR", "LGA",…
## $ dest           <chr> "IAH", "IAH", "MIA", "BQN", "ATL", "ORD", "FLL", "IAD",…
## $ air_time       <dbl> 227, 227, 160, 183, 116, 150, 158, 53, 140, 138, 149, 1…
## $ distance       <dbl> 1400, 1416, 1089, 1576, 762, 719, 1065, 229, 944, 733, …
## $ hour           <dbl> 5, 5, 5, 5, 6, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 6, 6, 6…
## $ minute         <dbl> 15, 29, 40, 45, 0, 58, 0, 0, 0, 0, 0, 0, 0, 0, 0, 59, 0…
## $ time_hour      <dttm> 2013-01-01 05:00:00, 2013-01-01 05:00:00, 2013-01-01 0…

Check Missing Values

# Count the NA entries in every column; the result is a one-row tibble.
summarise(
  flights,
  across(everything(), ~ sum(is.na(.x)))
)
## # A tibble: 1 × 19
##    year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##   <int> <int> <int>    <int>          <int>     <int>    <int>          <int>
## 1     0     0     0     8255              0      8255     8713              0
## # ℹ 11 more variables: arr_delay <int>, carrier <int>, flight <int>,
## #   tailnum <int>, origin <int>, dest <int>, air_time <int>, distance <int>,
## #   hour <int>, minute <int>, time_hour <int>

Kolom yang memiliki missing values: dep_time, dep_delay, arr_time, arr_delay, tailnum, dan air_time.

Check Duplicate Records

# Number of rows that exactly repeat an earlier row.
sum(duplicated(flights))
## [1] 0

Tidak terdapat data yang terduplikasi

Check Outliers

Mendeteksi outliers pada data bertipe numerik

# Boxplot of dep_delay to inspect its spread and extreme values.
flights %>%
  ggplot(aes(y = dep_delay)) +
  geom_boxplot()
## Warning: Removed 8255 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

Terdapat outliers pada data dep_delay. Namun, nilai-nilai ekstrem pada dep_delay tetap relevan dengan kondisi dalam dunia nyata dan tidak menunjukkan indikasi kesalahan input sehingga data tetap dipertahankan.

# Boxplot of arr_delay to inspect its spread and extreme values.
flights %>%
  ggplot(aes(y = arr_delay)) +
  geom_boxplot()
## Warning: Removed 9430 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

Terdapat outliers pada data arr_delay. Namun, nilai-nilai ekstrem pada arr_delay tetap relevan dengan kondisi dalam dunia nyata dan tidak menunjukkan indikasi kesalahan input sehingga data tetap dipertahankan.

# Boxplot of air_time to inspect its spread and extreme values.
flights %>%
  ggplot(aes(y = air_time)) +
  geom_boxplot()
## Warning: Removed 9430 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

Terdapat outliers pada data air_time. Namun, nilai-nilai ekstrem pada air_time tetap relevan dengan kondisi dalam dunia nyata dan tidak menunjukkan indikasi kesalahan input sehingga data tetap dipertahankan.

# Boxplot of distance to inspect its spread and extreme values.
flights %>%
  ggplot(aes(y = distance)) +
  geom_boxplot()

Terdapat outliers pada data distance. Namun, nilai-nilai ekstrem pada distance tetap relevan dengan kondisi dalam dunia nyata dan tidak menunjukkan indikasi kesalahan input sehingga data tetap dipertahankan.

# Boxplot of hour to inspect its spread and extreme values.
flights %>%
  ggplot(aes(y = hour)) +
  geom_boxplot()

Tidak terdapat outlier pada data hour.

# Boxplot of minute to inspect its spread and extreme values.
flights %>%
  ggplot(aes(y = minute)) +
  geom_boxplot()

Tidak terdapat outlier pada data minute.

Check Inconsistent Categories

Mendeteksi inconsistent categories pada data bertipe string

# Tally the rows for each distinct carrier value.
count(flights, carrier)
## # A tibble: 16 × 2
##    carrier     n
##    <chr>   <int>
##  1 9E      18460
##  2 AA      32729
##  3 AS        714
##  4 B6      54635
##  5 DL      48110
##  6 EV      54173
##  7 F9        685
##  8 FL       3260
##  9 HA        342
## 10 MQ      26397
## 11 OO         32
## 12 UA      58665
## 13 US      20536
## 14 VX       5162
## 15 WN      12275
## 16 YV        601

Tidak terdapat inconsistent category pada data carrier.

# Tally the rows for each distinct tailnum value.
count(flights, tailnum)
## # A tibble: 4,044 × 2
##    tailnum     n
##    <chr>   <int>
##  1 D942DN      4
##  2 N0EGMQ    371
##  3 N10156    153
##  4 N102UW     48
##  5 N103US     46
##  6 N104UW     47
##  7 N10575    289
##  8 N105UW     45
##  9 N107US     41
## 10 N108UW     60
## # ℹ 4,034 more rows

Tidak terdapat inconsistent category pada data tailnum.

# Tally the rows for each distinct origin value.
count(flights, origin)
## # A tibble: 3 × 2
##   origin      n
##   <chr>   <int>
## 1 EWR    120835
## 2 JFK    111279
## 3 LGA    104662

Tidak terdapat inconsistent category pada data origin.

# Tally the rows for each distinct dest value.
count(flights, dest)
## # A tibble: 105 × 2
##    dest      n
##    <chr> <int>
##  1 ABQ     254
##  2 ACK     265
##  3 ALB     439
##  4 ANC       8
##  5 ATL   17215
##  6 AUS    2439
##  7 AVL     275
##  8 BDL     443
##  9 BGR     375
## 10 BHM     297
## # ℹ 95 more rows

Tidak terdapat inconsistent category pada data dest.

# Tally the rows for each distinct time_hour value.
count(flights, time_hour)
## # A tibble: 6,936 × 2
##    time_hour               n
##    <dttm>              <int>
##  1 2013-01-01 05:00:00     6
##  2 2013-01-01 06:00:00    52
##  3 2013-01-01 07:00:00    49
##  4 2013-01-01 08:00:00    58
##  5 2013-01-01 09:00:00    56
##  6 2013-01-01 10:00:00    39
##  7 2013-01-01 11:00:00    37
##  8 2013-01-01 12:00:00    56
##  9 2013-01-01 13:00:00    54
## 10 2013-01-01 14:00:00    48
## # ℹ 6,926 more rows

Tidak terdapat inconsistent category pada data time_hour.

Check Incorrect Data Types

# Report the class of every column as a single string per column.
# class() returns two values for time_hour ("POSIXct" "POSIXt"); passing it
# straight to summarise() produced a spurious second row and a dplyr 1.1.0
# deprecation warning. Collapsing with paste() keeps exactly one row.
flights %>%
  summarise(across(everything(), ~ paste(class(.x), collapse = "/")))
## # A tibble: 1 × 19
##   year    month  day   dep_time sched_dep_time dep_delay arr_time sched_arr_time
##   <chr>   <chr>  <chr> <chr>    <chr>          <chr>     <chr>    <chr>         
## 1 integer integ… inte… integer  integer        numeric   integer  integer       
## # ℹ 11 more variables: arr_delay <chr>, carrier <chr>, flight <chr>,
## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <chr>, distance <chr>,
## #   hour <chr>, minute <chr>, time_hour <chr>

Terdapat incorrect data type pada kolom flight, yang seharusnya bertipe character karena flight merupakan nomor penerbangan (kode identifikasi), bukan angka untuk dihitung.

Check Logical Inconsistencies

# List any rows whose distance is negative (an impossible value).
filter(flights, distance < 0)
## # A tibble: 0 × 19
## # ℹ 19 variables: year <int>, month <int>, day <int>, dep_time <int>,
## #   sched_dep_time <int>, dep_delay <dbl>, arr_time <int>,
## #   sched_arr_time <int>, arr_delay <dbl>, carrier <chr>, flight <int>,
## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## #   hour <dbl>, minute <dbl>, time_hour <dttm>

Tidak terdapat logical inconsistencies pada data distance.

# List any rows whose air_time is negative (an impossible value).
filter(flights, air_time < 0)
## # A tibble: 0 × 19
## # ℹ 19 variables: year <int>, month <int>, day <int>, dep_time <int>,
## #   sched_dep_time <int>, dep_delay <dbl>, arr_time <int>,
## #   sched_arr_time <int>, arr_delay <dbl>, carrier <chr>, flight <int>,
## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## #   hour <dbl>, minute <dbl>, time_hour <dttm>

Tidak terdapat logical inconsistencies pada data air_time.

Cleaning Up

Menghapus baris yang memiliki missing value dan mengubah tipe kolom flight menjadi character

# Build the cleaned data set: drop every row that has an NA in any column,
# then store flight as character instead of integer.
flights_cleaned <- mutate(
  drop_na(flights),
  flight = as.character(flight)
)

# Per-column summary of the cleaned data.
summary(flights_cleaned)
##       year          month             day           dep_time    sched_dep_time
##  Min.   :2013   Min.   : 1.000   Min.   : 1.00   Min.   :   1   Min.   : 500  
##  1st Qu.:2013   1st Qu.: 4.000   1st Qu.: 8.00   1st Qu.: 907   1st Qu.: 905  
##  Median :2013   Median : 7.000   Median :16.00   Median :1400   Median :1355  
##  Mean   :2013   Mean   : 6.565   Mean   :15.74   Mean   :1349   Mean   :1340  
##  3rd Qu.:2013   3rd Qu.:10.000   3rd Qu.:23.00   3rd Qu.:1744   3rd Qu.:1729  
##  Max.   :2013   Max.   :12.000   Max.   :31.00   Max.   :2400   Max.   :2359  
##    dep_delay          arr_time    sched_arr_time   arr_delay       
##  Min.   : -43.00   Min.   :   1   Min.   :   1   Min.   : -86.000  
##  1st Qu.:  -5.00   1st Qu.:1104   1st Qu.:1122   1st Qu.: -17.000  
##  Median :  -2.00   Median :1535   Median :1554   Median :  -5.000  
##  Mean   :  12.56   Mean   :1502   Mean   :1533   Mean   :   6.895  
##  3rd Qu.:  11.00   3rd Qu.:1940   3rd Qu.:1944   3rd Qu.:  14.000  
##  Max.   :1301.00   Max.   :2400   Max.   :2359   Max.   :1272.000  
##    carrier             flight            tailnum             origin         
##  Length:327346      Length:327346      Length:327346      Length:327346     
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##      dest              air_time        distance         hour      
##  Length:327346      Min.   : 20.0   Min.   :  80   Min.   : 5.00  
##  Class :character   1st Qu.: 82.0   1st Qu.: 509   1st Qu.: 9.00  
##  Mode  :character   Median :129.0   Median : 888   Median :13.00  
##                     Mean   :150.7   Mean   :1048   Mean   :13.14  
##                     3rd Qu.:192.0   3rd Qu.:1389   3rd Qu.:17.00  
##                     Max.   :695.0   Max.   :4983   Max.   :23.00  
##      minute        time_hour                  
##  Min.   : 0.00   Min.   :2013-01-01 05:00:00  
##  1st Qu.: 8.00   1st Qu.:2013-04-05 06:00:00  
##  Median :29.00   Median :2013-07-04 09:00:00  
##  Mean   :26.23   Mean   :2013-07-03 17:56:45  
##  3rd Qu.:44.00   3rd Qu.:2013-10-01 18:00:00  
##  Max.   :59.00   Max.   :2013-12-31 23:00:00