Breakout Room 5:

Import Data

data("flights")
glimpse(flights)
## Rows: 336,776
## Columns: 19
## $ year           <int> 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2…
## $ month          <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ day            <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ dep_time       <int> 517, 533, 542, 544, 554, 554, 555, 557, 557, 558, 558, …
## $ sched_dep_time <int> 515, 529, 540, 545, 600, 558, 600, 600, 600, 600, 600, …
## $ dep_delay      <dbl> 2, 4, 2, -1, -6, -4, -5, -3, -3, -2, -2, -2, -2, -2, -1…
## $ arr_time       <int> 830, 850, 923, 1004, 812, 740, 913, 709, 838, 753, 849,…
## $ sched_arr_time <int> 819, 830, 850, 1022, 837, 728, 854, 723, 846, 745, 851,…
## $ arr_delay      <dbl> 11, 20, 33, -18, -25, 12, 19, -14, -8, 8, -2, -3, 7, -1…
## $ carrier        <chr> "UA", "UA", "AA", "B6", "DL", "UA", "B6", "EV", "B6", "…
## $ flight         <int> 1545, 1714, 1141, 725, 461, 1696, 507, 5708, 79, 301, 4…
## $ tailnum        <chr> "N14228", "N24211", "N619AA", "N804JB", "N668DN", "N394…
## $ origin         <chr> "EWR", "LGA", "JFK", "JFK", "LGA", "EWR", "EWR", "LGA",…
## $ dest           <chr> "IAH", "IAH", "MIA", "BQN", "ATL", "ORD", "FLL", "IAD",…
## $ air_time       <dbl> 227, 227, 160, 183, 116, 150, 158, 53, 140, 138, 149, 1…
## $ distance       <dbl> 1400, 1416, 1089, 1576, 762, 719, 1065, 229, 944, 733, …
## $ hour           <dbl> 5, 5, 5, 5, 6, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 6, 6, 6…
## $ minute         <dbl> 15, 29, 40, 45, 0, 58, 0, 0, 0, 0, 0, 0, 0, 0, 0, 59, 0…
## $ time_hour      <dttm> 2013-01-01 05:00:00, 2013-01-01 05:00:00, 2013-01-01 0…

Cek missing values

flights %>%
    summarise(across(everything(), ~ sum(is.na(.))))
## # A tibble: 1 × 19
##    year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##   <int> <int> <int>    <int>          <int>     <int>    <int>          <int>
## 1     0     0     0     8255              0      8255     8713              0
## # ℹ 11 more variables: arr_delay <int>, carrier <int>, flight <int>,
## #   tailnum <int>, origin <int>, dest <int>, air_time <int>, distance <int>,
## #   hour <int>, minute <int>, time_hour <int>
flights %>%
  filter(is.na(dep_time))
## # A tibble: 8,255 × 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1       NA           1630        NA       NA           1815
##  2  2013     1     1       NA           1935        NA       NA           2240
##  3  2013     1     1       NA           1500        NA       NA           1825
##  4  2013     1     1       NA            600        NA       NA            901
##  5  2013     1     2       NA           1540        NA       NA           1747
##  6  2013     1     2       NA           1620        NA       NA           1746
##  7  2013     1     2       NA           1355        NA       NA           1459
##  8  2013     1     2       NA           1420        NA       NA           1644
##  9  2013     1     2       NA           1321        NA       NA           1536
## 10  2013     1     2       NA           1545        NA       NA           1910
## # ℹ 8,245 more rows
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## #   hour <dbl>, minute <dbl>, time_hour <dttm>

terdapat banyak data yang merupakan NA salah satunya dep_time yang memiliki 8255 nilai NA

Drop nilai NA

flights = flights %>%
  drop_na()

Cek duplicate rows

flights %>%
  duplicated() %>%
  sum()
## [1] 0
flights %>% filter(duplicated(.))
## # A tibble: 0 × 19
## # ℹ 19 variables: year <int>, month <int>, day <int>, dep_time <int>,
## #   sched_dep_time <int>, dep_delay <dbl>, arr_time <int>,
## #   sched_arr_time <int>, arr_delay <dbl>, carrier <chr>, flight <int>,
## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## #   hour <dbl>, minute <dbl>, time_hour <dttm>

tidak ada data yang duplikat

Cek outliers

Boxplot method

ggplot(flights, aes(x = origin, y = distance, fill = origin)) +
  geom_boxplot(outlier.color = 'purple')

terlihat beberapa outlier pada variabel distance di origin EWR dan JFK

IQR method (JFK)

flights_JFK = flights %>%
  filter(origin == "JFK")

Q1 = quantile(flights_JFK$distance, 0.25)
Q3 = quantile(flights_JFK$distance, 0.75)
IQR = Q3 - Q1

outlier_JFK = flights_JFK %>%
  filter(distance < (Q1 - 1.5 * IQR) |
         distance > (Q3 + 1.5 * IQR))
outlier_JFK
## # A tibble: 342 × 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1      857            900        -3     1516           1530
##  2  2013     1     2      909            900         9     1525           1530
##  3  2013     1     3      914            900        14     1504           1530
##  4  2013     1     4      900            900         0     1516           1530
##  5  2013     1     5      858            900        -2     1519           1530
##  6  2013     1     6     1019            900        79     1558           1530
##  7  2013     1     7     1042            900       102     1620           1530
##  8  2013     1     8      901            900         1     1504           1530
##  9  2013     1     9      641            900      1301     1242           1530
## 10  2013     1    10      859            900        -1     1449           1530
## # ℹ 332 more rows
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## #   hour <dbl>, minute <dbl>, time_hour <dttm>

Ternyata jarak dari JFK ke HNL memang jauh sehingga dibiarkan

IQR method (EWR)

flights_EWR = flights %>%
  filter(origin == "EWR")

Q1 = quantile(flights_EWR$distance, 0.25)
Q3 = quantile(flights_EWR$distance, 0.75)
IQR = Q3 - Q1

outlier_EWR = flights_EWR %>%
  filter(distance < (Q1 - 1.5 * IQR) |
         distance > (Q3 + 1.5 * IQR))
outlier_EWR
## # A tibble: 367 × 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1     1344           1344         0     2005           1944
##  2  2013     1     2     1344           1344         0     1940           1944
##  3  2013     1     3     1418           1341        37     2006           1935
##  4  2013     1     4     1343           1341         2     1932           1935
##  5  2013     1     5     1329           1335        -6     1850           1935
##  6  2013     1     6     1353           1341        12     1927           1941
##  7  2013     1     7     1340           1341        -1     1925           1935
##  8  2013     1     8     1344           1341         3     1951           1935
##  9  2013     1     9     1340           1341        -1     2019           1935
## 10  2013     1    10     1342           1341         1     1935           1935
## # ℹ 357 more rows
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## #   hour <dbl>, minute <dbl>, time_hour <dttm>

Ternyata jarak dari EWR ke HNL dan ANC memang jauh sehingga dibiarkan

IQR method (LGA)

flights_LGA = flights %>%
  filter(origin == "LGA")

Q1 = quantile(flights_LGA$distance, 0.25)
Q3 = quantile(flights_LGA$distance, 0.75)
IQR = Q3 - Q1

outlier_LGA = flights_LGA %>%
  filter(distance < (Q1 - 1.5 * IQR) |
         distance > (Q3 + 1.5 * IQR))
outlier_LGA
## # A tibble: 0 × 19
## # ℹ 19 variables: year <int>, month <int>, day <int>, dep_time <int>,
## #   sched_dep_time <int>, dep_delay <dbl>, arr_time <int>,
## #   sched_arr_time <int>, arr_delay <dbl>, carrier <chr>, flight <int>,
## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## #   hour <dbl>, minute <dbl>, time_hour <dttm>

Di LGA tidak ada outlier

Kesimpulan: keep outlier

Cek inconsistent categories

flights %>% count(origin)
## # A tibble: 3 × 2
##   origin      n
##   <chr>   <int>
## 1 EWR    117127
## 2 JFK    109079
## 3 LGA    101140
flights %>% count(dest)
## # A tibble: 104 × 2
##    dest      n
##    <chr> <int>
##  1 ABQ     254
##  2 ACK     264
##  3 ALB     418
##  4 ANC       8
##  5 ATL   16837
##  6 AUS    2411
##  7 AVL     261
##  8 BDL     412
##  9 BGR     358
## 10 BHM     269
## # ℹ 94 more rows

Semua merupakan kode 3-huruf bandara, konsisten

Cek incorrect data types

flights %>% summarise(across(everything(), class))
## Warning: Returning more (or less) than 1 row per `summarise()` group was deprecated in
## dplyr 1.1.0.
## ℹ Please use `reframe()` instead.
## ℹ When switching from `summarise()` to `reframe()`, remember that `reframe()`
##   always returns an ungrouped data frame and adjust accordingly.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## # A tibble: 2 × 19
##   year    month  day   dep_time sched_dep_time dep_delay arr_time sched_arr_time
##   <chr>   <chr>  <chr> <chr>    <chr>          <chr>     <chr>    <chr>         
## 1 integer integ… inte… integer  integer        numeric   integer  integer       
## 2 integer integ… inte… integer  integer        numeric   integer  integer       
## # ℹ 11 more variables: arr_delay <chr>, carrier <chr>, flight <chr>,
## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <chr>, distance <chr>,
## #   hour <chr>, minute <chr>, time_hour <chr>

setelah membaca di internet ternyata flight merupakan kode seri dari pesawat, sehingga seharusnya diganti dari integer menjadi character

Ganti datatype

flights <- flights %>%
  mutate(flight = as.character(flight))

flights %>% summarise(across(everything(), class))
## Warning: Returning more (or less) than 1 row per `summarise()` group was deprecated in
## dplyr 1.1.0.
## ℹ Please use `reframe()` instead.
## ℹ When switching from `summarise()` to `reframe()`, remember that `reframe()`
##   always returns an ungrouped data frame and adjust accordingly.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## # A tibble: 2 × 19
##   year    month  day   dep_time sched_dep_time dep_delay arr_time sched_arr_time
##   <chr>   <chr>  <chr> <chr>    <chr>          <chr>     <chr>    <chr>         
## 1 integer integ… inte… integer  integer        numeric   integer  integer       
## 2 integer integ… inte… integer  integer        numeric   integer  integer       
## # ℹ 11 more variables: arr_delay <chr>, carrier <chr>, flight <chr>,
## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <chr>, distance <chr>,
## #   hour <chr>, minute <chr>, time_hour <chr>

Cek logical inconsistencies

flights %>%
  filter(distance < 0 | air_time < 0)
## # A tibble: 0 × 19
## # ℹ 19 variables: year <int>, month <int>, day <int>, dep_time <int>,
## #   sched_dep_time <int>, dep_delay <dbl>, arr_time <int>,
## #   sched_arr_time <int>, arr_delay <dbl>, carrier <chr>, flight <chr>,
## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## #   hour <dbl>, minute <dbl>, time_hour <dttm>

di distance dan air_time tidak ada data yang bernilai negatif, sehingga datanya masuk akal

Data bersih

summary(flights)
##       year          month             day           dep_time    sched_dep_time
##  Min.   :2013   Min.   : 1.000   Min.   : 1.00   Min.   :   1   Min.   : 500  
##  1st Qu.:2013   1st Qu.: 4.000   1st Qu.: 8.00   1st Qu.: 907   1st Qu.: 905  
##  Median :2013   Median : 7.000   Median :16.00   Median :1400   Median :1355  
##  Mean   :2013   Mean   : 6.565   Mean   :15.74   Mean   :1349   Mean   :1340  
##  3rd Qu.:2013   3rd Qu.:10.000   3rd Qu.:23.00   3rd Qu.:1744   3rd Qu.:1729  
##  Max.   :2013   Max.   :12.000   Max.   :31.00   Max.   :2400   Max.   :2359  
##    dep_delay          arr_time    sched_arr_time   arr_delay       
##  Min.   : -43.00   Min.   :   1   Min.   :   1   Min.   : -86.000  
##  1st Qu.:  -5.00   1st Qu.:1104   1st Qu.:1122   1st Qu.: -17.000  
##  Median :  -2.00   Median :1535   Median :1554   Median :  -5.000  
##  Mean   :  12.56   Mean   :1502   Mean   :1533   Mean   :   6.895  
##  3rd Qu.:  11.00   3rd Qu.:1940   3rd Qu.:1944   3rd Qu.:  14.000  
##  Max.   :1301.00   Max.   :2400   Max.   :2359   Max.   :1272.000  
##    carrier             flight            tailnum             origin         
##  Length:327346      Length:327346      Length:327346      Length:327346     
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##      dest              air_time        distance         hour      
##  Length:327346      Min.   : 20.0   Min.   :  80   Min.   : 5.00  
##  Class :character   1st Qu.: 82.0   1st Qu.: 509   1st Qu.: 9.00  
##  Mode  :character   Median :129.0   Median : 888   Median :13.00  
##                     Mean   :150.7   Mean   :1048   Mean   :13.14  
##                     3rd Qu.:192.0   3rd Qu.:1389   3rd Qu.:17.00  
##                     Max.   :695.0   Max.   :4983   Max.   :23.00  
##      minute        time_hour                     
##  Min.   : 0.00   Min.   :2013-01-01 05:00:00.00  
##  1st Qu.: 8.00   1st Qu.:2013-04-05 06:00:00.00  
##  Median :29.00   Median :2013-07-04 09:00:00.00  
##  Mean   :26.23   Mean   :2013-07-03 17:56:45.44  
##  3rd Qu.:44.00   3rd Qu.:2013-10-01 18:00:00.00  
##  Max.   :59.00   Max.   :2013-12-31 23:00:00.00