Nama Anggota Kelompok 3:

- Dewa Putra Yudha (5052241012)

- Agata Corinna Aulia Widyawati (5052241036)

- Inessa Regina Angelica Munda (5052241037)

Pendahuluan

Dalam tugas kali ini, kami menggunakan dataset flights dari paket nycflights13. Dataset ini berisikan informasi mengenai penerbangan dari 3 bandara utama di NYC pada tahun 2013. Tujuan dari laporan ini adalah untuk mengidentifikasi masalah yang ada pada dataset dan membersihkannya.

1. Memanggil Packages

library(nycflights13)
## Warning: package 'nycflights13' was built under R version 4.4.3
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.4.3
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)
## Warning: package 'tidyr' was built under R version 4.4.3
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.4.3

2. Memanggil dataset flights dan menampilkan struktur dan nilai dari flights

data("flights")
glimpse(flights)
## Rows: 336,776
## Columns: 19
## $ year           <int> 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2…
## $ month          <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ day            <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ dep_time       <int> 517, 533, 542, 544, 554, 554, 555, 557, 557, 558, 558, …
## $ sched_dep_time <int> 515, 529, 540, 545, 600, 558, 600, 600, 600, 600, 600, …
## $ dep_delay      <dbl> 2, 4, 2, -1, -6, -4, -5, -3, -3, -2, -2, -2, -2, -2, -1…
## $ arr_time       <int> 830, 850, 923, 1004, 812, 740, 913, 709, 838, 753, 849,…
## $ sched_arr_time <int> 819, 830, 850, 1022, 837, 728, 854, 723, 846, 745, 851,…
## $ arr_delay      <dbl> 11, 20, 33, -18, -25, 12, 19, -14, -8, 8, -2, -3, 7, -1…
## $ carrier        <chr> "UA", "UA", "AA", "B6", "DL", "UA", "B6", "EV", "B6", "…
## $ flight         <int> 1545, 1714, 1141, 725, 461, 1696, 507, 5708, 79, 301, 4…
## $ tailnum        <chr> "N14228", "N24211", "N619AA", "N804JB", "N668DN", "N394…
## $ origin         <chr> "EWR", "LGA", "JFK", "JFK", "LGA", "EWR", "EWR", "LGA",…
## $ dest           <chr> "IAH", "IAH", "MIA", "BQN", "ATL", "ORD", "FLL", "IAD",…
## $ air_time       <dbl> 227, 227, 160, 183, 116, 150, 158, 53, 140, 138, 149, 1…
## $ distance       <dbl> 1400, 1416, 1089, 1576, 762, 719, 1065, 229, 944, 733, …
## $ hour           <dbl> 5, 5, 5, 5, 6, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 6, 6, 6…
## $ minute         <dbl> 15, 29, 40, 45, 0, 58, 0, 0, 0, 0, 0, 0, 0, 0, 0, 59, 0…
## $ time_hour      <dttm> 2013-01-01 05:00:00, 2013-01-01 05:00:00, 2013-01-01 0…

3. Cek Missing Values

flights %>%
  summarise(across(everything(), ~ sum(is.na(.))))
## # A tibble: 1 × 19
##    year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##   <int> <int> <int>    <int>          <int>     <int>    <int>          <int>
## 1     0     0     0     8255              0      8255     8713              0
## # ℹ 11 more variables: arr_delay <int>, carrier <int>, flight <int>,
## #   tailnum <int>, origin <int>, dest <int>, air_time <int>, distance <int>,
## #   hour <int>, minute <int>, time_hour <int>

4. Cek Duplikasi

flights %>%
  duplicated() %>%
  sum()
## [1] 0
flights %>% filter(duplicated(.))
## # A tibble: 0 × 19
## # ℹ 19 variables: year <int>, month <int>, day <int>, dep_time <int>,
## #   sched_dep_time <int>, dep_delay <dbl>, arr_time <int>,
## #   sched_arr_time <int>, arr_delay <dbl>, carrier <chr>, flight <int>,
## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## #   hour <dbl>, minute <dbl>, time_hour <dttm>

5. Cek Outlier

5.1 Boxplot Sebelum Cleaning

ggplot(flights, aes(y = arr_delay)) +
 geom_boxplot(fill = "pink") +
  labs(title = "Boxplot Arrival Delay (Sebelum Cleaning)", y = "Arrival Delay (minutes)") +
  theme_light()
## Warning: Removed 9430 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

5.2 Deteksi Outlier dengan IQR

Q1 <- quantile(flights$arr_delay, 0.25, na.rm = TRUE)
Q3 <- quantile(flights$arr_delay, 0.75, na.rm = TRUE)
IQR <- Q3 - Q1

lower <- Q1 - 1.5 * IQR
upper <- Q3 + 1.5 * IQR
  
flights %>%
  filter(arr_delay < lower | arr_delay > upper)
## # A tibble: 27,880 × 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1      811            630       101     1047            830
##  2  2013     1     1      848           1835       853     1001           1950
##  3  2013     1     1      957            733       144     1056            853
##  4  2013     1     1     1114            900       134     1447           1222
##  5  2013     1     1     1120            944        96     1331           1213
##  6  2013     1     1     1255           1200        55     1451           1330
##  7  2013     1     1     1301           1150        71     1518           1345
##  8  2013     1     1     1337           1220        77     1649           1531
##  9  2013     1     1     1342           1320        22     1617           1504
## 10  2013     1     1     1400           1250        70     1645           1502
## # ℹ 27,870 more rows
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## #   hour <dbl>, minute <dbl>, time_hour <dttm>

6. Cek Inkonsisten Logika

flights %>% 
  filter(!is.na(arr_time), !is.na(dep_time)) %>%
  filter(arr_time < dep_time)
## # A tibble: 10,633 × 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1     1929           1920         9        3              7
##  2  2013     1     1     1939           1840        59       29           2151
##  3  2013     1     1     2058           2100        -2        8           2359
##  4  2013     1     1     2102           2108        -6      146            158
##  5  2013     1     1     2108           2057        11       25             39
##  6  2013     1     1     2120           2130       -10       16             18
##  7  2013     1     1     2121           2040        41        6           2323
##  8  2013     1     1     2128           2135        -7       26             50
##  9  2013     1     1     2134           2045        49       20           2352
## 10  2013     1     1     2136           2145        -9       25             39
## # ℹ 10,623 more rows
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## #   hour <dbl>, minute <dbl>, time_hour <dttm>

7. Pembersihan Data

flights_cleaned <- flights %>%
  distinct() %>% 
  filter(!is.na(arr_delay),
         !is.na(dep_delay),
         arr_delay > lower,
         arr_delay < upper, 
         arr_delay < 1000)

8. Ringkasan Dataset flights Setelah Dibersihkan

summary(flights_cleaned)
##       year          month             day           dep_time    sched_dep_time
##  Min.   :2013   Min.   : 1.000   Min.   : 1.00   Min.   :   1   Min.   : 500  
##  1st Qu.:2013   1st Qu.: 4.000   1st Qu.: 8.00   1st Qu.: 856   1st Qu.: 900  
##  Median :2013   Median : 7.000   Median :16.00   Median :1331   Median :1329  
##  Mean   :2013   Mean   : 6.583   Mean   :15.74   Mean   :1316   Mean   :1317  
##  3rd Qu.:2013   3rd Qu.:10.000   3rd Qu.:23.00   3rd Qu.:1719   3rd Qu.:1710  
##  Max.   :2013   Max.   :12.000   Max.   :31.00   Max.   :2400   Max.   :2359  
##    dep_delay         arr_time    sched_arr_time   arr_delay      
##  Min.   :-43.00   Min.   :   1   Min.   :   1   Min.   :-63.000  
##  1st Qu.: -5.00   1st Qu.:1102   1st Qu.:1112   1st Qu.:-18.000  
##  Median : -2.00   Median :1518   Median :1526   Median : -7.000  
##  Mean   :  3.28   Mean   :1498   Mean   :1513   Mean   : -3.686  
##  3rd Qu.:  5.00   3rd Qu.:1923   3rd Qu.:1927   3rd Qu.:  7.000  
##  Max.   :121.00   Max.   :2400   Max.   :2359   Max.   : 60.000  
##    carrier              flight       tailnum             origin         
##  Length:299466      Min.   :   1   Length:299466      Length:299466     
##  Class :character   1st Qu.: 537   Class :character   Class :character  
##  Mode  :character   Median :1454   Mode  :character   Mode  :character  
##                     Mean   :1913                                        
##                     3rd Qu.:3369                                        
##                     Max.   :6181                                        
##      dest              air_time        distance         hour      
##  Length:299466      Min.   : 20.0   Min.   :  80   Min.   : 5.00  
##  Class :character   1st Qu.: 83.0   1st Qu.: 509   1st Qu.: 9.00  
##  Mode  :character   Median :130.0   Median : 937   Median :13.00  
##                     Mean   :151.5   Mean   :1056   Mean   :12.91  
##                     3rd Qu.:193.0   3rd Qu.:1400   3rd Qu.:17.00  
##                     Max.   :691.0   Max.   :4983   Max.   :23.00  
##      minute        time_hour                     
##  Min.   : 0.00   Min.   :2013-01-01 05:00:00.00  
##  1st Qu.: 8.00   1st Qu.:2013-04-04 06:00:00.00  
##  Median :29.00   Median :2013-07-06 08:00:00.00  
##  Mean   :26.12   Mean   :2013-07-04 07:44:50.56  
##  3rd Qu.:44.00   3rd Qu.:2013-10-03 14:00:00.00  
##  Max.   :59.00   Max.   :2013-12-31 23:00:00.00
glimpse(flights_cleaned)
## Rows: 299,466
## Columns: 19
## $ year           <int> 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2…
## $ month          <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ day            <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ dep_time       <int> 517, 533, 542, 544, 554, 554, 555, 557, 557, 558, 558, …
## $ sched_dep_time <int> 515, 529, 540, 545, 600, 558, 600, 600, 600, 600, 600, …
## $ dep_delay      <dbl> 2, 4, 2, -1, -6, -4, -5, -3, -3, -2, -2, -2, -2, -2, -1…
## $ arr_time       <int> 830, 850, 923, 1004, 812, 740, 913, 709, 838, 753, 849,…
## $ sched_arr_time <int> 819, 830, 850, 1022, 837, 728, 854, 723, 846, 745, 851,…
## $ arr_delay      <dbl> 11, 20, 33, -18, -25, 12, 19, -14, -8, 8, -2, -3, 7, -1…
## $ carrier        <chr> "UA", "UA", "AA", "B6", "DL", "UA", "B6", "EV", "B6", "…
## $ flight         <int> 1545, 1714, 1141, 725, 461, 1696, 507, 5708, 79, 301, 4…
## $ tailnum        <chr> "N14228", "N24211", "N619AA", "N804JB", "N668DN", "N394…
## $ origin         <chr> "EWR", "LGA", "JFK", "JFK", "LGA", "EWR", "EWR", "LGA",…
## $ dest           <chr> "IAH", "IAH", "MIA", "BQN", "ATL", "ORD", "FLL", "IAD",…
## $ air_time       <dbl> 227, 227, 160, 183, 116, 150, 158, 53, 140, 138, 149, 1…
## $ distance       <dbl> 1400, 1416, 1089, 1576, 762, 719, 1065, 229, 944, 733, …
## $ hour           <dbl> 5, 5, 5, 5, 6, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 6, 6, 6…
## $ minute         <dbl> 15, 29, 40, 45, 0, 58, 0, 0, 0, 0, 0, 0, 0, 0, 0, 59, 0…
## $ time_hour      <dttm> 2013-01-01 05:00:00, 2013-01-01 05:00:00, 2013-01-01 0…
colSums(is.na(flights_cleaned))
##           year          month            day       dep_time sched_dep_time 
##              0              0              0              0              0 
##      dep_delay       arr_time sched_arr_time      arr_delay        carrier 
##              0              0              0              0              0 
##         flight        tailnum         origin           dest       air_time 
##              0              0              0              0              0 
##       distance           hour         minute      time_hour 
##              0              0              0              0

9. Boxplot Setelah Dataset flights Dibersihkan

ggplot(flights_cleaned, aes(y=arr_delay))+
  geom_boxplot(fill = 'purple') +
  labs(title = "Boxplot dataset 'flights' Setelah Dibersihkan", y="Arrival Delay (minutes)") +
  theme_light()

10. Kesimpulan

Telah dilakukan perbemisah terhadap dataset flights dari paket nycflights13, berarti dataset telah siap untuk dianalisis. Hasilnya, ditemukan beberapa missing values dan nilai duplikat telah diatasi, lalu outlier ekstrem (pada kolom arr_delay) telah dihapus menggunakan metode IQR. Hasil visualisasi menggunakan boxplot menunjukkan bahwa dataset khususnya distribusi arr_delay setelah dibersihkan menjadi lebih bersih. Selain itu, fungsi summary() dan glimpse() menunjukkan bahwa dataset sudah tidak memiliki missing values dan memiliki struktur yang rapi.