Dalam tugas kali ini, kami menggunakan dataset flights
dari paket nycflights13. Dataset ini berisikan informasi
mengenai penerbangan dari 3 bandara utama di NYC pada tahun 2013. Tujuan
dari laporan ini adalah untuk mengidentifikasi masalah yang ada pada
dataset dan membersihkannya.
library(nycflights13)
## Warning: package 'nycflights13' was built under R version 4.4.3
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.4.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
## Warning: package 'tidyr' was built under R version 4.4.3
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.4.3
flights dan menampilkan struktur
dan nilai dari flightsdata("flights")
glimpse(flights)
## Rows: 336,776
## Columns: 19
## $ year <int> 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2…
## $ month <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ day <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ dep_time <int> 517, 533, 542, 544, 554, 554, 555, 557, 557, 558, 558, …
## $ sched_dep_time <int> 515, 529, 540, 545, 600, 558, 600, 600, 600, 600, 600, …
## $ dep_delay <dbl> 2, 4, 2, -1, -6, -4, -5, -3, -3, -2, -2, -2, -2, -2, -1…
## $ arr_time <int> 830, 850, 923, 1004, 812, 740, 913, 709, 838, 753, 849,…
## $ sched_arr_time <int> 819, 830, 850, 1022, 837, 728, 854, 723, 846, 745, 851,…
## $ arr_delay <dbl> 11, 20, 33, -18, -25, 12, 19, -14, -8, 8, -2, -3, 7, -1…
## $ carrier <chr> "UA", "UA", "AA", "B6", "DL", "UA", "B6", "EV", "B6", "…
## $ flight <int> 1545, 1714, 1141, 725, 461, 1696, 507, 5708, 79, 301, 4…
## $ tailnum <chr> "N14228", "N24211", "N619AA", "N804JB", "N668DN", "N394…
## $ origin <chr> "EWR", "LGA", "JFK", "JFK", "LGA", "EWR", "EWR", "LGA",…
## $ dest <chr> "IAH", "IAH", "MIA", "BQN", "ATL", "ORD", "FLL", "IAD",…
## $ air_time <dbl> 227, 227, 160, 183, 116, 150, 158, 53, 140, 138, 149, 1…
## $ distance <dbl> 1400, 1416, 1089, 1576, 762, 719, 1065, 229, 944, 733, …
## $ hour <dbl> 5, 5, 5, 5, 6, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 6, 6, 6…
## $ minute <dbl> 15, 29, 40, 45, 0, 58, 0, 0, 0, 0, 0, 0, 0, 0, 0, 59, 0…
## $ time_hour <dttm> 2013-01-01 05:00:00, 2013-01-01 05:00:00, 2013-01-01 0…
flights %>%
summarise(across(everything(), ~ sum(is.na(.))))
## # A tibble: 1 × 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <int> <int> <int>
## 1 0 0 0 8255 0 8255 8713 0
## # ℹ 11 more variables: arr_delay <int>, carrier <int>, flight <int>,
## # tailnum <int>, origin <int>, dest <int>, air_time <int>, distance <int>,
## # hour <int>, minute <int>, time_hour <int>
flights %>%
duplicated() %>%
sum()
## [1] 0
flights %>% filter(duplicated(.))
## # A tibble: 0 × 19
## # ℹ 19 variables: year <int>, month <int>, day <int>, dep_time <int>,
## # sched_dep_time <int>, dep_delay <dbl>, arr_time <int>,
## # sched_arr_time <int>, arr_delay <dbl>, carrier <chr>, flight <int>,
## # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## # hour <dbl>, minute <dbl>, time_hour <dttm>
ggplot(flights, aes(y = arr_delay)) +
geom_boxplot(fill = "pink") +
labs(title = "Boxplot Arrival Delay (Sebelum Cleaning)", y = "Arrival Delay (minutes)") +
theme_light()
## Warning: Removed 9430 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
Q1 <- quantile(flights$arr_delay, 0.25, na.rm = TRUE)
Q3 <- quantile(flights$arr_delay, 0.75, na.rm = TRUE)
IQR <- Q3 - Q1
lower <- Q1 - 1.5 * IQR
upper <- Q3 + 1.5 * IQR
flights %>%
filter(arr_delay < lower | arr_delay > upper)
## # A tibble: 27,880 × 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 811 630 101 1047 830
## 2 2013 1 1 848 1835 853 1001 1950
## 3 2013 1 1 957 733 144 1056 853
## 4 2013 1 1 1114 900 134 1447 1222
## 5 2013 1 1 1120 944 96 1331 1213
## 6 2013 1 1 1255 1200 55 1451 1330
## 7 2013 1 1 1301 1150 71 1518 1345
## 8 2013 1 1 1337 1220 77 1649 1531
## 9 2013 1 1 1342 1320 22 1617 1504
## 10 2013 1 1 1400 1250 70 1645 1502
## # ℹ 27,870 more rows
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## # hour <dbl>, minute <dbl>, time_hour <dttm>
flights %>%
filter(!is.na(arr_time), !is.na(dep_time)) %>%
filter(arr_time < dep_time)
## # A tibble: 10,633 × 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 1929 1920 9 3 7
## 2 2013 1 1 1939 1840 59 29 2151
## 3 2013 1 1 2058 2100 -2 8 2359
## 4 2013 1 1 2102 2108 -6 146 158
## 5 2013 1 1 2108 2057 11 25 39
## 6 2013 1 1 2120 2130 -10 16 18
## 7 2013 1 1 2121 2040 41 6 2323
## 8 2013 1 1 2128 2135 -7 26 50
## 9 2013 1 1 2134 2045 49 20 2352
## 10 2013 1 1 2136 2145 -9 25 39
## # ℹ 10,623 more rows
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## # hour <dbl>, minute <dbl>, time_hour <dttm>
flights_cleaned <- flights %>%
distinct() %>%
filter(!is.na(arr_delay),
!is.na(dep_delay),
arr_delay > lower,
arr_delay < upper,
arr_delay < 1000)
flights Setelah Dibersihkansummary(flights_cleaned)
## year month day dep_time sched_dep_time
## Min. :2013 Min. : 1.000 Min. : 1.00 Min. : 1 Min. : 500
## 1st Qu.:2013 1st Qu.: 4.000 1st Qu.: 8.00 1st Qu.: 856 1st Qu.: 900
## Median :2013 Median : 7.000 Median :16.00 Median :1331 Median :1329
## Mean :2013 Mean : 6.583 Mean :15.74 Mean :1316 Mean :1317
## 3rd Qu.:2013 3rd Qu.:10.000 3rd Qu.:23.00 3rd Qu.:1719 3rd Qu.:1710
## Max. :2013 Max. :12.000 Max. :31.00 Max. :2400 Max. :2359
## dep_delay arr_time sched_arr_time arr_delay
## Min. :-43.00 Min. : 1 Min. : 1 Min. :-63.000
## 1st Qu.: -5.00 1st Qu.:1102 1st Qu.:1112 1st Qu.:-18.000
## Median : -2.00 Median :1518 Median :1526 Median : -7.000
## Mean : 3.28 Mean :1498 Mean :1513 Mean : -3.686
## 3rd Qu.: 5.00 3rd Qu.:1923 3rd Qu.:1927 3rd Qu.: 7.000
## Max. :121.00 Max. :2400 Max. :2359 Max. : 60.000
## carrier flight tailnum origin
## Length:299466 Min. : 1 Length:299466 Length:299466
## Class :character 1st Qu.: 537 Class :character Class :character
## Mode :character Median :1454 Mode :character Mode :character
## Mean :1913
## 3rd Qu.:3369
## Max. :6181
## dest air_time distance hour
## Length:299466 Min. : 20.0 Min. : 80 Min. : 5.00
## Class :character 1st Qu.: 83.0 1st Qu.: 509 1st Qu.: 9.00
## Mode :character Median :130.0 Median : 937 Median :13.00
## Mean :151.5 Mean :1056 Mean :12.91
## 3rd Qu.:193.0 3rd Qu.:1400 3rd Qu.:17.00
## Max. :691.0 Max. :4983 Max. :23.00
## minute time_hour
## Min. : 0.00 Min. :2013-01-01 05:00:00.00
## 1st Qu.: 8.00 1st Qu.:2013-04-04 06:00:00.00
## Median :29.00 Median :2013-07-06 08:00:00.00
## Mean :26.12 Mean :2013-07-04 07:44:50.56
## 3rd Qu.:44.00 3rd Qu.:2013-10-03 14:00:00.00
## Max. :59.00 Max. :2013-12-31 23:00:00.00
glimpse(flights_cleaned)
## Rows: 299,466
## Columns: 19
## $ year <int> 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2…
## $ month <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ day <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ dep_time <int> 517, 533, 542, 544, 554, 554, 555, 557, 557, 558, 558, …
## $ sched_dep_time <int> 515, 529, 540, 545, 600, 558, 600, 600, 600, 600, 600, …
## $ dep_delay <dbl> 2, 4, 2, -1, -6, -4, -5, -3, -3, -2, -2, -2, -2, -2, -1…
## $ arr_time <int> 830, 850, 923, 1004, 812, 740, 913, 709, 838, 753, 849,…
## $ sched_arr_time <int> 819, 830, 850, 1022, 837, 728, 854, 723, 846, 745, 851,…
## $ arr_delay <dbl> 11, 20, 33, -18, -25, 12, 19, -14, -8, 8, -2, -3, 7, -1…
## $ carrier <chr> "UA", "UA", "AA", "B6", "DL", "UA", "B6", "EV", "B6", "…
## $ flight <int> 1545, 1714, 1141, 725, 461, 1696, 507, 5708, 79, 301, 4…
## $ tailnum <chr> "N14228", "N24211", "N619AA", "N804JB", "N668DN", "N394…
## $ origin <chr> "EWR", "LGA", "JFK", "JFK", "LGA", "EWR", "EWR", "LGA",…
## $ dest <chr> "IAH", "IAH", "MIA", "BQN", "ATL", "ORD", "FLL", "IAD",…
## $ air_time <dbl> 227, 227, 160, 183, 116, 150, 158, 53, 140, 138, 149, 1…
## $ distance <dbl> 1400, 1416, 1089, 1576, 762, 719, 1065, 229, 944, 733, …
## $ hour <dbl> 5, 5, 5, 5, 6, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 6, 6, 6…
## $ minute <dbl> 15, 29, 40, 45, 0, 58, 0, 0, 0, 0, 0, 0, 0, 0, 0, 59, 0…
## $ time_hour <dttm> 2013-01-01 05:00:00, 2013-01-01 05:00:00, 2013-01-01 0…
colSums(is.na(flights_cleaned))
## year month day dep_time sched_dep_time
## 0 0 0 0 0
## dep_delay arr_time sched_arr_time arr_delay carrier
## 0 0 0 0 0
## flight tailnum origin dest air_time
## 0 0 0 0 0
## distance hour minute time_hour
## 0 0 0 0
flights Dibersihkanggplot(flights_cleaned, aes(y=arr_delay))+
geom_boxplot(fill = 'purple') +
labs(title = "Boxplot dataset 'flights' Setelah Dibersihkan", y="Arrival Delay (minutes)") +
theme_light()
Telah dilakukan perbemisah terhadap dataset flights dari
paket nycflights13, berarti dataset telah siap untuk
dianalisis. Hasilnya, ditemukan beberapa missing values dan nilai
duplikat telah diatasi, lalu outlier ekstrem (pada kolom
arr_delay) telah dihapus menggunakan metode IQR. Hasil
visualisasi menggunakan boxplot menunjukkan bahwa dataset khususnya
distribusi arr_delay setelah dibersihkan menjadi lebih
bersih. Selain itu, fungsi summary() dan
glimpse() menunjukkan bahwa dataset sudah tidak memiliki
missing values dan memiliki struktur yang rapi.