data("flights")
glimpse(flights)
## Rows: 336,776
## Columns: 19
## $ year <int> 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2…
## $ month <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ day <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ dep_time <int> 517, 533, 542, 544, 554, 554, 555, 557, 557, 558, 558, …
## $ sched_dep_time <int> 515, 529, 540, 545, 600, 558, 600, 600, 600, 600, 600, …
## $ dep_delay <dbl> 2, 4, 2, -1, -6, -4, -5, -3, -3, -2, -2, -2, -2, -2, -1…
## $ arr_time <int> 830, 850, 923, 1004, 812, 740, 913, 709, 838, 753, 849,…
## $ sched_arr_time <int> 819, 830, 850, 1022, 837, 728, 854, 723, 846, 745, 851,…
## $ arr_delay <dbl> 11, 20, 33, -18, -25, 12, 19, -14, -8, 8, -2, -3, 7, -1…
## $ carrier <chr> "UA", "UA", "AA", "B6", "DL", "UA", "B6", "EV", "B6", "…
## $ flight <int> 1545, 1714, 1141, 725, 461, 1696, 507, 5708, 79, 301, 4…
## $ tailnum <chr> "N14228", "N24211", "N619AA", "N804JB", "N668DN", "N394…
## $ origin <chr> "EWR", "LGA", "JFK", "JFK", "LGA", "EWR", "EWR", "LGA",…
## $ dest <chr> "IAH", "IAH", "MIA", "BQN", "ATL", "ORD", "FLL", "IAD",…
## $ air_time <dbl> 227, 227, 160, 183, 116, 150, 158, 53, 140, 138, 149, 1…
## $ distance <dbl> 1400, 1416, 1089, 1576, 762, 719, 1065, 229, 944, 733, …
## $ hour <dbl> 5, 5, 5, 5, 6, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 6, 6, 6…
## $ minute <dbl> 15, 29, 40, 45, 0, 58, 0, 0, 0, 0, 0, 0, 0, 0, 0, 59, 0…
## $ time_hour <dttm> 2013-01-01 05:00:00, 2013-01-01 05:00:00, 2013-01-01 0…
flights %>%
summarise(across(everything(), ~ sum(is.na(.))))
## # A tibble: 1 × 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <int> <int> <int>
## 1 0 0 0 8255 0 8255 8713 0
## # ℹ 11 more variables: arr_delay <int>, carrier <int>, flight <int>,
## # tailnum <int>, origin <int>, dest <int>, air_time <int>, distance <int>,
## # hour <int>, minute <int>, time_hour <int>
flights %>%
filter(is.na(dep_time))
## # A tibble: 8,255 × 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 NA 1630 NA NA 1815
## 2 2013 1 1 NA 1935 NA NA 2240
## 3 2013 1 1 NA 1500 NA NA 1825
## 4 2013 1 1 NA 600 NA NA 901
## 5 2013 1 2 NA 1540 NA NA 1747
## 6 2013 1 2 NA 1620 NA NA 1746
## 7 2013 1 2 NA 1355 NA NA 1459
## 8 2013 1 2 NA 1420 NA NA 1644
## 9 2013 1 2 NA 1321 NA NA 1536
## 10 2013 1 2 NA 1545 NA NA 1910
## # ℹ 8,245 more rows
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## # hour <dbl>, minute <dbl>, time_hour <dttm>
terdapat banyak data yang merupakan NA salah satunya dep_time yang memiliki 8255 nilai NA
flights = flights %>%
drop_na()
flights %>%
duplicated() %>%
sum()
## [1] 0
flights %>% filter(duplicated(.))
## # A tibble: 0 × 19
## # ℹ 19 variables: year <int>, month <int>, day <int>, dep_time <int>,
## # sched_dep_time <int>, dep_delay <dbl>, arr_time <int>,
## # sched_arr_time <int>, arr_delay <dbl>, carrier <chr>, flight <int>,
## # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## # hour <dbl>, minute <dbl>, time_hour <dttm>
tidak ada data yang duplikat
ggplot(flights, aes(x = origin, y = distance, fill = origin)) +
geom_boxplot(outlier.color = 'purple')
terlihat beberapa outlier pada variabel distance di origin EWR dan JFK
flights_JFK = flights %>%
filter(origin == "JFK")
Q1 = quantile(flights_JFK$distance, 0.25)
Q3 = quantile(flights_JFK$distance, 0.75)
IQR = Q3 - Q1
outlier_JFK = flights_JFK %>%
filter(distance < (Q1 - 1.5 * IQR) |
distance > (Q3 + 1.5 * IQR))
outlier_JFK
## # A tibble: 342 × 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 857 900 -3 1516 1530
## 2 2013 1 2 909 900 9 1525 1530
## 3 2013 1 3 914 900 14 1504 1530
## 4 2013 1 4 900 900 0 1516 1530
## 5 2013 1 5 858 900 -2 1519 1530
## 6 2013 1 6 1019 900 79 1558 1530
## 7 2013 1 7 1042 900 102 1620 1530
## 8 2013 1 8 901 900 1 1504 1530
## 9 2013 1 9 641 900 1301 1242 1530
## 10 2013 1 10 859 900 -1 1449 1530
## # ℹ 332 more rows
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## # hour <dbl>, minute <dbl>, time_hour <dttm>
Ternyata jarak dari JFK ke HNL memang jauh sehingga dibiarkan
flights_EWR = flights %>%
filter(origin == "EWR")
Q1 = quantile(flights_EWR$distance, 0.25)
Q3 = quantile(flights_EWR$distance, 0.75)
IQR = Q3 - Q1
outlier_EWR = flights_EWR %>%
filter(distance < (Q1 - 1.5 * IQR) |
distance > (Q3 + 1.5 * IQR))
outlier_EWR
## # A tibble: 367 × 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 1344 1344 0 2005 1944
## 2 2013 1 2 1344 1344 0 1940 1944
## 3 2013 1 3 1418 1341 37 2006 1935
## 4 2013 1 4 1343 1341 2 1932 1935
## 5 2013 1 5 1329 1335 -6 1850 1935
## 6 2013 1 6 1353 1341 12 1927 1941
## 7 2013 1 7 1340 1341 -1 1925 1935
## 8 2013 1 8 1344 1341 3 1951 1935
## 9 2013 1 9 1340 1341 -1 2019 1935
## 10 2013 1 10 1342 1341 1 1935 1935
## # ℹ 357 more rows
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## # hour <dbl>, minute <dbl>, time_hour <dttm>
Ternyata jarak dari EWR ke HNL dan ANC memang jauh sehingga dibiarkan
flights_LGA = flights %>%
filter(origin == "LGA")
Q1 = quantile(flights_LGA$distance, 0.25)
Q3 = quantile(flights_LGA$distance, 0.75)
IQR = Q3 - Q1
outlier_LGA = flights_LGA %>%
filter(distance < (Q1 - 1.5 * IQR) |
distance > (Q3 + 1.5 * IQR))
outlier_LGA
## # A tibble: 0 × 19
## # ℹ 19 variables: year <int>, month <int>, day <int>, dep_time <int>,
## # sched_dep_time <int>, dep_delay <dbl>, arr_time <int>,
## # sched_arr_time <int>, arr_delay <dbl>, carrier <chr>, flight <int>,
## # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## # hour <dbl>, minute <dbl>, time_hour <dttm>
Di LGA tidak ada outlier
flights %>% count(origin)
## # A tibble: 3 × 2
## origin n
## <chr> <int>
## 1 EWR 117127
## 2 JFK 109079
## 3 LGA 101140
flights %>% count(dest)
## # A tibble: 104 × 2
## dest n
## <chr> <int>
## 1 ABQ 254
## 2 ACK 264
## 3 ALB 418
## 4 ANC 8
## 5 ATL 16837
## 6 AUS 2411
## 7 AVL 261
## 8 BDL 412
## 9 BGR 358
## 10 BHM 269
## # ℹ 94 more rows
Semua merupakan kode 3-huruf bandara, konsisten
flights %>% summarise(across(everything(), class))
## Warning: Returning more (or less) than 1 row per `summarise()` group was deprecated in
## dplyr 1.1.0.
## ℹ Please use `reframe()` instead.
## ℹ When switching from `summarise()` to `reframe()`, remember that `reframe()`
## always returns an ungrouped data frame and adjust accordingly.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## # A tibble: 2 × 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 integer integ… inte… integer integer numeric integer integer
## 2 integer integ… inte… integer integer numeric integer integer
## # ℹ 11 more variables: arr_delay <chr>, carrier <chr>, flight <chr>,
## # tailnum <chr>, origin <chr>, dest <chr>, air_time <chr>, distance <chr>,
## # hour <chr>, minute <chr>, time_hour <chr>
setelah membaca di internet ternyata flight merupakan kode seri dari pesawat, sehingga seharusnya diganti dari integer menjadi character
flights <- flights %>%
mutate(flight = as.character(flight))
flights %>% summarise(across(everything(), class))
## Warning: Returning more (or less) than 1 row per `summarise()` group was deprecated in
## dplyr 1.1.0.
## ℹ Please use `reframe()` instead.
## ℹ When switching from `summarise()` to `reframe()`, remember that `reframe()`
## always returns an ungrouped data frame and adjust accordingly.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## # A tibble: 2 × 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 integer integ… inte… integer integer numeric integer integer
## 2 integer integ… inte… integer integer numeric integer integer
## # ℹ 11 more variables: arr_delay <chr>, carrier <chr>, flight <chr>,
## # tailnum <chr>, origin <chr>, dest <chr>, air_time <chr>, distance <chr>,
## # hour <chr>, minute <chr>, time_hour <chr>
flights %>%
filter(distance < 0 | air_time < 0)
## # A tibble: 0 × 19
## # ℹ 19 variables: year <int>, month <int>, day <int>, dep_time <int>,
## # sched_dep_time <int>, dep_delay <dbl>, arr_time <int>,
## # sched_arr_time <int>, arr_delay <dbl>, carrier <chr>, flight <chr>,
## # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## # hour <dbl>, minute <dbl>, time_hour <dttm>
di distance dan air_time tidak ada data yang bernilai negatif, sehingga datanya masuk akal
summary(flights)
## year month day dep_time sched_dep_time
## Min. :2013 Min. : 1.000 Min. : 1.00 Min. : 1 Min. : 500
## 1st Qu.:2013 1st Qu.: 4.000 1st Qu.: 8.00 1st Qu.: 907 1st Qu.: 905
## Median :2013 Median : 7.000 Median :16.00 Median :1400 Median :1355
## Mean :2013 Mean : 6.565 Mean :15.74 Mean :1349 Mean :1340
## 3rd Qu.:2013 3rd Qu.:10.000 3rd Qu.:23.00 3rd Qu.:1744 3rd Qu.:1729
## Max. :2013 Max. :12.000 Max. :31.00 Max. :2400 Max. :2359
## dep_delay arr_time sched_arr_time arr_delay
## Min. : -43.00 Min. : 1 Min. : 1 Min. : -86.000
## 1st Qu.: -5.00 1st Qu.:1104 1st Qu.:1122 1st Qu.: -17.000
## Median : -2.00 Median :1535 Median :1554 Median : -5.000
## Mean : 12.56 Mean :1502 Mean :1533 Mean : 6.895
## 3rd Qu.: 11.00 3rd Qu.:1940 3rd Qu.:1944 3rd Qu.: 14.000
## Max. :1301.00 Max. :2400 Max. :2359 Max. :1272.000
## carrier flight tailnum origin
## Length:327346 Length:327346 Length:327346 Length:327346
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## dest air_time distance hour
## Length:327346 Min. : 20.0 Min. : 80 Min. : 5.00
## Class :character 1st Qu.: 82.0 1st Qu.: 509 1st Qu.: 9.00
## Mode :character Median :129.0 Median : 888 Median :13.00
## Mean :150.7 Mean :1048 Mean :13.14
## 3rd Qu.:192.0 3rd Qu.:1389 3rd Qu.:17.00
## Max. :695.0 Max. :4983 Max. :23.00
## minute time_hour
## Min. : 0.00 Min. :2013-01-01 05:00:00.00
## 1st Qu.: 8.00 1st Qu.:2013-04-05 06:00:00.00
## Median :29.00 Median :2013-07-04 09:00:00.00
## Mean :26.23 Mean :2013-07-03 17:56:45.44
## 3rd Qu.:44.00 3rd Qu.:2013-10-01 18:00:00.00
## Max. :59.00 Max. :2013-12-31 23:00:00.00