Anggota Kelompok:
#data flights
library(nycflights13)
# Print a compact, column-wise preview of the flights data.
flights %>%
  glimpse()
## Rows: 336,776
## Columns: 19
## $ year <int> 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2…
## $ month <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ day <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ dep_time <int> 517, 533, 542, 544, 554, 554, 555, 557, 557, 558, 558, …
## $ sched_dep_time <int> 515, 529, 540, 545, 600, 558, 600, 600, 600, 600, 600, …
## $ dep_delay <dbl> 2, 4, 2, -1, -6, -4, -5, -3, -3, -2, -2, -2, -2, -2, -1…
## $ arr_time <int> 830, 850, 923, 1004, 812, 740, 913, 709, 838, 753, 849,…
## $ sched_arr_time <int> 819, 830, 850, 1022, 837, 728, 854, 723, 846, 745, 851,…
## $ arr_delay <dbl> 11, 20, 33, -18, -25, 12, 19, -14, -8, 8, -2, -3, 7, -1…
## $ carrier <chr> "UA", "UA", "AA", "B6", "DL", "UA", "B6", "EV", "B6", "…
## $ flight <int> 1545, 1714, 1141, 725, 461, 1696, 507, 5708, 79, 301, 4…
## $ tailnum <chr> "N14228", "N24211", "N619AA", "N804JB", "N668DN", "N394…
## $ origin <chr> "EWR", "LGA", "JFK", "JFK", "LGA", "EWR", "EWR", "LGA",…
## $ dest <chr> "IAH", "IAH", "MIA", "BQN", "ATL", "ORD", "FLL", "IAD",…
## $ air_time <dbl> 227, 227, 160, 183, 116, 150, 158, 53, 140, 138, 149, 1…
## $ distance <dbl> 1400, 1416, 1089, 1576, 762, 719, 1065, 229, 944, 733, …
## $ hour <dbl> 5, 5, 5, 5, 6, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 6, 6, 6…
## $ minute <dbl> 15, 29, 40, 45, 0, 58, 0, 0, 0, 0, 0, 0, 0, 0, 0, 59, 0…
## $ time_hour <dttm> 2013-01-01 05:00:00, 2013-01-01 05:00:00, 2013-01-01 0…
# Count the missing values (NA) in every column of flights.
summarise(
  flights,
  across(everything(), function(col) sum(is.na(col)))
)
## # A tibble: 1 × 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <int> <int> <int>
## 1 0 0 0 8255 0 8255 8713 0
## # ℹ 11 more variables: arr_delay <int>, carrier <int>, flight <int>,
## # tailnum <int>, origin <int>, dest <int>, air_time <int>, distance <int>,
## # hour <int>, minute <int>, time_hour <int>
Data yang memiliki missing values: dep_time,
dep_delay, arr_time, arr_delay,
tailnum, dan air_time.
# Number of rows that are exact duplicates of an earlier row.
sum(duplicated(flights))
## [1] 0
Tidak terdapat data yang terduplikasi
Mendeteksi outliers pada data bertipe numerik
# Boxplot of dep_delay to look for extreme values.
flights %>%
  ggplot(mapping = aes(y = dep_delay)) +
  geom_boxplot()
## Warning: Removed 8255 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
Terdapat outliers pada data
dep_delay. Namun, nilai-nilai
ekstrem pada dep_delay tetap relevan dengan kondisi dalam
dunia nyata dan tidak menunjukkan indikasi kesalahan input sehingga data
tetap dipertahankan.
# Boxplot of arr_delay to look for extreme values.
flights %>%
  ggplot(mapping = aes(y = arr_delay)) +
  geom_boxplot()
## Warning: Removed 9430 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
Terdapat outliers pada data
arr_delay. Namun, nilai-nilai
ekstrem pada arr_delay tetap relevan dengan kondisi dalam
dunia nyata dan tidak menunjukkan indikasi kesalahan input sehingga data
tetap dipertahankan.
# Boxplot of air_time to look for extreme values.
flights %>%
  ggplot(mapping = aes(y = air_time)) +
  geom_boxplot()
## Warning: Removed 9430 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
Terdapat outliers pada data
air_time. Namun, nilai-nilai
ekstrem pada air_time tetap relevan dengan kondisi dalam
dunia nyata dan tidak menunjukkan indikasi kesalahan input sehingga data
tetap dipertahankan.
# Boxplot of distance to look for extreme values.
flights %>%
  ggplot(mapping = aes(y = distance)) +
  geom_boxplot()
Terdapat outliers pada data
distance. Namun, nilai-nilai
ekstrem pada distance tetap relevan dengan kondisi dalam
dunia nyata dan tidak menunjukkan indikasi kesalahan input sehingga data
tetap dipertahankan.
# Boxplot of hour to look for extreme values.
flights %>%
  ggplot(mapping = aes(y = hour)) +
  geom_boxplot()
Tidak terdapat outlier pada data
hour.
# Boxplot of minute to look for extreme values.
flights %>%
  ggplot(mapping = aes(y = minute)) +
  geom_boxplot()
Tidak terdapat outlier pada data
minute.
Mendeteksi inconsistent categories pada data bertipe string
# Frequency of each distinct carrier value, to spot inconsistent spellings.
count(flights, carrier)
## # A tibble: 16 × 2
## carrier n
## <chr> <int>
## 1 9E 18460
## 2 AA 32729
## 3 AS 714
## 4 B6 54635
## 5 DL 48110
## 6 EV 54173
## 7 F9 685
## 8 FL 3260
## 9 HA 342
## 10 MQ 26397
## 11 OO 32
## 12 UA 58665
## 13 US 20536
## 14 VX 5162
## 15 WN 12275
## 16 YV 601
Tidak terdapat inconsistent category pada data
carrier.
# Frequency of each distinct tailnum value, to spot inconsistent spellings.
count(flights, tailnum)
## # A tibble: 4,044 × 2
## tailnum n
## <chr> <int>
## 1 D942DN 4
## 2 N0EGMQ 371
## 3 N10156 153
## 4 N102UW 48
## 5 N103US 46
## 6 N104UW 47
## 7 N10575 289
## 8 N105UW 45
## 9 N107US 41
## 10 N108UW 60
## # ℹ 4,034 more rows
Tidak terdapat inconsistent category pada data
tailnum.
# Frequency of each distinct origin value, to spot inconsistent spellings.
count(flights, origin)
## # A tibble: 3 × 2
## origin n
## <chr> <int>
## 1 EWR 120835
## 2 JFK 111279
## 3 LGA 104662
Tidak terdapat inconsistent category pada data
origin.
# Frequency of each distinct dest value, to spot inconsistent spellings.
count(flights, dest)
## # A tibble: 105 × 2
## dest n
## <chr> <int>
## 1 ABQ 254
## 2 ACK 265
## 3 ALB 439
## 4 ANC 8
## 5 ATL 17215
## 6 AUS 2439
## 7 AVL 275
## 8 BDL 443
## 9 BGR 375
## 10 BHM 297
## # ℹ 95 more rows
Tidak terdapat inconsistent category pada data dest.
# Number of flights recorded for each distinct time_hour value.
count(flights, time_hour)
## # A tibble: 6,936 × 2
## time_hour n
## <dttm> <int>
## 1 2013-01-01 05:00:00 6
## 2 2013-01-01 06:00:00 52
## 3 2013-01-01 07:00:00 49
## 4 2013-01-01 08:00:00 58
## 5 2013-01-01 09:00:00 56
## 6 2013-01-01 10:00:00 39
## 7 2013-01-01 11:00:00 37
## 8 2013-01-01 12:00:00 56
## 9 2013-01-01 13:00:00 54
## 10 2013-01-01 14:00:00 48
## # ℹ 6,926 more rows
Tidak terdapat inconsistent category pada data
time_hour.
# Report the class vector of every column. A column may carry more than one
# class (the <dttm> column time_hour does), so the result has more than one
# row and shorter class vectors are recycled to match. reframe() is the dplyr
# verb for multi-row results; summarise() returning more or less than one row
# per group is deprecated since dplyr 1.1.0.
flights %>%
  reframe(across(everything(), class))
## # A tibble: 2 × 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 integer integ… inte… integer integer numeric integer integer
## 2 integer integ… inte… integer integer numeric integer integer
## # ℹ 11 more variables: arr_delay <chr>, carrier <chr>, flight <chr>,
## # tailnum <chr>, origin <chr>, dest <chr>, air_time <chr>, distance <chr>,
## # hour <chr>, minute <chr>, time_hour <chr>
Terdapat incorrect data types pada kolom flight (bertipe integer) yang
seharusnya bertipe data character karena flight
merupakan kode identifikasi penerbangan dan bukan merupakan angka untuk
dihitung.
# Rows whose distance is negative (a logically impossible value).
filter(flights, distance < 0)
## # A tibble: 0 × 19
## # ℹ 19 variables: year <int>, month <int>, day <int>, dep_time <int>,
## # sched_dep_time <int>, dep_delay <dbl>, arr_time <int>,
## # sched_arr_time <int>, arr_delay <dbl>, carrier <chr>, flight <int>,
## # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## # hour <dbl>, minute <dbl>, time_hour <dttm>
Tidak terdapat logical inconsistencies pada data distance (tidak ada nilai negatif).
# Rows whose air_time is negative (a logically impossible value).
filter(flights, air_time < 0)
## # A tibble: 0 × 19
## # ℹ 19 variables: year <int>, month <int>, day <int>, dep_time <int>,
## # sched_dep_time <int>, dep_delay <dbl>, arr_time <int>,
## # sched_arr_time <int>, arr_delay <dbl>, carrier <chr>, flight <int>,
## # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## # hour <dbl>, minute <dbl>, time_hour <dttm>
Tidak terdapat logical inconsistencies pada data air_time (tidak ada nilai negatif).
Menghapus data yang memiliki missing value dan mengubah tipe data flight menjadi character
# Store flight as character, drop every row that has an NA in any column,
# then print summary statistics of the cleaned data.
flights_cleaned <- flights %>%
  mutate(flight = as.character(flight)) %>%
  drop_na()
summary(flights_cleaned)
## year month day dep_time sched_dep_time
## Min. :2013 Min. : 1.000 Min. : 1.00 Min. : 1 Min. : 500
## 1st Qu.:2013 1st Qu.: 4.000 1st Qu.: 8.00 1st Qu.: 907 1st Qu.: 905
## Median :2013 Median : 7.000 Median :16.00 Median :1400 Median :1355
## Mean :2013 Mean : 6.565 Mean :15.74 Mean :1349 Mean :1340
## 3rd Qu.:2013 3rd Qu.:10.000 3rd Qu.:23.00 3rd Qu.:1744 3rd Qu.:1729
## Max. :2013 Max. :12.000 Max. :31.00 Max. :2400 Max. :2359
## dep_delay arr_time sched_arr_time arr_delay
## Min. : -43.00 Min. : 1 Min. : 1 Min. : -86.000
## 1st Qu.: -5.00 1st Qu.:1104 1st Qu.:1122 1st Qu.: -17.000
## Median : -2.00 Median :1535 Median :1554 Median : -5.000
## Mean : 12.56 Mean :1502 Mean :1533 Mean : 6.895
## 3rd Qu.: 11.00 3rd Qu.:1940 3rd Qu.:1944 3rd Qu.: 14.000
## Max. :1301.00 Max. :2400 Max. :2359 Max. :1272.000
## carrier flight tailnum origin
## Length:327346 Length:327346 Length:327346 Length:327346
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## dest air_time distance hour
## Length:327346 Min. : 20.0 Min. : 80 Min. : 5.00
## Class :character 1st Qu.: 82.0 1st Qu.: 509 1st Qu.: 9.00
## Mode :character Median :129.0 Median : 888 Median :13.00
## Mean :150.7 Mean :1048 Mean :13.14
## 3rd Qu.:192.0 3rd Qu.:1389 3rd Qu.:17.00
## Max. :695.0 Max. :4983 Max. :23.00
## minute time_hour
## Min. : 0.00 Min. :2013-01-01 05:00:00
## 1st Qu.: 8.00 1st Qu.:2013-04-05 06:00:00
## Median :29.00 Median :2013-07-04 09:00:00
## Mean :26.23 Mean :2013-07-03 17:56:45
## 3rd Qu.:44.00 3rd Qu.:2013-10-01 18:00:00
## Max. :59.00 Max. :2013-12-31 23:00:00