library(nycflights13)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ readr 2.1.5
## ✔ ggplot2 3.5.2 ✔ stringr 1.5.1
## ✔ lubridate 1.9.4 ✔ tibble 3.2.1
## ✔ purrr 1.0.4 ✔ tidyr 1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Load data
data(flights)
#menampilkan 6 baris pertama
head(flights)
## # A tibble: 6 × 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 517 515 2 830 819
## 2 2013 1 1 533 529 4 850 830
## 3 2013 1 1 542 540 2 923 850
## 4 2013 1 1 544 545 -1 1004 1022
## 5 2013 1 1 554 600 -6 812 837
## 6 2013 1 1 554 558 -4 740 728
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## # hour <dbl>, minute <dbl>, time_hour <dttm>
#cek struktur
str(flights)
## tibble [336,776 × 19] (S3: tbl_df/tbl/data.frame)
## $ year : int [1:336776] 2013 2013 2013 2013 2013 2013 2013 2013 2013 2013 ...
## $ month : int [1:336776] 1 1 1 1 1 1 1 1 1 1 ...
## $ day : int [1:336776] 1 1 1 1 1 1 1 1 1 1 ...
## $ dep_time : int [1:336776] 517 533 542 544 554 554 555 557 557 558 ...
## $ sched_dep_time: int [1:336776] 515 529 540 545 600 558 600 600 600 600 ...
## $ dep_delay : num [1:336776] 2 4 2 -1 -6 -4 -5 -3 -3 -2 ...
## $ arr_time : int [1:336776] 830 850 923 1004 812 740 913 709 838 753 ...
## $ sched_arr_time: int [1:336776] 819 830 850 1022 837 728 854 723 846 745 ...
## $ arr_delay : num [1:336776] 11 20 33 -18 -25 12 19 -14 -8 8 ...
## $ carrier : chr [1:336776] "UA" "UA" "AA" "B6" ...
## $ flight : int [1:336776] 1545 1714 1141 725 461 1696 507 5708 79 301 ...
## $ tailnum : chr [1:336776] "N14228" "N24211" "N619AA" "N804JB" ...
## $ origin : chr [1:336776] "EWR" "LGA" "JFK" "JFK" ...
## $ dest : chr [1:336776] "IAH" "IAH" "MIA" "BQN" ...
## $ air_time : num [1:336776] 227 227 160 183 116 150 158 53 140 138 ...
## $ distance : num [1:336776] 1400 1416 1089 1576 762 ...
## $ hour : num [1:336776] 5 5 5 5 6 5 6 6 6 6 ...
## $ minute : num [1:336776] 15 29 40 45 0 58 0 0 0 0 ...
## $ time_hour : POSIXct[1:336776], format: "2013-01-01 05:00:00" "2013-01-01 05:00:00" ...
#cek ringkasan statistik
summary(flights)
## year month day dep_time sched_dep_time
## Min. :2013 Min. : 1.000 Min. : 1.00 Min. : 1 Min. : 106
## 1st Qu.:2013 1st Qu.: 4.000 1st Qu.: 8.00 1st Qu.: 907 1st Qu.: 906
## Median :2013 Median : 7.000 Median :16.00 Median :1401 Median :1359
## Mean :2013 Mean : 6.549 Mean :15.71 Mean :1349 Mean :1344
## 3rd Qu.:2013 3rd Qu.:10.000 3rd Qu.:23.00 3rd Qu.:1744 3rd Qu.:1729
## Max. :2013 Max. :12.000 Max. :31.00 Max. :2400 Max. :2359
## NA's :8255
## dep_delay arr_time sched_arr_time arr_delay
## Min. : -43.00 Min. : 1 Min. : 1 Min. : -86.000
## 1st Qu.: -5.00 1st Qu.:1104 1st Qu.:1124 1st Qu.: -17.000
## Median : -2.00 Median :1535 Median :1556 Median : -5.000
## Mean : 12.64 Mean :1502 Mean :1536 Mean : 6.895
## 3rd Qu.: 11.00 3rd Qu.:1940 3rd Qu.:1945 3rd Qu.: 14.000
## Max. :1301.00 Max. :2400 Max. :2359 Max. :1272.000
## NA's :8255 NA's :8713 NA's :9430
## carrier flight tailnum origin
## Length:336776 Min. : 1 Length:336776 Length:336776
## Class :character 1st Qu.: 553 Class :character Class :character
## Mode :character Median :1496 Mode :character Mode :character
## Mean :1972
## 3rd Qu.:3465
## Max. :8500
##
## dest air_time distance hour
## Length:336776 Min. : 20.0 Min. : 17 Min. : 1.00
## Class :character 1st Qu.: 82.0 1st Qu.: 502 1st Qu.: 9.00
## Mode :character Median :129.0 Median : 872 Median :13.00
## Mean :150.7 Mean :1040 Mean :13.18
## 3rd Qu.:192.0 3rd Qu.:1389 3rd Qu.:17.00
## Max. :695.0 Max. :4983 Max. :23.00
## NA's :9430
## minute time_hour
## Min. : 0.00 Min. :2013-01-01 05:00:00
## 1st Qu.: 8.00 1st Qu.:2013-04-04 13:00:00
## Median :29.00 Median :2013-07-03 10:00:00
## Mean :26.23 Mean :2013-07-03 05:22:54
## 3rd Qu.:44.00 3rd Qu.:2013-10-01 07:00:00
## Max. :59.00 Max. :2013-12-31 23:00:00
##
#cek dimensi
dim(flights)
## [1] 336776 19
# Hapus baris error: dep_time TIDAK NA tapi arr_time NA
flights_clean <- flights %>%
filter(!( !is.na(dep_time) & is.na(arr_time) )) # Hapus baris error (jumlahnya 458)
# Tambahkan kolom 'cancelled': jika dep_time dan arr_time sama-sama NA → Cancelled
flights_clean <- flights_clean %>%
mutate(cancelled = ifelse(is.na(dep_time) & is.na(arr_time), "Cancelled", "Not Cancelled"))
# Cek hasil
head(flights_clean) # Lihat baris awal
## # A tibble: 6 × 20
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 517 515 2 830 819
## 2 2013 1 1 533 529 4 850 830
## 3 2013 1 1 542 540 2 923 850
## 4 2013 1 1 544 545 -1 1004 1022
## 5 2013 1 1 554 600 -6 812 837
## 6 2013 1 1 554 558 -4 740 728
## # ℹ 12 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## # hour <dbl>, minute <dbl>, time_hour <dttm>, cancelled <chr>
table(flights_clean$cancelled) # Cek jumlah Cancelled vs Not Cancelled
##
## Cancelled Not Cancelled
## 8255 328063
# Cek berapa banyak data yang dihapus
original_rows <- nrow(flights)
cleaned_rows <- nrow(flights_clean)
removed_rows <- original_rows - cleaned_rows
removed_rows # Harusnya 458
## [1] 458
removed_rows / original_rows * 100 # Persentase data error yang dihapus
## [1] 0.1359954
library(lubridate)
# Fungsi untuk mengonversi waktu numerik (misalnya 2400, 59, 432) menjadi format waktu
convert_to_time <- function(time) {
hour <- floor(time / 100) # Ambil jam (bagian puluhan)
minute <- time %% 100 # Ambil menit (bagian satuan)
make_datetime(2013, 1, 1, hour, minute) # Gunakan tanggal dummy karena kita hanya butuh waktu
}
flights_clean <- flights_clean %>%
mutate(
dep_time_new = convert_to_time(dep_time),
sched_dep_time_new = convert_to_time(sched_dep_time),
arr_time_new = convert_to_time(arr_time),
sched_arr_time_new = convert_to_time(sched_arr_time)
)
#hapus kolom numerik ver-nya
flights_clean <- flights_clean %>%
select(-dep_time, -sched_dep_time, -arr_time, -sched_arr_time) %>%
rename(
dep_time = dep_time_new,
sched_dep_time = sched_dep_time_new,
arr_time = arr_time_new,
sched_arr_time = sched_arr_time_new
)
##4 Cek duplikasi
# Mengecek jumlah duplikat
sum(duplicated(flights_clean)) # Menghitung jumlah baris yang duplikat
## [1] 0
# Menampilkan baris yang duplikat
flights_clean[duplicated(flights_clean), ]
## # A tibble: 0 × 20
## # ℹ 20 variables: year <int>, month <int>, day <int>, dep_delay <dbl>,
## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>, origin <chr>,
## # dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,
## # time_hour <dttm>, cancelled <chr>, dep_time <dttm>, sched_dep_time <dttm>,
## # arr_time <dttm>, sched_arr_time <dttm>
# Mengecek duplikat berdasarkan kolom tertentu (misalnya flight dan tailnum)
sum(duplicated(flights_clean[, c("flight", "tailnum")]))
## [1] 156604
#5 cek outliers #outlier bertujuan untuk mendeteksi nilai ekstrem lalu treatment yang bisa kita lakukan adalah mencari tau apakah nilai ekstrem tersebut karena error atau memang nilai valid. kalo berdasarkan filter bisa diketahui data ini valid tapi jika dilihat mengunakan boxplot terdapat banyak outliers
# Cek delay (Departure & Arrival)
summary(flights_clean$dep_delay)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## -43.00 -5.00 -2.00 12.58 11.00 1301.00 8255
summary(flights_clean$arr_delay)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## -86.000 -17.000 -5.000 6.895 14.000 1272.000 8972
# Visualisasi boxplot untuk delay
par(mfrow = c(1, 2))
boxplot(flights_clean$dep_delay, main = "Departure Delay", horizontal = TRUE)
boxplot(flights_clean$arr_delay, main = "Arrival Delay", horizontal = TRUE)
# Filter delay sangat besar atau sangat negatif
flights_clean %>% filter(dep_delay < -30 | dep_delay > 600)
## # A tibble: 42 × 20
## year month day dep_delay arr_delay carrier flight tailnum origin dest
## <int> <int> <int> <dbl> <dbl> <chr> <int> <chr> <chr> <chr>
## 1 2013 1 1 853 851 MQ 3944 N942MQ JFK BWI
## 2 2013 1 9 1301 1272 HA 51 N384HA JFK HNL
## 3 2013 1 10 1126 1109 MQ 3695 N517MQ EWR ORD
## 4 2013 10 14 702 688 DL 502 N943DL EWR ATL
## 5 2013 11 3 798 796 DL 2042 N990AT EWR ATL
## 6 2013 11 10 -32 -10 EV 5713 N825AS LGA IAD
## 7 2013 11 24 636 614 AA 1697 N634AA JFK MIA
## 8 2013 12 1 687 681 DL 1091 N342NW JFK SAT
## 9 2013 12 5 896 878 AA 172 N5DMAA EWR MIA
## 10 2013 12 7 -43 48 B6 97 N592JB JFK DEN
## # ℹ 32 more rows
## # ℹ 10 more variables: air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>, cancelled <chr>, dep_time <dttm>,
## # sched_dep_time <dttm>, arr_time <dttm>, sched_arr_time <dttm>
# Tandai outliers dengan kolom baru (treatment)
flights_clean <- flights_clean %>%
mutate(outlier_dep_delay = dep_delay < -30 | dep_delay > 600)
# Summary untuk penerbangan yang termasuk outliers dengan yang bukan outliers
summary(flights_clean$dep_delay[flights_clean$outlier_dep_delay == TRUE])
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## -43.0 689.2 797.5 760.3 871.8 1301.0 8255
summary(flights_clean$dep_delay[flights_clean$outlier_dep_delay == FALSE])
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## -30.00 -5.00 -2.00 12.48 11.00 599.00 8255
# Cek air_time dan distance yang ekstrem
summary(flights_clean$air_time)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 20.0 82.0 129.0 150.7 192.0 695.0 8972
summary(flights_clean$distance)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 17 502 872 1040 1389 4983
# Filter air_time dan distance yang tidak masuk akal
flights_clean %>%
filter((air_time <= 0 & distance > 0) | air_time > 1000)
## # A tibble: 0 × 21
## # ℹ 21 variables: year <int>, month <int>, day <int>, dep_delay <dbl>,
## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>, origin <chr>,
## # dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,
## # time_hour <dttm>, cancelled <chr>, dep_time <dttm>, sched_dep_time <dttm>,
## # arr_time <dttm>, sched_arr_time <dttm>, outlier_dep_delay <lgl>
# Visualisasi boxplot untuk air_time dan distance
boxplot(flights_clean$air_time, main = "Air Time", horizontal = TRUE, col = "lightblue")
boxplot(flights_clean$distance, main = "Distance", horizontal = TRUE, col = "lightgreen")
## 6. Cek Anomali Lainnya dan Data Tidak Masuk Akal
# Penerbangan yang tidak cancelled dan tidak error tapi air_time dan arr_delay tetap NA
sisa_na <- flights %>%
filter(!is.na(dep_time), !is.na(arr_time),
is.na(air_time), is.na(arr_delay))
nrow(sisa_na)
## [1] 717
head(sisa_na)
## # A tibble: 6 × 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 1525 1530 -5 1934 1805
## 2 2013 1 1 1528 1459 29 2002 1647
## 3 2013 1 1 1740 1745 -5 2158 2020
## 4 2013 1 1 1807 1738 29 2251 2103
## 5 2013 1 1 1939 1840 59 29 2151
## 6 2013 1 1 1952 1930 22 2358 2207
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## # hour <dbl>, minute <dbl>, time_hour <dttm>
# Arrival time < departure time (mungkin karena lewat tengah malam)
flights %>%
filter(!is.na(dep_time), !is.na(arr_time), arr_time < dep_time)
## # A tibble: 10,633 × 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 1929 1920 9 3 7
## 2 2013 1 1 1939 1840 59 29 2151
## 3 2013 1 1 2058 2100 -2 8 2359
## 4 2013 1 1 2102 2108 -6 146 158
## 5 2013 1 1 2108 2057 11 25 39
## 6 2013 1 1 2120 2130 -10 16 18
## 7 2013 1 1 2121 2040 41 6 2323
## 8 2013 1 1 2128 2135 -7 26 50
## 9 2013 1 1 2134 2045 49 20 2352
## 10 2013 1 1 2136 2145 -9 25 39
## # ℹ 10,623 more rows
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## # hour <dbl>, minute <dbl>, time_hour <dttm>
# Air time ≤ 0 (tidak mungkin)
flights %>%
filter(!is.na(air_time) & air_time <= 0)
## # A tibble: 0 × 19
## # ℹ 19 variables: year <int>, month <int>, day <int>, dep_time <int>,
## # sched_dep_time <int>, dep_delay <dbl>, arr_time <int>,
## # sched_arr_time <int>, arr_delay <dbl>, carrier <chr>, flight <int>,
## # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## # hour <dbl>, minute <dbl>, time_hour <dttm>
# Delay tak masuk akal: dep_time - sched_dep_time tidak sesuai dep_delay
flights %>%
mutate(calc_dep_delay = dep_time - sched_dep_time) %>%
filter(abs(calc_dep_delay - dep_delay) > 5) # Toleransi ±5 menit
## # A tibble: 99,777 × 20
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 554 600 -6 812 837
## 2 2013 1 1 555 600 -5 913 854
## 3 2013 1 1 557 600 -3 709 723
## 4 2013 1 1 557 600 -3 838 846
## 5 2013 1 1 558 600 -2 753 745
## 6 2013 1 1 558 600 -2 849 851
## 7 2013 1 1 558 600 -2 853 856
## 8 2013 1 1 558 600 -2 924 917
## 9 2013 1 1 558 600 -2 923 937
## 10 2013 1 1 559 600 -1 941 910
## # ℹ 99,767 more rows
## # ℹ 12 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## # hour <dbl>, minute <dbl>, time_hour <dttm>, calc_dep_delay <int>
# Delay ekstrim
flights %>% filter(dep_delay > 1000 | arr_delay > 1000)
## # A tibble: 5 × 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 9 641 900 1301 1242 1530
## 2 2013 1 10 1121 1635 1126 1239 1810
## 3 2013 6 15 1432 1935 1137 1607 2120
## 4 2013 7 22 845 1600 1005 1044 1815
## 5 2013 9 20 1139 1845 1014 1457 2210
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## # hour <dbl>, minute <dbl>, time_hour <dttm>
# Hitung kecepatan pesawat
flights %>%
filter(!is.na(air_time), !is.na(distance)) %>%
mutate(speed = distance / (air_time / 60)) %>%
arrange(desc(speed)) %>%
head()
## # A tibble: 6 × 20
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 5 25 1709 1700 9 1923 1937
## 2 2013 7 2 1558 1513 45 1745 1719
## 3 2013 5 13 2040 2025 15 2225 2226
## 4 2013 3 23 1914 1910 4 2045 2043
## 5 2013 1 12 1559 1600 -1 1849 1917
## 6 2013 11 17 650 655 -5 1059 1150
## # ℹ 12 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## # hour <dbl>, minute <dbl>, time_hour <dttm>, speed <dbl>
original_rows <- nrow(flights)
cleaned_rows <- nrow(flights_clean)
removed_rows <- original_rows - cleaned_rows
removed_rows
## [1] 458
removed_rows / original_rows * 100 # Persentase
## [1] 0.1359954