1. Memuat Paket dan membaca data

library(nycflights13)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ readr     2.1.5
## ✔ ggplot2   3.5.2     ✔ stringr   1.5.1
## ✔ lubridate 1.9.4     ✔ tibble    3.2.1
## ✔ purrr     1.0.4     ✔ tidyr     1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Load data
data(flights)

#menampilkan 6 baris pertama
head(flights)
## # A tibble: 6 × 19
##    year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##   <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
## 1  2013     1     1      517            515         2      830            819
## 2  2013     1     1      533            529         4      850            830
## 3  2013     1     1      542            540         2      923            850
## 4  2013     1     1      544            545        -1     1004           1022
## 5  2013     1     1      554            600        -6      812            837
## 6  2013     1     1      554            558        -4      740            728
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## #   hour <dbl>, minute <dbl>, time_hour <dttm>
#cek struktur
str(flights)
## tibble [336,776 × 19] (S3: tbl_df/tbl/data.frame)
##  $ year          : int [1:336776] 2013 2013 2013 2013 2013 2013 2013 2013 2013 2013 ...
##  $ month         : int [1:336776] 1 1 1 1 1 1 1 1 1 1 ...
##  $ day           : int [1:336776] 1 1 1 1 1 1 1 1 1 1 ...
##  $ dep_time      : int [1:336776] 517 533 542 544 554 554 555 557 557 558 ...
##  $ sched_dep_time: int [1:336776] 515 529 540 545 600 558 600 600 600 600 ...
##  $ dep_delay     : num [1:336776] 2 4 2 -1 -6 -4 -5 -3 -3 -2 ...
##  $ arr_time      : int [1:336776] 830 850 923 1004 812 740 913 709 838 753 ...
##  $ sched_arr_time: int [1:336776] 819 830 850 1022 837 728 854 723 846 745 ...
##  $ arr_delay     : num [1:336776] 11 20 33 -18 -25 12 19 -14 -8 8 ...
##  $ carrier       : chr [1:336776] "UA" "UA" "AA" "B6" ...
##  $ flight        : int [1:336776] 1545 1714 1141 725 461 1696 507 5708 79 301 ...
##  $ tailnum       : chr [1:336776] "N14228" "N24211" "N619AA" "N804JB" ...
##  $ origin        : chr [1:336776] "EWR" "LGA" "JFK" "JFK" ...
##  $ dest          : chr [1:336776] "IAH" "IAH" "MIA" "BQN" ...
##  $ air_time      : num [1:336776] 227 227 160 183 116 150 158 53 140 138 ...
##  $ distance      : num [1:336776] 1400 1416 1089 1576 762 ...
##  $ hour          : num [1:336776] 5 5 5 5 6 5 6 6 6 6 ...
##  $ minute        : num [1:336776] 15 29 40 45 0 58 0 0 0 0 ...
##  $ time_hour     : POSIXct[1:336776], format: "2013-01-01 05:00:00" "2013-01-01 05:00:00" ...
#cek ringkasan statistik
summary(flights)
##       year          month             day           dep_time    sched_dep_time
##  Min.   :2013   Min.   : 1.000   Min.   : 1.00   Min.   :   1   Min.   : 106  
##  1st Qu.:2013   1st Qu.: 4.000   1st Qu.: 8.00   1st Qu.: 907   1st Qu.: 906  
##  Median :2013   Median : 7.000   Median :16.00   Median :1401   Median :1359  
##  Mean   :2013   Mean   : 6.549   Mean   :15.71   Mean   :1349   Mean   :1344  
##  3rd Qu.:2013   3rd Qu.:10.000   3rd Qu.:23.00   3rd Qu.:1744   3rd Qu.:1729  
##  Max.   :2013   Max.   :12.000   Max.   :31.00   Max.   :2400   Max.   :2359  
##                                                  NA's   :8255                 
##    dep_delay          arr_time    sched_arr_time   arr_delay       
##  Min.   : -43.00   Min.   :   1   Min.   :   1   Min.   : -86.000  
##  1st Qu.:  -5.00   1st Qu.:1104   1st Qu.:1124   1st Qu.: -17.000  
##  Median :  -2.00   Median :1535   Median :1556   Median :  -5.000  
##  Mean   :  12.64   Mean   :1502   Mean   :1536   Mean   :   6.895  
##  3rd Qu.:  11.00   3rd Qu.:1940   3rd Qu.:1945   3rd Qu.:  14.000  
##  Max.   :1301.00   Max.   :2400   Max.   :2359   Max.   :1272.000  
##  NA's   :8255      NA's   :8713                  NA's   :9430      
##    carrier              flight       tailnum             origin         
##  Length:336776      Min.   :   1   Length:336776      Length:336776     
##  Class :character   1st Qu.: 553   Class :character   Class :character  
##  Mode  :character   Median :1496   Mode  :character   Mode  :character  
##                     Mean   :1972                                        
##                     3rd Qu.:3465                                        
##                     Max.   :8500                                        
##                                                                         
##      dest              air_time        distance         hour      
##  Length:336776      Min.   : 20.0   Min.   :  17   Min.   : 1.00  
##  Class :character   1st Qu.: 82.0   1st Qu.: 502   1st Qu.: 9.00  
##  Mode  :character   Median :129.0   Median : 872   Median :13.00  
##                     Mean   :150.7   Mean   :1040   Mean   :13.18  
##                     3rd Qu.:192.0   3rd Qu.:1389   3rd Qu.:17.00  
##                     Max.   :695.0   Max.   :4983   Max.   :23.00  
##                     NA's   :9430                                  
##      minute        time_hour                  
##  Min.   : 0.00   Min.   :2013-01-01 05:00:00  
##  1st Qu.: 8.00   1st Qu.:2013-04-04 13:00:00  
##  Median :29.00   Median :2013-07-03 10:00:00  
##  Mean   :26.23   Mean   :2013-07-03 05:22:54  
##  3rd Qu.:44.00   3rd Qu.:2013-10-01 07:00:00  
##  Max.   :59.00   Max.   :2013-12-31 23:00:00  
## 
#cek dimensi
dim(flights)
## [1] 336776     19

2. cek missing data

# Hapus baris error: dep_time TIDAK NA tapi arr_time NA
flights_clean <- flights %>%
  filter(!( !is.na(dep_time) & is.na(arr_time) ))  # Hapus baris error (jumlahnya 458)

# Tambahkan kolom 'cancelled': jika dep_time dan arr_time sama-sama NA → Cancelled
flights_clean <- flights_clean %>%
  mutate(cancelled = ifelse(is.na(dep_time) & is.na(arr_time), "Cancelled", "Not Cancelled"))

# Cek hasil
head(flights_clean)  # Lihat baris awal
## # A tibble: 6 × 20
##    year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##   <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
## 1  2013     1     1      517            515         2      830            819
## 2  2013     1     1      533            529         4      850            830
## 3  2013     1     1      542            540         2      923            850
## 4  2013     1     1      544            545        -1     1004           1022
## 5  2013     1     1      554            600        -6      812            837
## 6  2013     1     1      554            558        -4      740            728
## # ℹ 12 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## #   hour <dbl>, minute <dbl>, time_hour <dttm>, cancelled <chr>
table(flights_clean$cancelled)  # Cek jumlah Cancelled vs Not Cancelled
## 
##     Cancelled Not Cancelled 
##          8255        328063
# Cek berapa banyak data yang dihapus
original_rows <- nrow(flights)
cleaned_rows <- nrow(flights_clean)
removed_rows <- original_rows - cleaned_rows
removed_rows  # Harusnya 458
## [1] 458
removed_rows / original_rows * 100  # Persentase data error yang dihapus
## [1] 0.1359954

3. Ubah format time

library(lubridate)

# Fungsi untuk mengonversi waktu numerik (misalnya 2400, 59, 432) menjadi format waktu
convert_to_time <- function(time) {
  hour <- floor(time / 100)  # Ambil jam (bagian puluhan)
  minute <- time %% 100      # Ambil menit (bagian satuan)
  make_datetime(2013, 1, 1, hour, minute)  # Gunakan tanggal dummy karena kita hanya butuh waktu
}

flights_clean <- flights_clean %>%
  mutate(
    dep_time_new = convert_to_time(dep_time),
    sched_dep_time_new = convert_to_time(sched_dep_time),
    arr_time_new = convert_to_time(arr_time),
    sched_arr_time_new = convert_to_time(sched_arr_time)
  )
#hapus kolom numerik ver-nya
flights_clean <- flights_clean %>%
  select(-dep_time, -sched_dep_time, -arr_time, -sched_arr_time) %>%
  rename(
    dep_time = dep_time_new,
    sched_dep_time = sched_dep_time_new,
    arr_time = arr_time_new,
    sched_arr_time = sched_arr_time_new
  )

##4 Cek duplikasi

# Mengecek jumlah duplikat
sum(duplicated(flights_clean))  # Menghitung jumlah baris yang duplikat
## [1] 0
# Menampilkan baris yang duplikat
flights_clean[duplicated(flights_clean), ]
## # A tibble: 0 × 20
## # ℹ 20 variables: year <int>, month <int>, day <int>, dep_delay <dbl>,
## #   arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>, origin <chr>,
## #   dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,
## #   time_hour <dttm>, cancelled <chr>, dep_time <dttm>, sched_dep_time <dttm>,
## #   arr_time <dttm>, sched_arr_time <dttm>
# Mengecek duplikat berdasarkan kolom tertentu (misalnya flight dan tailnum)
sum(duplicated(flights_clean[, c("flight", "tailnum")]))
## [1] 156604

#5 cek outliers #outlier bertujuan untuk mendeteksi nilai ekstrem lalu treatment yang bisa kita lakukan adalah mencari tau apakah nilai ekstrem tersebut karena error atau memang nilai valid. kalo berdasarkan filter bisa diketahui data ini valid tapi jika dilihat mengunakan boxplot terdapat banyak outliers

# Cek delay (Departure & Arrival)
summary(flights_clean$dep_delay)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##  -43.00   -5.00   -2.00   12.58   11.00 1301.00    8255
summary(flights_clean$arr_delay)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max.     NA's 
##  -86.000  -17.000   -5.000    6.895   14.000 1272.000     8972
# Visualisasi boxplot untuk delay
par(mfrow = c(1, 2))
boxplot(flights_clean$dep_delay, main = "Departure Delay", horizontal = TRUE)
boxplot(flights_clean$arr_delay, main = "Arrival Delay", horizontal = TRUE)

# Filter delay sangat besar atau sangat negatif
flights_clean %>% filter(dep_delay < -30 | dep_delay > 600)
## # A tibble: 42 × 20
##     year month   day dep_delay arr_delay carrier flight tailnum origin dest 
##    <int> <int> <int>     <dbl>     <dbl> <chr>    <int> <chr>   <chr>  <chr>
##  1  2013     1     1       853       851 MQ        3944 N942MQ  JFK    BWI  
##  2  2013     1     9      1301      1272 HA          51 N384HA  JFK    HNL  
##  3  2013     1    10      1126      1109 MQ        3695 N517MQ  EWR    ORD  
##  4  2013    10    14       702       688 DL         502 N943DL  EWR    ATL  
##  5  2013    11     3       798       796 DL        2042 N990AT  EWR    ATL  
##  6  2013    11    10       -32       -10 EV        5713 N825AS  LGA    IAD  
##  7  2013    11    24       636       614 AA        1697 N634AA  JFK    MIA  
##  8  2013    12     1       687       681 DL        1091 N342NW  JFK    SAT  
##  9  2013    12     5       896       878 AA         172 N5DMAA  EWR    MIA  
## 10  2013    12     7       -43        48 B6          97 N592JB  JFK    DEN  
## # ℹ 32 more rows
## # ℹ 10 more variables: air_time <dbl>, distance <dbl>, hour <dbl>,
## #   minute <dbl>, time_hour <dttm>, cancelled <chr>, dep_time <dttm>,
## #   sched_dep_time <dttm>, arr_time <dttm>, sched_arr_time <dttm>
# Tandai outliers dengan kolom baru (treatment)
flights_clean <- flights_clean %>%
  mutate(outlier_dep_delay = dep_delay < -30 | dep_delay > 600)

# Summary untuk penerbangan yang termasuk outliers dengan yang bukan outliers
summary(flights_clean$dep_delay[flights_clean$outlier_dep_delay == TRUE])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   -43.0   689.2   797.5   760.3   871.8  1301.0    8255
summary(flights_clean$dep_delay[flights_clean$outlier_dep_delay == FALSE])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##  -30.00   -5.00   -2.00   12.48   11.00  599.00    8255
# Cek air_time dan distance yang ekstrem
summary(flights_clean$air_time)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    20.0    82.0   129.0   150.7   192.0   695.0    8972
summary(flights_clean$distance)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##      17     502     872    1040    1389    4983
# Filter air_time dan distance yang tidak masuk akal
flights_clean %>%
  filter((air_time <= 0 & distance > 0) | air_time > 1000)
## # A tibble: 0 × 21
## # ℹ 21 variables: year <int>, month <int>, day <int>, dep_delay <dbl>,
## #   arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>, origin <chr>,
## #   dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,
## #   time_hour <dttm>, cancelled <chr>, dep_time <dttm>, sched_dep_time <dttm>,
## #   arr_time <dttm>, sched_arr_time <dttm>, outlier_dep_delay <lgl>
# Visualisasi boxplot untuk air_time dan distance
boxplot(flights_clean$air_time, main = "Air Time", horizontal = TRUE, col = "lightblue")
boxplot(flights_clean$distance, main = "Distance", horizontal = TRUE, col = "lightgreen")

## 6. Cek Anomali Lainnya dan Data Tidak Masuk Akal

# Penerbangan yang tidak cancelled dan tidak error tapi air_time dan arr_delay tetap NA
sisa_na <- flights %>%
  filter(!is.na(dep_time), !is.na(arr_time),
         is.na(air_time), is.na(arr_delay))

nrow(sisa_na)
## [1] 717
head(sisa_na)
## # A tibble: 6 × 19
##    year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##   <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
## 1  2013     1     1     1525           1530        -5     1934           1805
## 2  2013     1     1     1528           1459        29     2002           1647
## 3  2013     1     1     1740           1745        -5     2158           2020
## 4  2013     1     1     1807           1738        29     2251           2103
## 5  2013     1     1     1939           1840        59       29           2151
## 6  2013     1     1     1952           1930        22     2358           2207
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## #   hour <dbl>, minute <dbl>, time_hour <dttm>
# Arrival time < departure time (mungkin karena lewat tengah malam)
flights %>%
  filter(!is.na(dep_time), !is.na(arr_time), arr_time < dep_time)
## # A tibble: 10,633 × 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1     1929           1920         9        3              7
##  2  2013     1     1     1939           1840        59       29           2151
##  3  2013     1     1     2058           2100        -2        8           2359
##  4  2013     1     1     2102           2108        -6      146            158
##  5  2013     1     1     2108           2057        11       25             39
##  6  2013     1     1     2120           2130       -10       16             18
##  7  2013     1     1     2121           2040        41        6           2323
##  8  2013     1     1     2128           2135        -7       26             50
##  9  2013     1     1     2134           2045        49       20           2352
## 10  2013     1     1     2136           2145        -9       25             39
## # ℹ 10,623 more rows
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## #   hour <dbl>, minute <dbl>, time_hour <dttm>
# Air time ≤ 0 (tidak mungkin)
flights %>%
  filter(!is.na(air_time) & air_time <= 0)
## # A tibble: 0 × 19
## # ℹ 19 variables: year <int>, month <int>, day <int>, dep_time <int>,
## #   sched_dep_time <int>, dep_delay <dbl>, arr_time <int>,
## #   sched_arr_time <int>, arr_delay <dbl>, carrier <chr>, flight <int>,
## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## #   hour <dbl>, minute <dbl>, time_hour <dttm>
# Delay tak masuk akal: dep_time - sched_dep_time tidak sesuai dep_delay
flights %>%
  mutate(calc_dep_delay = dep_time - sched_dep_time) %>%
  filter(abs(calc_dep_delay - dep_delay) > 5)  # Toleransi ±5 menit
## # A tibble: 99,777 × 20
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1      554            600        -6      812            837
##  2  2013     1     1      555            600        -5      913            854
##  3  2013     1     1      557            600        -3      709            723
##  4  2013     1     1      557            600        -3      838            846
##  5  2013     1     1      558            600        -2      753            745
##  6  2013     1     1      558            600        -2      849            851
##  7  2013     1     1      558            600        -2      853            856
##  8  2013     1     1      558            600        -2      924            917
##  9  2013     1     1      558            600        -2      923            937
## 10  2013     1     1      559            600        -1      941            910
## # ℹ 99,767 more rows
## # ℹ 12 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## #   hour <dbl>, minute <dbl>, time_hour <dttm>, calc_dep_delay <int>
# Delay ekstrim
flights %>% filter(dep_delay > 1000 | arr_delay > 1000)
## # A tibble: 5 × 19
##    year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##   <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
## 1  2013     1     9      641            900      1301     1242           1530
## 2  2013     1    10     1121           1635      1126     1239           1810
## 3  2013     6    15     1432           1935      1137     1607           2120
## 4  2013     7    22      845           1600      1005     1044           1815
## 5  2013     9    20     1139           1845      1014     1457           2210
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## #   hour <dbl>, minute <dbl>, time_hour <dttm>

7. Cek Kecepatan Ekstrem

# Hitung kecepatan pesawat
flights %>%
  filter(!is.na(air_time), !is.na(distance)) %>%
  mutate(speed = distance / (air_time / 60)) %>%
  arrange(desc(speed)) %>%
  head()
## # A tibble: 6 × 20
##    year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##   <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
## 1  2013     5    25     1709           1700         9     1923           1937
## 2  2013     7     2     1558           1513        45     1745           1719
## 3  2013     5    13     2040           2025        15     2225           2226
## 4  2013     3    23     1914           1910         4     2045           2043
## 5  2013     1    12     1559           1600        -1     1849           1917
## 6  2013    11    17      650            655        -5     1059           1150
## # ℹ 12 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## #   hour <dbl>, minute <dbl>, time_hour <dttm>, speed <dbl>

8. Hitung Jumlah dan Persentase Data yang Dihapus

original_rows <- nrow(flights)
cleaned_rows <- nrow(flights_clean)
removed_rows <- original_rows - cleaned_rows
removed_rows
## [1] 458
removed_rows / original_rows * 100  # Persentase
## [1] 0.1359954