Zoom Breakout Room No. 1: Bassa Bhaskara Desno Gabrihi (5052241030), Ghisele Valerin Sharent Milano (5052241005), Jason Alexander Widodo (5052241001), Gilbert Emanuel Sambira (5052241018)

Dataset flights

1. Laporan Singkat

Masalah Data yang Ditemukan

Beberapa masalah umum yang ditemukan dalam dataset flights:

  • Missing values pada kolom dep_time, arr_time, air_time, tailnum
  • Nilai ekstrem pada arr_delay dan dep_delay, misalnya lebih dari 1000 menit
  • Nilai tidak logis, seperti air_time = 0 namun memiliki distance yang jauh

Cara Memperbaiki Masalah

# Build the cleaned dataset in a single filter() call. A row is kept only
# when every condition below is TRUE (filter() drops rows where a condition
# evaluates to FALSE or NA):
#   1. no NA in the key columns dep_time, arr_time and air_time
#   2. no extreme delay (arrival and departure delay at most 1000 minutes)
#   3. a positive air_time
clean_flights <- flights %>%
  filter(
    !is.na(dep_time), !is.na(arr_time), !is.na(air_time),
    arr_delay <= 1000, dep_delay <= 1000,
    air_time > 0
  )

2. Ringkasan Dataset yang Sudah Dibersihkan

Jumlah Observasi

# Report the number of rows in the raw flights data (before cleaning).
paste("Jumlah data sebelum dibersihkan:", nrow(flights))
## [1] "Jumlah data sebelum dibersihkan: 336776"
# Report the number of rows remaining in clean_flights (after cleaning).
paste("Jumlah data setelah dibersihkan:", nrow(clean_flights))
## [1] "Jumlah data setelah dibersihkan: 327341"

Statistik Umum

# Print per-column summary statistics for the cleaned dataset.
summary(clean_flights)
##       year          month             day           dep_time    sched_dep_time
##  Min.   :2013   Min.   : 1.000   Min.   : 1.00   Min.   :   1   Min.   : 500  
##  1st Qu.:2013   1st Qu.: 4.000   1st Qu.: 8.00   1st Qu.: 907   1st Qu.: 905  
##  Median :2013   Median : 7.000   Median :16.00   Median :1400   Median :1355  
##  Mean   :2013   Mean   : 6.565   Mean   :15.74   Mean   :1349   Mean   :1340  
##  3rd Qu.:2013   3rd Qu.:10.000   3rd Qu.:23.00   3rd Qu.:1744   3rd Qu.:1729  
##  Max.   :2013   Max.   :12.000   Max.   :31.00   Max.   :2400   Max.   :2359  
##    dep_delay         arr_time    sched_arr_time   arr_delay      
##  Min.   :-43.00   Min.   :   1   Min.   :   1   Min.   :-86.000  
##  1st Qu.: -5.00   1st Qu.:1104   1st Qu.:1122   1st Qu.:-17.000  
##  Median : -2.00   Median :1535   Median :1554   Median : -5.000  
##  Mean   : 12.54   Mean   :1502   Mean   :1533   Mean   :  6.879  
##  3rd Qu.: 11.00   3rd Qu.:1940   3rd Qu.:1944   3rd Qu.: 14.000  
##  Max.   :960.00   Max.   :2400   Max.   :2359   Max.   :931.000  
##    carrier              flight       tailnum             origin         
##  Length:327341      Min.   :   1   Length:327341      Length:327341     
##  Class :character   1st Qu.: 544   Class :character   Class :character  
##  Mode  :character   Median :1467   Mode  :character   Mode  :character  
##                     Mean   :1943                                        
##                     3rd Qu.:3412                                        
##                     Max.   :8500                                        
##      dest              air_time        distance         hour      
##  Length:327341      Min.   : 20.0   Min.   :  80   Min.   : 5.00  
##  Class :character   1st Qu.: 82.0   1st Qu.: 509   1st Qu.: 9.00  
##  Mode  :character   Median :129.0   Median : 888   Median :13.00  
##                     Mean   :150.7   Mean   :1048   Mean   :13.14  
##                     3rd Qu.:192.0   3rd Qu.:1389   3rd Qu.:17.00  
##                     Max.   :695.0   Max.   :4983   Max.   :23.00  
##      minute        time_hour                     
##  Min.   : 0.00   Min.   :2013-01-01 05:00:00.00  
##  1st Qu.: 8.00   1st Qu.:2013-04-05 06:00:00.00  
##  Median :29.00   Median :2013-07-04 09:00:00.00  
##  Mean   :26.23   Mean   :2013-07-03 17:57:56.54  
##  3rd Qu.:44.00   3rd Qu.:2013-10-01 18:00:00.00  
##  Max.   :59.00   Max.   :2013-12-31 23:00:00.00

5 Maskapai dengan Penerbangan Terbanyak

# Tally flights per carrier, order the tally from largest to smallest count,
# and keep the first five rows (the five carriers with the most flights).
clean_flights %>%
  count(carrier) %>%
  arrange(desc(n)) %>%
  head(5)
## # A tibble: 5 × 2
##   carrier     n
##   <chr>   <int>
## 1 UA      57782
## 2 B6      54049
## 3 EV      51108
## 4 DL      47658
## 5 AA      31946

Visualisasi

Distribusi Delay Keberangkatan

# Histogram of departure delays in 10-minute bins, viewed over -50..300 min.
# The view is restricted with coord_cartesian() instead of xlim(): xlim()
# sets scale limits, so observations outside the range are discarded before
# binning and bars that extend past the limits are dropped (with "Removed
# rows" warnings). coord_cartesian() only zooms the plot and keeps all data.
ggplot(clean_flights, aes(x = dep_delay)) +
  geom_histogram(binwidth = 10, fill = "steelblue") +
  coord_cartesian(xlim = c(-50, 300)) +
  labs(
    title = "Distribusi Delay Keberangkatan",
    x = "Delay (menit)",
    y = "Jumlah Penerbangan"
  )

Rata-rata Delay Kedatangan per Maskapai

# Mean arrival delay per carrier, drawn as a bar chart. The carrier column is
# turned into a factor ordered by its mean delay so the bars appear sorted
# from the lowest to the highest average.
clean_flights %>%
  group_by(carrier) %>%
  summarise(avg_arr_delay = mean(arr_delay, na.rm = TRUE)) %>%
  mutate(carrier = reorder(carrier, avg_arr_delay)) %>%
  ggplot(aes(x = carrier, y = avg_arr_delay)) +
  geom_col(fill = "tomato") +
  labs(
    title = "Rata-rata Delay Kedatangan per Maskapai",
    x = "Maskapai",
    y = "Rata-rata Delay (menit)"
  )