Zoom Breakout Room No. 1: Bassa Bhaskara Desno Gabrihi (5052241030), Ghisele Valerin Sharent Milano (5052241005), Jason Alexander Widodo (5052241001), Gilbert Emanuel Sambira (5052241018)
Beberapa masalah umum yang ditemukan dalam dataset flights:
- Nilai hilang (NA) pada kolom seperti dep_time, arr_time, air_time, dan tailnum.
- Nilai ekstrem pada arr_delay dan dep_delay, misalnya lebih dari 1000 menit.
- air_time = 0 namun memiliki distance yang jauh.

# 1. Buang baris dengan NA pada kolom penting
# Build the cleaned table in a single pipeline; the three filters run in the
# same order as the numbered cleaning steps.
clean_flights <- flights %>%
  # Keep only rows where departure time, arrival time and air time are present.
  filter(!is.na(dep_time), !is.na(arr_time), !is.na(air_time)) %>%
  # 2. Drop extreme delays (more than 1000 minutes). filter() also drops rows
  #    where either delay is NA, because the comparison evaluates to NA.
  filter(arr_delay <= 1000, dep_delay <= 1000) %>%
  # 3. Drop rows whose air_time is 0 (only strictly positive values are kept).
  filter(air_time > 0)
# Report the number of rows in the raw `flights` table (before cleaning).
paste("Jumlah data sebelum dibersihkan:", nrow(flights))
## [1] "Jumlah data sebelum dibersihkan: 336776"
# Report the number of rows left in `clean_flights` (after cleaning).
paste("Jumlah data setelah dibersihkan:", nrow(clean_flights))
## [1] "Jumlah data setelah dibersihkan: 327341"
# Per-column summary of the cleaned data, used to check the filters above.
summary(clean_flights)
## year month day dep_time sched_dep_time
## Min. :2013 Min. : 1.000 Min. : 1.00 Min. : 1 Min. : 500
## 1st Qu.:2013 1st Qu.: 4.000 1st Qu.: 8.00 1st Qu.: 907 1st Qu.: 905
## Median :2013 Median : 7.000 Median :16.00 Median :1400 Median :1355
## Mean :2013 Mean : 6.565 Mean :15.74 Mean :1349 Mean :1340
## 3rd Qu.:2013 3rd Qu.:10.000 3rd Qu.:23.00 3rd Qu.:1744 3rd Qu.:1729
## Max. :2013 Max. :12.000 Max. :31.00 Max. :2400 Max. :2359
## dep_delay arr_time sched_arr_time arr_delay
## Min. :-43.00 Min. : 1 Min. : 1 Min. :-86.000
## 1st Qu.: -5.00 1st Qu.:1104 1st Qu.:1122 1st Qu.:-17.000
## Median : -2.00 Median :1535 Median :1554 Median : -5.000
## Mean : 12.54 Mean :1502 Mean :1533 Mean : 6.879
## 3rd Qu.: 11.00 3rd Qu.:1940 3rd Qu.:1944 3rd Qu.: 14.000
## Max. :960.00 Max. :2400 Max. :2359 Max. :931.000
## carrier flight tailnum origin
## Length:327341 Min. : 1 Length:327341 Length:327341
## Class :character 1st Qu.: 544 Class :character Class :character
## Mode :character Median :1467 Mode :character Mode :character
## Mean :1943
## 3rd Qu.:3412
## Max. :8500
## dest air_time distance hour
## Length:327341 Min. : 20.0 Min. : 80 Min. : 5.00
## Class :character 1st Qu.: 82.0 1st Qu.: 509 1st Qu.: 9.00
## Mode :character Median :129.0 Median : 888 Median :13.00
## Mean :150.7 Mean :1048 Mean :13.14
## 3rd Qu.:192.0 3rd Qu.:1389 3rd Qu.:17.00
## Max. :695.0 Max. :4983 Max. :23.00
## minute time_hour
## Min. : 0.00 Min. :2013-01-01 05:00:00.00
## 1st Qu.: 8.00 1st Qu.:2013-04-05 06:00:00.00
## Median :29.00 Median :2013-07-04 09:00:00.00
## Mean :26.23 Mean :2013-07-03 17:57:56.54
## 3rd Qu.:44.00 3rd Qu.:2013-10-01 18:00:00.00
## Max. :59.00 Max. :2013-12-31 23:00:00.00
# Five carriers with the most flights in the cleaned data:
# count() tallies rows per carrier and sorts them in descending order,
# slice_head() then keeps the first five rows.
clean_flights %>%
  count(carrier, sort = TRUE) %>%
  slice_head(n = 5)
## # A tibble: 5 × 2
## carrier n
## <chr> <int>
## 1 UA 57782
## 2 B6 54049
## 3 EV 51108
## 4 DL 47658
## 5 AA 31946
# Histogram of departure delays in 10-minute bins.
# The view is zoomed to [-50, 300] minutes with coord_cartesian() instead of
# xlim(): xlim() sets scale limits, which turns delays outside the range into
# NA before binning and drops bars whose edges cross the limits (with
# warnings). coord_cartesian() only changes the visible window, so every bar
# inside it is drawn from the full data.
ggplot(clean_flights, aes(x = dep_delay)) +
  geom_histogram(binwidth = 10, fill = "steelblue") +
  coord_cartesian(xlim = c(-50, 300)) +
  labs(
    title = "Distribusi Delay Keberangkatan",
    x = "Delay (menit)",
    y = "Jumlah Penerbangan"
  )
# Bar chart of the mean arrival delay per carrier.
# The data is summarised to one row per carrier, then the bars are ordered by
# that mean (ascending) via reorder().
clean_flights %>%
  group_by(carrier) %>%
  summarise(mean_delay = mean(arr_delay, na.rm = TRUE)) %>%
  ggplot(aes(x = reorder(carrier, mean_delay), y = mean_delay)) +
  geom_col(fill = "tomato") +
  labs(
    title = "Rata-rata Delay Kedatangan per Maskapai",
    x = "Maskapai",
    y = "Rata-rata Delay (menit)"
  )