-Gilbert Sambira (5052241018)
-Jason Alexander (5052241001)
-Ghisele Valerin (5052241005)
library(nycflights13)
## Warning: package 'nycflights13' was built under R version 4.4.3
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.4.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.4.3
Pertama, kita mulai dengan melihat data awal yang akan dibersihkan.
# Load the `flights` dataset into the workspace and preview its columns
# and column types.
data(list = "flights")
flights %>% glimpse()
## Rows: 336,776
## Columns: 19
## $ year <int> 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2…
## $ month <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ day <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ dep_time <int> 517, 533, 542, 544, 554, 554, 555, 557, 557, 558, 558, …
## $ sched_dep_time <int> 515, 529, 540, 545, 600, 558, 600, 600, 600, 600, 600, …
## $ dep_delay <dbl> 2, 4, 2, -1, -6, -4, -5, -3, -3, -2, -2, -2, -2, -2, -1…
## $ arr_time <int> 830, 850, 923, 1004, 812, 740, 913, 709, 838, 753, 849,…
## $ sched_arr_time <int> 819, 830, 850, 1022, 837, 728, 854, 723, 846, 745, 851,…
## $ arr_delay <dbl> 11, 20, 33, -18, -25, 12, 19, -14, -8, 8, -2, -3, 7, -1…
## $ carrier <chr> "UA", "UA", "AA", "B6", "DL", "UA", "B6", "EV", "B6", "…
## $ flight <int> 1545, 1714, 1141, 725, 461, 1696, 507, 5708, 79, 301, 4…
## $ tailnum <chr> "N14228", "N24211", "N619AA", "N804JB", "N668DN", "N394…
## $ origin <chr> "EWR", "LGA", "JFK", "JFK", "LGA", "EWR", "EWR", "LGA",…
## $ dest <chr> "IAH", "IAH", "MIA", "BQN", "ATL", "ORD", "FLL", "IAD",…
## $ air_time <dbl> 227, 227, 160, 183, 116, 150, 158, 53, 140, 138, 149, 1…
## $ distance <dbl> 1400, 1416, 1089, 1576, 762, 719, 1065, 229, 944, 733, …
## $ hour <dbl> 5, 5, 5, 5, 6, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 6, 6, 6…
## $ minute <dbl> 15, 29, 40, 45, 0, 58, 0, 0, 0, 0, 0, 0, 0, 0, 0, 59, 0…
## $ time_hour <dttm> 2013-01-01 05:00:00, 2013-01-01 05:00:00, 2013-01-01 0…
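Setelah itu, kita mengecek jumlah nilai NA pada tiap kolom dengan colSums(is.na(flights)). Hasilnya, kolom dep_time, dep_delay, arr_time, arr_delay, tailnum, dan air_time memiliki nilai NA, sedangkan distance tidak memiliki NA. Kita gunakan filter(!is.na(...)) pada dep_time, arr_time, air_time, dan distance untuk menghapus baris yang mengandung NA; baris dengan NA pada dep_delay, arr_delay, dan tailnum ikut terhapus oleh filter ini.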
# Count missing values per column in the raw data.
flights %>%
  is.na() %>%
  colSums()
## year month day dep_time sched_dep_time
## 0 0 0 8255 0
## dep_delay arr_time sched_arr_time arr_delay carrier
## 8255 8713 0 9430 0
## flight tailnum origin dest air_time
## 0 2512 0 0 9430
## distance hour minute time_hour
## 0 0 0 0
# Keep only the rows where departure time, arrival time, air time and distance
# are all present, then preview the result.
flights_clean <- flights %>%
  filter(if_all(c(dep_time, arr_time, air_time, distance), ~ !is.na(.x)))
flights_clean %>% glimpse()
## Rows: 327,346
## Columns: 19
## $ year <int> 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2…
## $ month <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ day <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ dep_time <int> 517, 533, 542, 544, 554, 554, 555, 557, 557, 558, 558, …
## $ sched_dep_time <int> 515, 529, 540, 545, 600, 558, 600, 600, 600, 600, 600, …
## $ dep_delay <dbl> 2, 4, 2, -1, -6, -4, -5, -3, -3, -2, -2, -2, -2, -2, -1…
## $ arr_time <int> 830, 850, 923, 1004, 812, 740, 913, 709, 838, 753, 849,…
## $ sched_arr_time <int> 819, 830, 850, 1022, 837, 728, 854, 723, 846, 745, 851,…
## $ arr_delay <dbl> 11, 20, 33, -18, -25, 12, 19, -14, -8, 8, -2, -3, 7, -1…
## $ carrier <chr> "UA", "UA", "AA", "B6", "DL", "UA", "B6", "EV", "B6", "…
## $ flight <int> 1545, 1714, 1141, 725, 461, 1696, 507, 5708, 79, 301, 4…
## $ tailnum <chr> "N14228", "N24211", "N619AA", "N804JB", "N668DN", "N394…
## $ origin <chr> "EWR", "LGA", "JFK", "JFK", "LGA", "EWR", "EWR", "LGA",…
## $ dest <chr> "IAH", "IAH", "MIA", "BQN", "ATL", "ORD", "FLL", "IAD",…
## $ air_time <dbl> 227, 227, 160, 183, 116, 150, 158, 53, 140, 138, 149, 1…
## $ distance <dbl> 1400, 1416, 1089, 1576, 762, 719, 1065, 229, 944, 733, …
## $ hour <dbl> 5, 5, 5, 5, 6, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 6, 6, 6…
## $ minute <dbl> 15, 29, 40, 45, 0, 58, 0, 0, 0, 0, 0, 0, 0, 0, 0, 59, 0…
## $ time_hour <dttm> 2013-01-01 05:00:00, 2013-01-01 05:00:00, 2013-01-01 0…
# Confirm that no missing values remain in any column after filtering.
flights_clean %>%
  is.na() %>%
  colSums()
## year month day dep_time sched_dep_time
## 0 0 0 0 0
## dep_delay arr_time sched_arr_time arr_delay carrier
## 0 0 0 0 0
## flight tailnum origin dest air_time
## 0 0 0 0 0
## distance hour minute time_hour
## 0 0 0 0
# Report the number of rows in the raw data.
flights %>%
  nrow() %>%
  paste("Jumlah baris sebelum dibersihkan:", .) %>%
  print()
## [1] "Jumlah baris sebelum dibersihkan: 336776"
# Report the number of rows left after the NA filter.
flights_clean %>%
  nrow() %>%
  paste("Jumlah baris setelah dibersihkan:", .) %>%
  print()
## [1] "Jumlah baris setelah dibersihkan: 327346"
Bisa dilihat bahwa sebanyak 9430 baris data (336776 - 327346) telah dihilangkan.
Lalu kita coba cek outlier pada variabel air_time
# Five-number summary (plus mean) of air_time before outlier removal.
flights_clean %>%
  pull(air_time) %>%
  summary()
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 20.0 82.0 129.0 150.7 192.0 695.0
# Boxplot of air_time before outlier removal. A title and axis label are set so
# this plot can be told apart from the other boxplots in the report.
boxplot(
  flights_clean$air_time,
  main = "Boxplot of Air Time",
  ylab = "air_time"
)
Didapat bahwa air_time memiliki banyak outlier. Agar distribusi tidak terlalu
dipengaruhi nilai ekstrem, kita gunakan metode IQR: data di luar rentang
[Q1 - 1.5*IQR, Q3 + 1.5*IQR] dihapus dari data.
# Remove air_time outliers using the 1.5 * IQR fences.
# Both quartiles come from a single quantile() call, and the spread is stored
# as `air_time_iqr` instead of `IQR` so the script does not shadow stats::IQR().
air_time_quartiles <- quantile(
  flights_clean$air_time,
  probs = c(0.25, 0.75),
  na.rm = TRUE,
  names = FALSE
)
air_time_iqr <- diff(air_time_quartiles)
air_time_lower <- air_time_quartiles[1] - 1.5 * air_time_iqr
air_time_upper <- air_time_quartiles[2] + 1.5 * air_time_iqr

# Keep rows whose air_time lies inside the fences (bounds inclusive).
flights_clean <- flights_clean %>%
  filter(air_time >= air_time_lower & air_time <= air_time_upper)
summary(flights_clean$air_time)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 20.0 81.0 128.0 146.5 186.0 357.0
# Boxplot of air_time after the outlier filter.
boxplot(flights_clean[["air_time"]], main = "Cleaned")
Kita coba juga pada variabel distance
# Boxplot of distance before its own outlier filter is applied.
boxplot(flights_clean[["distance"]], main = "Boxplot of Distance")
dengan metode yang sama kita akan menghilangkan nilai ekstrem pada
distance
# Remove distance outliers using the 1.5 * IQR fences.
# The quartiles are computed on flights_clean as it stands at this point in the
# script. The spread is stored as `distance_iqr` instead of `IQR` so the script
# does not shadow stats::IQR().
distance_quartiles <- quantile(
  flights_clean$distance,
  probs = c(0.25, 0.75),
  na.rm = TRUE,
  names = FALSE
)
distance_iqr <- diff(distance_quartiles)
distance_lower <- distance_quartiles[1] - 1.5 * distance_iqr
distance_upper <- distance_quartiles[2] + 1.5 * distance_iqr

# Keep rows whose distance lies inside the fences (bounds inclusive).
flights_clean <- flights_clean %>%
  filter(distance >= distance_lower & distance <= distance_upper)
boxplot(flights_clean$distance, main = "Boxplot of Distance (Cleaned)")
Selanjutnya kita mengecek apakah ada kategori yang tidak konsisten pada kolom origin; didapat bahwa semuanya konsisten.
# List the distinct origin codes, in order of first appearance.
flights_clean %>%
  distinct(origin) %>%
  pull(origin)
## [1] "EWR" "LGA" "JFK"
Kemudian kita mengecek apakah ada tipe data yang salah; didapat bahwa semuanya sudah benar.
# Show the structure (column types and first values) of the cleaned data.
flights_clean %>% str()
## tibble [321,898 × 19] (S3: tbl_df/tbl/data.frame)
## $ year : int [1:321898] 2013 2013 2013 2013 2013 2013 2013 2013 2013 2013 ...
## $ month : int [1:321898] 1 1 1 1 1 1 1 1 1 1 ...
## $ day : int [1:321898] 1 1 1 1 1 1 1 1 1 1 ...
## $ dep_time : int [1:321898] 517 533 542 544 554 554 555 557 557 558 ...
## $ sched_dep_time: int [1:321898] 515 529 540 545 600 558 600 600 600 600 ...
## $ dep_delay : num [1:321898] 2 4 2 -1 -6 -4 -5 -3 -3 -2 ...
## $ arr_time : int [1:321898] 830 850 923 1004 812 740 913 709 838 753 ...
## $ sched_arr_time: int [1:321898] 819 830 850 1022 837 728 854 723 846 745 ...
## $ arr_delay : num [1:321898] 11 20 33 -18 -25 12 19 -14 -8 8 ...
## $ carrier : chr [1:321898] "UA" "UA" "AA" "B6" ...
## $ flight : int [1:321898] 1545 1714 1141 725 461 1696 507 5708 79 301 ...
## $ tailnum : chr [1:321898] "N14228" "N24211" "N619AA" "N804JB" ...
## $ origin : chr [1:321898] "EWR" "LGA" "JFK" "JFK" ...
## $ dest : chr [1:321898] "IAH" "IAH" "MIA" "BQN" ...
## $ air_time : num [1:321898] 227 227 160 183 116 150 158 53 140 138 ...
## $ distance : num [1:321898] 1400 1416 1089 1576 762 ...
## $ hour : num [1:321898] 5 5 5 5 6 5 6 6 6 6 ...
## $ minute : num [1:321898] 15 29 40 45 0 58 0 0 0 0 ...
## $ time_hour : POSIXct[1:321898], format: "2013-01-01 05:00:00" "2013-01-01 05:00:00" ...
Ringkasan (summary) dari data yang telah dibersihkan:
# Per-column summary statistics of the cleaned data.
flights_clean %>% summary()
## year month day dep_time sched_dep_time
## Min. :2013 Min. : 1.000 Min. : 1.00 Min. : 1 Min. : 500
## 1st Qu.:2013 1st Qu.: 4.000 1st Qu.: 8.00 1st Qu.: 907 1st Qu.: 905
## Median :2013 Median : 7.000 Median :16.00 Median :1401 Median :1356
## Mean :2013 Mean : 6.558 Mean :15.76 Mean :1349 Mean :1341
## 3rd Qu.:2013 3rd Qu.: 9.000 3rd Qu.:23.00 3rd Qu.:1744 3rd Qu.:1729
## Max. :2013 Max. :12.000 Max. :31.00 Max. :2400 Max. :2359
## dep_delay arr_time sched_arr_time arr_delay
## Min. : -43.0 Min. : 1 Min. : 1 Min. : -86.000
## 1st Qu.: -5.0 1st Qu.:1103 1st Qu.:1120 1st Qu.: -17.000
## Median : -2.0 Median :1534 Median :1553 Median : -5.000
## Mean : 12.6 Mean :1500 Mean :1530 Mean : 6.688
## 3rd Qu.: 11.0 3rd Qu.:1938 3rd Qu.:1941 3rd Qu.: 13.000
## Max. :1137.0 Max. :2400 Max. :2359 Max. :1127.000
## carrier flight tailnum origin
## Length:321898 Min. : 1 Length:321898 Length:321898
## Class :character 1st Qu.: 572 Class :character Class :character
## Mode :character Median :1496 Mode :character Mode :character
## Mean :1967
## 3rd Qu.:3443
## Max. :8500
## dest air_time distance hour
## Length:321898 Min. : 20.0 Min. : 80 Min. : 5.00
## Class :character 1st Qu.: 81.0 1st Qu.: 502 1st Qu.: 9.00
## Mode :character Median :128.0 Median : 872 Median :13.00
## Mean :146.5 Mean :1018 Mean :13.14
## 3rd Qu.:186.0 3rd Qu.:1389 3rd Qu.:17.00
## Max. :357.0 Max. :2586 Max. :23.00
## minute time_hour
## Min. : 0.00 Min. :2013-01-01 05:00:00.00
## 1st Qu.: 8.00 1st Qu.:2013-04-05 11:00:00.00
## Median :29.00 Median :2013-07-04 08:00:00.00
## Mean :26.27 Mean :2013-07-03 13:31:08.83
## 3rd Qu.:44.00 3rd Qu.:2013-09-30 19:00:00.00
## Max. :59.00 Max. :2013-12-31 23:00:00.00