Nama Anggota :

-Gilbert Sambira (5052241018)

-Jason Alexander (5052241001)

-Ghisele Valerin (5052241005)

library(nycflights13)
## Warning: package 'nycflights13' was built under R version 4.4.3
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.4.3
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.4.3

Pertama, kita mulai dengan melihat data awal yang akan dibersihkan.

data("flights")
glimpse(flights)
## Rows: 336,776
## Columns: 19
## $ year           <int> 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2…
## $ month          <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ day            <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ dep_time       <int> 517, 533, 542, 544, 554, 554, 555, 557, 557, 558, 558, …
## $ sched_dep_time <int> 515, 529, 540, 545, 600, 558, 600, 600, 600, 600, 600, …
## $ dep_delay      <dbl> 2, 4, 2, -1, -6, -4, -5, -3, -3, -2, -2, -2, -2, -2, -1…
## $ arr_time       <int> 830, 850, 923, 1004, 812, 740, 913, 709, 838, 753, 849,…
## $ sched_arr_time <int> 819, 830, 850, 1022, 837, 728, 854, 723, 846, 745, 851,…
## $ arr_delay      <dbl> 11, 20, 33, -18, -25, 12, 19, -14, -8, 8, -2, -3, 7, -1…
## $ carrier        <chr> "UA", "UA", "AA", "B6", "DL", "UA", "B6", "EV", "B6", "…
## $ flight         <int> 1545, 1714, 1141, 725, 461, 1696, 507, 5708, 79, 301, 4…
## $ tailnum        <chr> "N14228", "N24211", "N619AA", "N804JB", "N668DN", "N394…
## $ origin         <chr> "EWR", "LGA", "JFK", "JFK", "LGA", "EWR", "EWR", "LGA",…
## $ dest           <chr> "IAH", "IAH", "MIA", "BQN", "ATL", "ORD", "FLL", "IAD",…
## $ air_time       <dbl> 227, 227, 160, 183, 116, 150, 158, 53, 140, 138, 149, 1…
## $ distance       <dbl> 1400, 1416, 1089, 1576, 762, 719, 1065, 229, 944, 733, …
## $ hour           <dbl> 5, 5, 5, 5, 6, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 6, 6, 6…
## $ minute         <dbl> 15, 29, 40, 45, 0, 58, 0, 0, 0, 0, 0, 0, 0, 0, 0, 59, 0…
## $ time_hour      <dttm> 2013-01-01 05:00:00, 2013-01-01 05:00:00, 2013-01-01 0…

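Setelah itu, kita mengecek jumlah nilai NA pada tiap kolom dengan colSums. Hasilnya, kolom dep_time, dep_delay, arr_time, arr_delay, tailnum, dan air_time memiliki nilai NA, sedangkan distance tidak memiliki NA sama sekali. Kita gunakan filter(!is.na(...)) pada dep_time, arr_time, air_time, dan distance untuk membuang baris yang tidak lengkap; pengecekan colSums kedua di bawah menunjukkan bahwa setelah penyaringan ini tidak ada lagi nilai NA di kolom mana pun.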

colSums(is.na(flights))
##           year          month            day       dep_time sched_dep_time 
##              0              0              0           8255              0 
##      dep_delay       arr_time sched_arr_time      arr_delay        carrier 
##           8255           8713              0           9430              0 
##         flight        tailnum         origin           dest       air_time 
##              0           2512              0              0           9430 
##       distance           hour         minute      time_hour 
##              0              0              0              0
# Keep only the flights whose departure time, arrival time, air time and
# distance are all present; a row is dropped as soon as any of the four is NA.
flights_clean <- flights %>%
  filter(if_all(c(dep_time, arr_time, air_time, distance), \(x) !is.na(x)))

glimpse(flights_clean)
## Rows: 327,346
## Columns: 19
## $ year           <int> 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2…
## $ month          <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ day            <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ dep_time       <int> 517, 533, 542, 544, 554, 554, 555, 557, 557, 558, 558, …
## $ sched_dep_time <int> 515, 529, 540, 545, 600, 558, 600, 600, 600, 600, 600, …
## $ dep_delay      <dbl> 2, 4, 2, -1, -6, -4, -5, -3, -3, -2, -2, -2, -2, -2, -1…
## $ arr_time       <int> 830, 850, 923, 1004, 812, 740, 913, 709, 838, 753, 849,…
## $ sched_arr_time <int> 819, 830, 850, 1022, 837, 728, 854, 723, 846, 745, 851,…
## $ arr_delay      <dbl> 11, 20, 33, -18, -25, 12, 19, -14, -8, 8, -2, -3, 7, -1…
## $ carrier        <chr> "UA", "UA", "AA", "B6", "DL", "UA", "B6", "EV", "B6", "…
## $ flight         <int> 1545, 1714, 1141, 725, 461, 1696, 507, 5708, 79, 301, 4…
## $ tailnum        <chr> "N14228", "N24211", "N619AA", "N804JB", "N668DN", "N394…
## $ origin         <chr> "EWR", "LGA", "JFK", "JFK", "LGA", "EWR", "EWR", "LGA",…
## $ dest           <chr> "IAH", "IAH", "MIA", "BQN", "ATL", "ORD", "FLL", "IAD",…
## $ air_time       <dbl> 227, 227, 160, 183, 116, 150, 158, 53, 140, 138, 149, 1…
## $ distance       <dbl> 1400, 1416, 1089, 1576, 762, 719, 1065, 229, 944, 733, …
## $ hour           <dbl> 5, 5, 5, 5, 6, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 6, 6, 6…
## $ minute         <dbl> 15, 29, 40, 45, 0, 58, 0, 0, 0, 0, 0, 0, 0, 0, 0, 59, 0…
## $ time_hour      <dttm> 2013-01-01 05:00:00, 2013-01-01 05:00:00, 2013-01-01 0…
colSums(is.na(flights_clean))
##           year          month            day       dep_time sched_dep_time 
##              0              0              0              0              0 
##      dep_delay       arr_time sched_arr_time      arr_delay        carrier 
##              0              0              0              0              0 
##         flight        tailnum         origin           dest       air_time 
##              0              0              0              0              0 
##       distance           hour         minute      time_hour 
##              0              0              0              0
print(paste("Jumlah baris sebelum dibersihkan:",nrow(flights)))
## [1] "Jumlah baris sebelum dibersihkan: 336776"
print(paste("Jumlah baris setelah dibersihkan:",nrow(flights_clean)))
## [1] "Jumlah baris setelah dibersihkan: 327346"

Bisa dilihat bahwa sebanyak 9430 baris data (336776 − 327346) telah dihilangkan.

Lalu kita coba cek outlier pada variabel air_time

summary(flights_clean$air_time)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    20.0    82.0   129.0   150.7   192.0   695.0
boxplot(flights_clean$air_time)

Dari boxplot terlihat bahwa air_time memiliki banyak outlier. Untuk mengurangi pengaruh nilai ekstrem, kita gunakan metode IQR: baris dengan air_time di luar rentang [Q1 − 1,5 × IQR, Q3 + 1,5 × IQR] dihapus dari data.

# Remove air_time outliers with the 1.5 * IQR rule.
# The quartiles are stored under descriptive names rather than assigning to
# `IQR`, so the script no longer masks stats::IQR() in the global environment.
air_time_q <- quantile(flights_clean$air_time, c(0.25, 0.75),
                       na.rm = TRUE, names = FALSE)
# 1.5 times the interquartile range (Q3 - Q1): the distance of each fence
# from its quartile.
air_time_fence <- 1.5 * diff(air_time_q)

# between() is inclusive on both ends, i.e. `>= lower & <= upper`.
flights_clean <- flights_clean %>%
  filter(between(air_time,
                 air_time_q[1] - air_time_fence,
                 air_time_q[2] + air_time_fence))
summary(flights_clean$air_time)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    20.0    81.0   128.0   146.5   186.0   357.0
boxplot(flights_clean$air_time, main="Cleaned")

Kita coba juga pada variabel distance

boxplot(flights_clean$distance, main = "Boxplot of Distance")

dengan metode yang sama kita akan menghilangkan nilai ekstrem pada distance

# Remove distance outliers with the 1.5 * IQR rule.
# The quartiles are stored under descriptive names rather than assigning to
# `IQR`, so the script no longer masks stats::IQR() in the global environment.
distance_q <- quantile(flights_clean$distance, c(0.25, 0.75),
                       na.rm = TRUE, names = FALSE)
# 1.5 times the interquartile range (Q3 - Q1): the distance of each fence
# from its quartile.
distance_fence <- 1.5 * diff(distance_q)

# between() is inclusive on both ends, i.e. `>= lower & <= upper`.
flights_clean <- flights_clean %>%
  filter(between(distance,
                 distance_q[1] - distance_fence,
                 distance_q[2] + distance_fence))

boxplot(flights_clean$distance)

Selanjutnya kita mengecek apakah ada kategori yang tidak konsisten pada kolom origin. Hasilnya hanya ada tiga nilai unik (EWR, LGA, JFK), sehingga semuanya konsisten.

unique(flights_clean$origin)
## [1] "EWR" "LGA" "JFK"

Kemudian kita mengecek tipe data tiap kolom dengan str(); hasilnya semua kolom sudah memiliki tipe data yang sesuai.

str(flights_clean)
## tibble [321,898 × 19] (S3: tbl_df/tbl/data.frame)
##  $ year          : int [1:321898] 2013 2013 2013 2013 2013 2013 2013 2013 2013 2013 ...
##  $ month         : int [1:321898] 1 1 1 1 1 1 1 1 1 1 ...
##  $ day           : int [1:321898] 1 1 1 1 1 1 1 1 1 1 ...
##  $ dep_time      : int [1:321898] 517 533 542 544 554 554 555 557 557 558 ...
##  $ sched_dep_time: int [1:321898] 515 529 540 545 600 558 600 600 600 600 ...
##  $ dep_delay     : num [1:321898] 2 4 2 -1 -6 -4 -5 -3 -3 -2 ...
##  $ arr_time      : int [1:321898] 830 850 923 1004 812 740 913 709 838 753 ...
##  $ sched_arr_time: int [1:321898] 819 830 850 1022 837 728 854 723 846 745 ...
##  $ arr_delay     : num [1:321898] 11 20 33 -18 -25 12 19 -14 -8 8 ...
##  $ carrier       : chr [1:321898] "UA" "UA" "AA" "B6" ...
##  $ flight        : int [1:321898] 1545 1714 1141 725 461 1696 507 5708 79 301 ...
##  $ tailnum       : chr [1:321898] "N14228" "N24211" "N619AA" "N804JB" ...
##  $ origin        : chr [1:321898] "EWR" "LGA" "JFK" "JFK" ...
##  $ dest          : chr [1:321898] "IAH" "IAH" "MIA" "BQN" ...
##  $ air_time      : num [1:321898] 227 227 160 183 116 150 158 53 140 138 ...
##  $ distance      : num [1:321898] 1400 1416 1089 1576 762 ...
##  $ hour          : num [1:321898] 5 5 5 5 6 5 6 6 6 6 ...
##  $ minute        : num [1:321898] 15 29 40 45 0 58 0 0 0 0 ...
##  $ time_hour     : POSIXct[1:321898], format: "2013-01-01 05:00:00" "2013-01-01 05:00:00" ...

Summary dari data yang telah di clean

summary(flights_clean)
##       year          month             day           dep_time    sched_dep_time
##  Min.   :2013   Min.   : 1.000   Min.   : 1.00   Min.   :   1   Min.   : 500  
##  1st Qu.:2013   1st Qu.: 4.000   1st Qu.: 8.00   1st Qu.: 907   1st Qu.: 905  
##  Median :2013   Median : 7.000   Median :16.00   Median :1401   Median :1356  
##  Mean   :2013   Mean   : 6.558   Mean   :15.76   Mean   :1349   Mean   :1341  
##  3rd Qu.:2013   3rd Qu.: 9.000   3rd Qu.:23.00   3rd Qu.:1744   3rd Qu.:1729  
##  Max.   :2013   Max.   :12.000   Max.   :31.00   Max.   :2400   Max.   :2359  
##    dep_delay         arr_time    sched_arr_time   arr_delay       
##  Min.   : -43.0   Min.   :   1   Min.   :   1   Min.   : -86.000  
##  1st Qu.:  -5.0   1st Qu.:1103   1st Qu.:1120   1st Qu.: -17.000  
##  Median :  -2.0   Median :1534   Median :1553   Median :  -5.000  
##  Mean   :  12.6   Mean   :1500   Mean   :1530   Mean   :   6.688  
##  3rd Qu.:  11.0   3rd Qu.:1938   3rd Qu.:1941   3rd Qu.:  13.000  
##  Max.   :1137.0   Max.   :2400   Max.   :2359   Max.   :1127.000  
##    carrier              flight       tailnum             origin         
##  Length:321898      Min.   :   1   Length:321898      Length:321898     
##  Class :character   1st Qu.: 572   Class :character   Class :character  
##  Mode  :character   Median :1496   Mode  :character   Mode  :character  
##                     Mean   :1967                                        
##                     3rd Qu.:3443                                        
##                     Max.   :8500                                        
##      dest              air_time        distance         hour      
##  Length:321898      Min.   : 20.0   Min.   :  80   Min.   : 5.00  
##  Class :character   1st Qu.: 81.0   1st Qu.: 502   1st Qu.: 9.00  
##  Mode  :character   Median :128.0   Median : 872   Median :13.00  
##                     Mean   :146.5   Mean   :1018   Mean   :13.14  
##                     3rd Qu.:186.0   3rd Qu.:1389   3rd Qu.:17.00  
##                     Max.   :357.0   Max.   :2586   Max.   :23.00  
##      minute        time_hour                     
##  Min.   : 0.00   Min.   :2013-01-01 05:00:00.00  
##  1st Qu.: 8.00   1st Qu.:2013-04-05 11:00:00.00  
##  Median :29.00   Median :2013-07-04 08:00:00.00  
##  Mean   :26.27   Mean   :2013-07-03 13:31:08.83  
##  3rd Qu.:44.00   3rd Qu.:2013-09-30 19:00:00.00  
##  Max.   :59.00   Max.   :2013-12-31 23:00:00.00