##Data Issue Assignment In this case, we are using flights dataset. ###Importing dataset and library From this code below, we know that this dataset contains 336.776 rows and 19 column with 8 variables in integer, 7 variables in double, 4 variables in char.

glimpse(flights)
## Rows: 336,776
## Columns: 19
## $ year           <int> 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2…
## $ month          <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ day            <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ dep_time       <int> 517, 533, 542, 544, 554, 554, 555, 557, 557, 558, 558, …
## $ sched_dep_time <int> 515, 529, 540, 545, 600, 558, 600, 600, 600, 600, 600, …
## $ dep_delay      <dbl> 2, 4, 2, -1, -6, -4, -5, -3, -3, -2, -2, -2, -2, -2, -1…
## $ arr_time       <int> 830, 850, 923, 1004, 812, 740, 913, 709, 838, 753, 849,…
## $ sched_arr_time <int> 819, 830, 850, 1022, 837, 728, 854, 723, 846, 745, 851,…
## $ arr_delay      <dbl> 11, 20, 33, -18, -25, 12, 19, -14, -8, 8, -2, -3, 7, -1…
## $ carrier        <chr> "UA", "UA", "AA", "B6", "DL", "UA", "B6", "EV", "B6", "…
## $ flight         <int> 1545, 1714, 1141, 725, 461, 1696, 507, 5708, 79, 301, 4…
## $ tailnum        <chr> "N14228", "N24211", "N619AA", "N804JB", "N668DN", "N394…
## $ origin         <chr> "EWR", "LGA", "JFK", "JFK", "LGA", "EWR", "EWR", "LGA",…
## $ dest           <chr> "IAH", "IAH", "MIA", "BQN", "ATL", "ORD", "FLL", "IAD",…
## $ air_time       <dbl> 227, 227, 160, 183, 116, 150, 158, 53, 140, 138, 149, 1…
## $ distance       <dbl> 1400, 1416, 1089, 1576, 762, 719, 1065, 229, 944, 733, …
## $ hour           <dbl> 5, 5, 5, 5, 6, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 6, 6, 6…
## $ minute         <dbl> 15, 29, 40, 45, 0, 58, 0, 0, 0, 0, 0, 0, 0, 0, 0, 59, 0…
## $ time_hour      <dttm> 2013-01-01 05:00:00, 2013-01-01 05:00:00, 2013-01-01 0…
how_many_int <- sum(sapply(flights, is.integer))
how_many_dbl <- sum(sapply(flights, is.double))
how_many_chr <- sum(sapply(flights, is.character))
print(how_many_chr)
## [1] 4

###Checking missing values This function tells us that we have 8255 rows with missing values in dep_time and dep_delay, 8713 rows in arr_time, 9430 rows in arr_delay and air_time, and 2512 rows in tailnum.

flights %>%
  summarise((across(everything(), ~sum(is.na(.)))))
## # A tibble: 1 × 19
##    year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##   <int> <int> <int>    <int>          <int>     <int>    <int>          <int>
## 1     0     0     0     8255              0      8255     8713              0
## # ℹ 11 more variables: arr_delay <int>, carrier <int>, flight <int>,
## #   tailnum <int>, origin <int>, dest <int>, air_time <int>, distance <int>,
## #   hour <int>, minute <int>, time_hour <int>

Let’s erase that and we’re good.

flights_nona <- drop_na(flights)
flights_nona
## # A tibble: 327,346 × 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1      517            515         2      830            819
##  2  2013     1     1      533            529         4      850            830
##  3  2013     1     1      542            540         2      923            850
##  4  2013     1     1      544            545        -1     1004           1022
##  5  2013     1     1      554            600        -6      812            837
##  6  2013     1     1      554            558        -4      740            728
##  7  2013     1     1      555            600        -5      913            854
##  8  2013     1     1      557            600        -3      709            723
##  9  2013     1     1      557            600        -3      838            846
## 10  2013     1     1      558            600        -2      753            745
## # ℹ 327,336 more rows
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## #   hour <dbl>, minute <dbl>, time_hour <dttm>

###Checking and viewing duplicate rows Currently we have 0 dupicate rows, it is good!

flights_nona %>%
  duplicated() %>%
  sum()
## [1] 0
flights_nona %>% filter(duplicated(.))
## # A tibble: 0 × 19
## # ℹ 19 variables: year <int>, month <int>, day <int>, dep_time <int>,
## #   sched_dep_time <int>, dep_delay <dbl>, arr_time <int>,
## #   sched_arr_time <int>, arr_delay <dbl>, carrier <chr>, flight <int>,
## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## #   hour <dbl>, minute <dbl>, time_hour <dttm>

###Checking outliers It is good to make boxplot first in case we actually do not have outlier. Let’s go by Departure Delay first!

ggplot(flights_nona, aes(y = dep_delay)) +
geom_boxplot(outlier.colour = 'navy') +
labs( title = "Departure Delay Distribution",
      y = "Departure Delay (minutes)"
  )

Next is Arrival Delay:

ggplot(flights_nona, aes(y = arr_delay)) +
geom_boxplot(outlier.colour = 'navy') +
labs( title = "Arrival Delay Distribution",
      y = "Arrival Delay (minutes)"
  )

For Air Time:

ggplot(flights_nona, aes(y = air_time)) +
  geom_boxplot(outlier.color="navy") +
  labs(
    title = "Air Time Distributions",
    y = "Air Time (minutes)"
  )

It seems that we have so many outliers :( Let’s try to search the IQR and filters the outliers for Departure Delay.

Q1 <- quantile(flights_nona$dep_delay, 0.25, na.rm = TRUE)
Q3 <- quantile(flights_nona$dep_delay, 0.75, na.rm = TRUE)
 IQR <- Q3 - Q1
 
flights_nona_noutlier <- flights_nona %>%
  filter(dep_delay < (Q1 - 1.5 * IQR) | dep_delay > (Q3 + 1.5 * IQR))

flights_nona_noutlier
## # A tibble: 42,857 × 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1      732            645        47     1011            941
##  2  2013     1     1      749            710        39      939            850
##  3  2013     1     1      811            630       101     1047            830
##  4  2013     1     1      826            715        71     1136           1045
##  5  2013     1     1      848           1835       853     1001           1950
##  6  2013     1     1      903            820        43     1045            955
##  7  2013     1     1      909            810        59     1331           1315
##  8  2013     1     1      957            733       144     1056            853
##  9  2013     1     1     1114            900       134     1447           1222
## 10  2013     1     1     1120            944        96     1331           1213
## # ℹ 42,847 more rows
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## #   hour <dbl>, minute <dbl>, time_hour <dttm>

###Checking incorrect data types

flights_nona_noutlier %>% summarise(across(everything(), class))
## Warning: Returning more (or less) than 1 row per `summarise()` group was deprecated in
## dplyr 1.1.0.
## ℹ Please use `reframe()` instead.
## ℹ When switching from `summarise()` to `reframe()`, remember that `reframe()`
##   always returns an ungrouped data frame and adjust accordingly.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## # A tibble: 2 × 19
##   year    month  day   dep_time sched_dep_time dep_delay arr_time sched_arr_time
##   <chr>   <chr>  <chr> <chr>    <chr>          <chr>     <chr>    <chr>         
## 1 integer integ… inte… integer  integer        numeric   integer  integer       
## 2 integer integ… inte… integer  integer        numeric   integer  integer       
## # ℹ 11 more variables: arr_delay <chr>, carrier <chr>, flight <chr>,
## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <chr>, distance <chr>,
## #   hour <chr>, minute <chr>, time_hour <chr>

###Checking logical inconsistencies In this case, departure delay coming from dep_time - sched_dep_time. So, if dep delay is != dep_time - sched_dep_time, there must be calculation error. And the same goes for arrival delays.

flights_nona_noutlier_newest <- flights_nona_noutlier %>% 
  filter(dep_delay == (dep_time - sched_dep_time))

flights_nona_noutlier_newest
## # A tibble: 4,486 × 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1      749            710        39      939            850
##  2  2013     1     1     1253           1212        41     1524           1436
##  3  2013     1     1     1255           1200        55     1451           1330
##  4  2013     1     1     1355           1315        40     1538           1452
##  5  2013     1     1     1757           1703        54     1904           1813
##  6  2013     1     1     2056           2004        52     2156           2112
##  7  2013     1     2      839            800        39     1205           1144
##  8  2013     1     2      850            804        46     1112           1007
##  9  2013     1     2      951            900        51     1157           1045
## 10  2013     1     2     1236           1200        36     1417           1330
## # ℹ 4,476 more rows
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## #   hour <dbl>, minute <dbl>, time_hour <dttm>

##Finished Check out the final data:

summary(flights_nona_noutlier_newest)
##       year          month             day           dep_time    sched_dep_time
##  Min.   :2013   Min.   : 1.000   Min.   : 1.00   Min.   : 544   Min.   : 500  
##  1st Qu.:2013   1st Qu.: 4.000   1st Qu.: 9.00   1st Qu.:1253   1st Qu.:1205  
##  Median :2013   Median : 6.000   Median :16.00   Median :1657   Median :1615  
##  Mean   :2013   Mean   : 6.338   Mean   :15.81   Mean   :1580   Mean   :1535  
##  3rd Qu.:2013   3rd Qu.: 9.000   3rd Qu.:23.00   3rd Qu.:1941   3rd Qu.:1900  
##  Max.   :2013   Max.   :12.000   Max.   :31.00   Max.   :2357   Max.   :2305  
##    dep_delay         arr_time    sched_arr_time   arr_delay     
##  Min.   :-33.00   Min.   :   1   Min.   :   1   Min.   :-58.00  
##  1st Qu.: 39.00   1st Qu.:1407   1st Qu.:1404   1st Qu.: 27.00  
##  Median : 43.00   Median :1849   Median :1830   Median : 38.00  
##  Mean   : 44.41   Mean   :1701   Mean   :1704   Mean   : 40.16  
##  3rd Qu.: 49.00   3rd Qu.:2115   3rd Qu.:2049   3rd Qu.: 51.00  
##  Max.   : 59.00   Max.   :2400   Max.   :2359   Max.   :197.00  
##    carrier              flight       tailnum             origin         
##  Length:4486        Min.   :   1   Length:4486        Length:4486       
##  Class :character   1st Qu.: 654   Class :character   Class :character  
##  Mode  :character   Median :1625   Mode  :character   Mode  :character  
##                     Mean   :2110                                        
##                     3rd Qu.:3718                                        
##                     Max.   :6181                                        
##      dest              air_time         distance           hour      
##  Length:4486        Min.   : 20.00   Min.   :  80.0   Min.   : 5.00  
##  Class :character   1st Qu.: 75.25   1st Qu.: 479.0   1st Qu.:12.00  
##  Mode  :character   Median :118.00   Median : 760.0   Median :16.00  
##                     Mean   :136.12   Mean   : 931.1   Mean   :15.31  
##                     3rd Qu.:162.00   3rd Qu.:1089.0   3rd Qu.:19.00  
##                     Max.   :635.00   Max.   :4983.0   Max.   :23.00  
##      minute         time_hour                     
##  Min.   : 0.000   Min.   :2013-01-01 07:00:00.00  
##  1st Qu.: 0.000   1st Qu.:2013-04-02 13:30:00.00  
##  Median : 1.000   Median :2013-06-18 21:00:00.00  
##  Mean   : 4.836   Mean   :2013-06-26 23:00:21.67  
##  3rd Qu.:10.000   3rd Qu.:2013-09-19 17:30:00.00  
##  Max.   :55.000   Max.   :2013-12-31 18:00:00.00