##Data Issue Assignment In this case, we are using flights dataset. ###Importing dataset and library From this code below, we know that this dataset contains 336.776 rows and 19 column with 8 variables in integer, 7 variables in double, 4 variables in char.
glimpse(flights)
## Rows: 336,776
## Columns: 19
## $ year <int> 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2…
## $ month <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ day <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ dep_time <int> 517, 533, 542, 544, 554, 554, 555, 557, 557, 558, 558, …
## $ sched_dep_time <int> 515, 529, 540, 545, 600, 558, 600, 600, 600, 600, 600, …
## $ dep_delay <dbl> 2, 4, 2, -1, -6, -4, -5, -3, -3, -2, -2, -2, -2, -2, -1…
## $ arr_time <int> 830, 850, 923, 1004, 812, 740, 913, 709, 838, 753, 849,…
## $ sched_arr_time <int> 819, 830, 850, 1022, 837, 728, 854, 723, 846, 745, 851,…
## $ arr_delay <dbl> 11, 20, 33, -18, -25, 12, 19, -14, -8, 8, -2, -3, 7, -1…
## $ carrier <chr> "UA", "UA", "AA", "B6", "DL", "UA", "B6", "EV", "B6", "…
## $ flight <int> 1545, 1714, 1141, 725, 461, 1696, 507, 5708, 79, 301, 4…
## $ tailnum <chr> "N14228", "N24211", "N619AA", "N804JB", "N668DN", "N394…
## $ origin <chr> "EWR", "LGA", "JFK", "JFK", "LGA", "EWR", "EWR", "LGA",…
## $ dest <chr> "IAH", "IAH", "MIA", "BQN", "ATL", "ORD", "FLL", "IAD",…
## $ air_time <dbl> 227, 227, 160, 183, 116, 150, 158, 53, 140, 138, 149, 1…
## $ distance <dbl> 1400, 1416, 1089, 1576, 762, 719, 1065, 229, 944, 733, …
## $ hour <dbl> 5, 5, 5, 5, 6, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 6, 6, 6…
## $ minute <dbl> 15, 29, 40, 45, 0, 58, 0, 0, 0, 0, 0, 0, 0, 0, 0, 59, 0…
## $ time_hour <dttm> 2013-01-01 05:00:00, 2013-01-01 05:00:00, 2013-01-01 0…
how_many_int <- sum(sapply(flights, is.integer))
how_many_dbl <- sum(sapply(flights, is.double))
how_many_chr <- sum(sapply(flights, is.character))
print(how_many_chr)
## [1] 4
###Checking missing values This function tells us that we have 8255 rows with missing values in dep_time and dep_delay, 8713 rows in arr_time, 9430 rows in arr_delay and air_time, and 2512 rows in tailnum.
flights %>%
summarise((across(everything(), ~sum(is.na(.)))))
## # A tibble: 1 × 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <int> <int> <int>
## 1 0 0 0 8255 0 8255 8713 0
## # ℹ 11 more variables: arr_delay <int>, carrier <int>, flight <int>,
## # tailnum <int>, origin <int>, dest <int>, air_time <int>, distance <int>,
## # hour <int>, minute <int>, time_hour <int>
Let’s erase that and we’re good.
flights_nona <- drop_na(flights)
flights_nona
## # A tibble: 327,346 × 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 517 515 2 830 819
## 2 2013 1 1 533 529 4 850 830
## 3 2013 1 1 542 540 2 923 850
## 4 2013 1 1 544 545 -1 1004 1022
## 5 2013 1 1 554 600 -6 812 837
## 6 2013 1 1 554 558 -4 740 728
## 7 2013 1 1 555 600 -5 913 854
## 8 2013 1 1 557 600 -3 709 723
## 9 2013 1 1 557 600 -3 838 846
## 10 2013 1 1 558 600 -2 753 745
## # ℹ 327,336 more rows
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## # hour <dbl>, minute <dbl>, time_hour <dttm>
###Checking and viewing duplicate rows Currently we have 0 dupicate rows, it is good!
flights_nona %>%
duplicated() %>%
sum()
## [1] 0
flights_nona %>% filter(duplicated(.))
## # A tibble: 0 × 19
## # ℹ 19 variables: year <int>, month <int>, day <int>, dep_time <int>,
## # sched_dep_time <int>, dep_delay <dbl>, arr_time <int>,
## # sched_arr_time <int>, arr_delay <dbl>, carrier <chr>, flight <int>,
## # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## # hour <dbl>, minute <dbl>, time_hour <dttm>
###Checking outliers It is good to make boxplot first in case we actually do not have outlier. Let’s go by Departure Delay first!
ggplot(flights_nona, aes(y = dep_delay)) +
geom_boxplot(outlier.colour = 'navy') +
labs( title = "Departure Delay Distribution",
y = "Departure Delay (minutes)"
)
Next is Arrival Delay:
ggplot(flights_nona, aes(y = arr_delay)) +
geom_boxplot(outlier.colour = 'navy') +
labs( title = "Arrival Delay Distribution",
y = "Arrival Delay (minutes)"
)
For Air Time:
ggplot(flights_nona, aes(y = air_time)) +
geom_boxplot(outlier.color="navy") +
labs(
title = "Air Time Distributions",
y = "Air Time (minutes)"
)
It seems that we have so many outliers :( Let’s try to search the IQR
and filters the outliers for Departure Delay.
Q1 <- quantile(flights_nona$dep_delay, 0.25, na.rm = TRUE)
Q3 <- quantile(flights_nona$dep_delay, 0.75, na.rm = TRUE)
IQR <- Q3 - Q1
flights_nona_noutlier <- flights_nona %>%
filter(dep_delay < (Q1 - 1.5 * IQR) | dep_delay > (Q3 + 1.5 * IQR))
flights_nona_noutlier
## # A tibble: 42,857 × 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 732 645 47 1011 941
## 2 2013 1 1 749 710 39 939 850
## 3 2013 1 1 811 630 101 1047 830
## 4 2013 1 1 826 715 71 1136 1045
## 5 2013 1 1 848 1835 853 1001 1950
## 6 2013 1 1 903 820 43 1045 955
## 7 2013 1 1 909 810 59 1331 1315
## 8 2013 1 1 957 733 144 1056 853
## 9 2013 1 1 1114 900 134 1447 1222
## 10 2013 1 1 1120 944 96 1331 1213
## # ℹ 42,847 more rows
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## # hour <dbl>, minute <dbl>, time_hour <dttm>
###Checking incorrect data types
flights_nona_noutlier %>% summarise(across(everything(), class))
## Warning: Returning more (or less) than 1 row per `summarise()` group was deprecated in
## dplyr 1.1.0.
## ℹ Please use `reframe()` instead.
## ℹ When switching from `summarise()` to `reframe()`, remember that `reframe()`
## always returns an ungrouped data frame and adjust accordingly.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## # A tibble: 2 × 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 integer integ… inte… integer integer numeric integer integer
## 2 integer integ… inte… integer integer numeric integer integer
## # ℹ 11 more variables: arr_delay <chr>, carrier <chr>, flight <chr>,
## # tailnum <chr>, origin <chr>, dest <chr>, air_time <chr>, distance <chr>,
## # hour <chr>, minute <chr>, time_hour <chr>
###Checking logical inconsistencies In this case, departure delay coming from dep_time - sched_dep_time. So, if dep delay is != dep_time - sched_dep_time, there must be calculation error. And the same goes for arrival delays.
flights_nona_noutlier_newest <- flights_nona_noutlier %>%
filter(dep_delay == (dep_time - sched_dep_time))
flights_nona_noutlier_newest
## # A tibble: 4,486 × 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 749 710 39 939 850
## 2 2013 1 1 1253 1212 41 1524 1436
## 3 2013 1 1 1255 1200 55 1451 1330
## 4 2013 1 1 1355 1315 40 1538 1452
## 5 2013 1 1 1757 1703 54 1904 1813
## 6 2013 1 1 2056 2004 52 2156 2112
## 7 2013 1 2 839 800 39 1205 1144
## 8 2013 1 2 850 804 46 1112 1007
## 9 2013 1 2 951 900 51 1157 1045
## 10 2013 1 2 1236 1200 36 1417 1330
## # ℹ 4,476 more rows
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## # hour <dbl>, minute <dbl>, time_hour <dttm>
##Finished Check out the final data:
summary(flights_nona_noutlier_newest)
## year month day dep_time sched_dep_time
## Min. :2013 Min. : 1.000 Min. : 1.00 Min. : 544 Min. : 500
## 1st Qu.:2013 1st Qu.: 4.000 1st Qu.: 9.00 1st Qu.:1253 1st Qu.:1205
## Median :2013 Median : 6.000 Median :16.00 Median :1657 Median :1615
## Mean :2013 Mean : 6.338 Mean :15.81 Mean :1580 Mean :1535
## 3rd Qu.:2013 3rd Qu.: 9.000 3rd Qu.:23.00 3rd Qu.:1941 3rd Qu.:1900
## Max. :2013 Max. :12.000 Max. :31.00 Max. :2357 Max. :2305
## dep_delay arr_time sched_arr_time arr_delay
## Min. :-33.00 Min. : 1 Min. : 1 Min. :-58.00
## 1st Qu.: 39.00 1st Qu.:1407 1st Qu.:1404 1st Qu.: 27.00
## Median : 43.00 Median :1849 Median :1830 Median : 38.00
## Mean : 44.41 Mean :1701 Mean :1704 Mean : 40.16
## 3rd Qu.: 49.00 3rd Qu.:2115 3rd Qu.:2049 3rd Qu.: 51.00
## Max. : 59.00 Max. :2400 Max. :2359 Max. :197.00
## carrier flight tailnum origin
## Length:4486 Min. : 1 Length:4486 Length:4486
## Class :character 1st Qu.: 654 Class :character Class :character
## Mode :character Median :1625 Mode :character Mode :character
## Mean :2110
## 3rd Qu.:3718
## Max. :6181
## dest air_time distance hour
## Length:4486 Min. : 20.00 Min. : 80.0 Min. : 5.00
## Class :character 1st Qu.: 75.25 1st Qu.: 479.0 1st Qu.:12.00
## Mode :character Median :118.00 Median : 760.0 Median :16.00
## Mean :136.12 Mean : 931.1 Mean :15.31
## 3rd Qu.:162.00 3rd Qu.:1089.0 3rd Qu.:19.00
## Max. :635.00 Max. :4983.0 Max. :23.00
## minute time_hour
## Min. : 0.000 Min. :2013-01-01 07:00:00.00
## 1st Qu.: 0.000 1st Qu.:2013-04-02 13:30:00.00
## Median : 1.000 Median :2013-06-18 21:00:00.00
## Mean : 4.836 Mean :2013-06-26 23:00:21.67
## 3rd Qu.:10.000 3rd Qu.:2013-09-19 17:30:00.00
## Max. :55.000 Max. :2013-12-31 18:00:00.00