### Visualization of Missing Values per Column (Before Cleaning)
# Count the NA entries in every column of `flights`, reshape the one-row
# result to one row per column, and draw the counts as horizontal bars.
flights %>%
  summarise(
    across(.cols = everything(), .fns = function(col) sum(is.na(col)))
  ) %>%
  pivot_longer(
    cols = everything(),
    names_to = "variable",
    values_to = "missing"
  ) %>%
  ggplot(mapping = aes(x = reorder(variable, -missing), y = missing)) +
  geom_col(fill = "cyan") +
  coord_flip() +
  ggtitle("Missing Values Per Column (Before Cleaning)") +
  xlab("Variables") +
  ylab("Missing Count")

### Checking for Duplicated Rows
# Number of rows that exactly repeat an earlier row (all columns compared).
sum(duplicated(flights))
## [1] 0
# Show the duplicated rows themselves; the count above is 0, so this is empty.
filter(flights, duplicated(flights))
## # A tibble: 0 × 19
## # ℹ 19 variables: year <int>, month <int>, day <int>, dep_time <int>,
## # sched_dep_time <int>, dep_delay <dbl>, arr_time <int>,
## # sched_arr_time <int>, arr_delay <dbl>, carrier <chr>, flight <int>,
## # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## # hour <dbl>, minute <dbl>, time_hour <dttm>
### Checking Outliers for Column "distance"
# Box plot of distance for a quick visual check of the spread.
ggplot(flights, aes(y = distance)) +
  geom_boxplot()

# Flag outliers with the 1.5 * IQR rule: anything below Q1 - 1.5 * IQR or
# above Q3 + 1.5 * IQR is collected in `outliers`.
Q1 <- quantile(flights$distance, 0.25, na.rm = TRUE)
Q3 <- quantile(flights$distance, 0.75, na.rm = TRUE)
# Kept under its own name so the workspace does not mask stats::IQR().
distance_iqr <- Q3 - Q1
lower_fence <- Q1 - 1.5 * distance_iqr
upper_fence <- Q3 + 1.5 * distance_iqr
outliers <- flights %>%
  filter(distance < lower_fence | distance > upper_fence)
### Visualization of Departure Delay (Before Cleaning)
# Histogram of dep_delay in 50 bins. The x scale is limited to [-50, 300];
# missing and out-of-range values are dropped (see the warnings below).
ggplot(data = flights, mapping = aes(x = dep_delay)) +
  geom_histogram(bins = 50, fill = "red", color = "black") +
  scale_x_continuous(limits = c(-50, 300)) +
  ggtitle("Departure Delay Distribution (Before Cleaning)") +
  xlab("Departure Delay (minutes)") +
  ylab("Count")
## Warning: Removed 8865 rows containing non-finite outside the scale range
## (`stat_bin()`).
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_bar()`).

### Visualization of Arrival Delay (Before Cleaning)
# Histogram of arr_delay in 50 bins. The x scale is limited to [-50, 300];
# missing and out-of-range values are dropped (see the warnings below).
ggplot(data = flights, mapping = aes(x = arr_delay)) +
  geom_histogram(bins = 50, fill = "yellow", color = "black") +
  scale_x_continuous(limits = c(-50, 300)) +
  ggtitle("Arrival Delay Distribution (Before Cleaning)") +
  xlab("Arrival Delay (minutes)") +
  ylab("Count")
## Warning: Removed 11152 rows containing non-finite outside the scale range
## (`stat_bin()`).
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_bar()`).

### Visualization of Departure Time vs Arrival Time (Before Cleaning)
# Scatter plot of arr_time against dep_time; the low alpha keeps dense
# regions readable. Rows with missing values are dropped (warning below).
ggplot(flights) +
  geom_point(
    aes(x = dep_time, y = arr_time),
    alpha = 0.1,
    color = "orange"
  ) +
  labs(
    title = "Departure Time vs Arrival Time (Before Cleaning)",
    x = "Departure Time",
    y = "Arrival Time"
  )
## Warning: Removed 8713 rows containing missing values or values outside the scale range
## (`geom_point()`).

### Visualization of Departure Delay vs Arrival Delay (Before Cleaning)
# Scatter plot of arr_delay against dep_delay. Rows with missing values are
# dropped by the point layer (warning below).
ggplot(flights) +
  geom_point(
    aes(x = dep_delay, y = arr_delay),
    alpha = 0.1,
    color = "blue"
  ) +
  labs(
    title = "Departure Delay vs Arrival Delay (Before Cleaning)",
    x = "Departure Delay",
    y = "Arrival Delay"
  )
## Warning: Removed 9430 rows containing missing values or values outside the scale range
## (`geom_point()`).

### Visualization of Distance vs Air Time (Before Cleaning)
# Scatter plot of air_time against distance. Rows with missing values are
# dropped by the point layer (warning below).
ggplot(flights) +
  geom_point(
    aes(x = distance, y = air_time),
    alpha = 0.1,
    color = "brown"
  ) +
  labs(
    title = "Distance vs Air Time (Before Cleaning)",
    x = "Distance (miles)",
    y = "Air Time (minutes)"
  )
## Warning: Removed 9430 rows containing missing values or values outside the scale range
## (`geom_point()`).

### Visualization of Flight Count by Carrier (Before Cleaning)
# One bar per carrier, ordered by number of flights, drawn horizontally.
ggplot(
  count(flights, carrier),
  aes(x = reorder(carrier, n), y = n)
) +
  geom_col(fill = "maroon") +
  coord_flip() +
  ggtitle("Flight Count by Carrier (Before Cleaning)") +
  xlab("Carrier") +
  ylab("Flights")

# Per-column summary of the raw data (ranges, quartiles, NA counts).
summary(flights)
## year month day dep_time sched_dep_time
## Min. :2013 Min. : 1.000 Min. : 1.00 Min. : 1 Min. : 106
## 1st Qu.:2013 1st Qu.: 4.000 1st Qu.: 8.00 1st Qu.: 907 1st Qu.: 906
## Median :2013 Median : 7.000 Median :16.00 Median :1401 Median :1359
## Mean :2013 Mean : 6.549 Mean :15.71 Mean :1349 Mean :1344
## 3rd Qu.:2013 3rd Qu.:10.000 3rd Qu.:23.00 3rd Qu.:1744 3rd Qu.:1729
## Max. :2013 Max. :12.000 Max. :31.00 Max. :2400 Max. :2359
## NA's :8255
## dep_delay arr_time sched_arr_time arr_delay
## Min. : -43.00 Min. : 1 Min. : 1 Min. : -86.000
## 1st Qu.: -5.00 1st Qu.:1104 1st Qu.:1124 1st Qu.: -17.000
## Median : -2.00 Median :1535 Median :1556 Median : -5.000
## Mean : 12.64 Mean :1502 Mean :1536 Mean : 6.895
## 3rd Qu.: 11.00 3rd Qu.:1940 3rd Qu.:1945 3rd Qu.: 14.000
## Max. :1301.00 Max. :2400 Max. :2359 Max. :1272.000
## NA's :8255 NA's :8713 NA's :9430
## carrier flight tailnum origin
## Length:336776 Min. : 1 Length:336776 Length:336776
## Class :character 1st Qu.: 553 Class :character Class :character
## Mode :character Median :1496 Mode :character Mode :character
## Mean :1972
## 3rd Qu.:3465
## Max. :8500
##
## dest air_time distance hour
## Length:336776 Min. : 20.0 Min. : 17 Min. : 1.00
## Class :character 1st Qu.: 82.0 1st Qu.: 502 1st Qu.: 9.00
## Mode :character Median :129.0 Median : 872 Median :13.00
## Mean :150.7 Mean :1040 Mean :13.18
## 3rd Qu.:192.0 3rd Qu.:1389 3rd Qu.:17.00
## Max. :695.0 Max. :4983 Max. :23.00
## NA's :9430
## minute time_hour
## Min. : 0.00 Min. :2013-01-01 05:00:00.00
## 1st Qu.: 8.00 1st Qu.:2013-04-04 13:00:00.00
## Median :29.00 Median :2013-07-03 10:00:00.00
## Mean :26.23 Mean :2013-07-03 05:22:54.64
## 3rd Qu.:44.00 3rd Qu.:2013-10-01 07:00:00.00
## Max. :59.00 Max. :2013-12-31 23:00:00.00
##
### Data Cleaning
# Build `flights_clean` from `flights`:
#   1. drop rows with NA in any time, delay, air-time or distance column;
#   2. keep rows with positive air_time and both delays within [-5, 1000];
#   3. remove exact duplicate rows;
#   4. add speed_mph = distance divided by air time in hours (air_time / 60).
# NOTE(review): the -5 lower bound also discards flights that departed or
# arrived more than 5 minutes early. Together with the NA drop, the summaries
# printed in this file show the row count going from 336776 to 150664 --
# confirm this is intended.
flights_clean <- flights %>%
  drop_na(dep_time, arr_time, air_time, distance, dep_delay, arr_delay) %>%
  filter(
    air_time > 0,
    dep_delay >= -5,
    dep_delay <= 1000,
    arr_delay >= -5,
    arr_delay <= 1000
  ) %>%
  distinct() %>%
  mutate(speed_mph = distance / (air_time / 60))
### Visualization of Missing Values per Column (After Cleaning)
# Same NA count per column as the "before" plot, computed on `flights_clean`.
flights_clean %>%
  summarise(
    across(.cols = everything(), .fns = function(col) sum(is.na(col)))
  ) %>%
  pivot_longer(
    cols = everything(),
    names_to = "variable",
    values_to = "missing"
  ) %>%
  ggplot(mapping = aes(x = reorder(variable, -missing), y = missing)) +
  geom_col(fill = "purple") +
  coord_flip() +
  ggtitle("Missing Values Per Column (After Cleaning)") +
  xlab("Variables") +
  ylab("Missing Count")

### Data After Cleaning
# Box plot of distance on the cleaned data.
ggplot(flights_clean) +
  geom_boxplot(aes(y = distance))

# Recompute the quartiles on the cleaned data and print the rows that fall
# outside the 1.5 * IQR fences.
# NOTE(review): the global name `IQR` masks stats::IQR as a variable; it is
# kept here because later code may read it -- consider renaming.
distance_quartiles <- quantile(
  flights_clean$distance,
  c(0.25, 0.75),
  na.rm = TRUE
)
Q1 <- distance_quartiles[1]
Q3 <- distance_quartiles[2]
IQR <- Q3 - Q1
filter(
  flights_clean,
  distance > (Q3 + 1.5 * IQR) | distance < (Q1 - 1.5 * IQR)
)
## # A tibble: 292 × 20
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 1344 1344 0 2005 1944
## 2 2013 1 2 909 900 9 1525 1530
## 3 2013 1 2 1344 1344 0 1940 1944
## 4 2013 1 3 1418 1341 37 2006 1935
## 5 2013 1 4 1343 1341 2 1932 1935
## 6 2013 1 6 1019 900 79 1558 1530
## 7 2013 1 7 1042 900 102 1620 1530
## 8 2013 1 8 1344 1341 3 1951 1935
## 9 2013 1 9 1340 1341 -1 2019 1935
## 10 2013 1 10 1342 1341 1 1935 1935
## # ℹ 282 more rows
## # ℹ 12 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## # hour <dbl>, minute <dbl>, time_hour <dttm>, speed_mph <dbl>
### Visualization of Departure Delay (After Cleaning)
# Histogram of dep_delay on the cleaned data, 50 bins, x scale limited to
# [-50, 300]; values outside the limits are dropped (warnings below).
ggplot(data = flights_clean, mapping = aes(x = dep_delay)) +
  geom_histogram(bins = 50, fill = "lightblue", color = "blue") +
  scale_x_continuous(limits = c(-50, 300)) +
  ggtitle("Departure Delay Distribution (After Cleaning)") +
  xlab("Departure Delay (minutes)") +
  ylab("Count")
## Warning: Removed 596 rows containing non-finite outside the scale range
## (`stat_bin()`).
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_bar()`).

### Visualization of Arrival Delay (After Cleaning)
# Histogram of arr_delay on the cleaned data, 50 bins, x scale limited to
# [-50, 300]; values outside the limits are dropped (warnings below).
ggplot(data = flights_clean, mapping = aes(x = arr_delay)) +
  geom_histogram(bins = 50, fill = "orange", color = "red") +
  scale_x_continuous(limits = c(-50, 300)) +
  ggtitle("Arrival Delay Distribution (After Cleaning)") +
  xlab("Arrival Delay (minutes)") +
  ylab("Count")
## Warning: Removed 606 rows containing non-finite outside the scale range
## (`stat_bin()`).
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_bar()`).

### Visualization of Departure Time vs Arrival Time (After Cleaning)
# Scatter plot of arr_time against dep_time on the cleaned data.
ggplot(flights_clean) +
  geom_point(
    aes(x = dep_time, y = arr_time),
    alpha = 0.1,
    color = "blue"
  ) +
  labs(
    title = "Departure Time vs Arrival Time (After Cleaning)",
    x = "Departure Time",
    y = "Arrival Time"
  )

### Visualization of Departure Delay vs Arrival Delay (After Cleaning)
# Scatter plot of arr_delay against dep_delay on the cleaned data.
ggplot(flights_clean) +
  geom_point(
    aes(x = dep_delay, y = arr_delay),
    alpha = 0.1,
    color = "darkorange"
  ) +
  labs(
    title = "Departure Delay vs Arrival Delay (After Cleaning)",
    x = "Departure Delay",
    y = "Arrival Delay"
  )

### Visualization of Distance vs Air Time (After Cleaning)
# Scatter plot of air_time against distance on the cleaned data.
ggplot(flights_clean) +
  geom_point(
    aes(x = distance, y = air_time),
    alpha = 0.1,
    color = "cyan"
  ) +
  labs(
    title = "Distance vs Air Time (After Cleaning)",
    x = "Distance (miles)",
    y = "Air Time (minutes)"
  )

### Visualization of Flight Count by Carrier (After Cleaning)
# One bar per carrier, ordered by number of flights in the cleaned data.
ggplot(
  count(flights_clean, carrier),
  aes(x = reorder(carrier, n), y = n)
) +
  geom_col(fill = "blue") +
  coord_flip() +
  ggtitle("Flight Count by Carrier (After Cleaning)") +
  xlab("Carrier") +
  ylab("Flights")

# Per-column summary of the cleaned data, for comparison with the raw one.
summary(flights_clean)
## year month day dep_time sched_dep_time
## Min. :2013 Min. : 1.000 Min. : 1.00 Min. : 1 Min. : 500
## 1st Qu.:2013 1st Qu.: 4.000 1st Qu.: 9.00 1st Qu.:1027 1st Qu.:1015
## Median :2013 Median : 6.000 Median :16.00 Median :1520 Median :1459
## Mean :2013 Mean : 6.519 Mean :15.73 Mean :1446 Mean :1415
## 3rd Qu.:2013 3rd Qu.:10.000 3rd Qu.:23.00 3rd Qu.:1836 3rd Qu.:1800
## Max. :2013 Max. :12.000 Max. :31.00 Max. :2400 Max. :2359
## dep_delay arr_time sched_arr_time arr_delay
## Min. : -5.00 Min. : 1 Min. : 1 Min. : -5.00
## 1st Qu.: -1.00 1st Qu.:1147 1st Qu.:1214 1st Qu.: 3.00
## Median : 12.00 Median :1650 Median :1657 Median : 15.00
## Mean : 31.39 Mean :1548 Mean :1593 Mean : 34.25
## 3rd Qu.: 42.00 3rd Qu.:2023 3rd Qu.:2015 3rd Qu.: 44.00
## Max. :960.00 Max. :2400 Max. :2359 Max. :931.00
## carrier flight tailnum origin
## Length:150664 Min. : 1 Length:150664 Length:150664
## Class :character 1st Qu.: 553 Class :character Class :character
## Mode :character Median :1485 Mode :character Mode :character
## Mean :2004
## 3rd Qu.:3572
## Max. :8500
## dest air_time distance hour
## Length:150664 Min. : 20.0 Min. : 80 Min. : 5.00
## Class :character 1st Qu.: 85.0 1st Qu.: 529 1st Qu.:10.00
## Mode :character Median :130.0 Median : 866 Median :14.00
## Mean :152.2 Mean :1034 Mean :13.88
## 3rd Qu.:192.0 3rd Qu.:1389 3rd Qu.:18.00
## Max. :695.0 Max. :4983 Max. :23.00
## minute time_hour speed_mph
## Min. : 0.00 Min. :2013-01-01 05:00:00.00 Min. : 76.8
## 1st Qu.:10.00 1st Qu.:2013-04-03 13:00:00.00 1st Qu.:351.1
## Median :29.00 Median :2013-06-30 07:00:00.00 Median :394.4
## Mean :26.89 Mean :2013-07-02 08:34:34.77 Mean :386.0
## 3rd Qu.:45.00 3rd Qu.:2013-10-03 19:00:00.00 3rd Qu.:429.1
## Max. :59.00 Max. :2013-12-31 23:00:00.00 Max. :650.3