###Visualization of Missing Values per Column (Before Cleaning)
# Count the NAs in every column, reshape to one row per column, and draw the
# counts as horizontal bars ordered by missing count.
flights %>%
  summarise(across(everything(), function(col) sum(is.na(col)))) %>%
  pivot_longer(
    cols = everything(),
    names_to = "variable",
    values_to = "missing"
  ) %>%
  ggplot(aes(reorder(variable, -missing), missing)) +
  geom_col(fill = "cyan") +
  coord_flip() +
  ggtitle("Missing Values Per Column (Before Cleaning)") +
  xlab("Variables") +
  ylab("Missing Count")

###Checking for Duplicated Rows
# Number of rows that exactly repeat an earlier row.
sum(duplicated(flights))
## [1] 0
# Show the repeated rows themselves (if any).
filter(flights, duplicated(flights))
## # A tibble: 0 × 19
## # ℹ 19 variables: year <int>, month <int>, day <int>, dep_time <int>,
## #   sched_dep_time <int>, dep_delay <dbl>, arr_time <int>,
## #   sched_arr_time <int>, arr_delay <dbl>, carrier <chr>, flight <int>,
## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## #   hour <dbl>, minute <dbl>, time_hour <dttm>
###Checking Outliers in Column "distance"
# Boxplot of flight distance on the raw data.
ggplot(flights) +
  geom_boxplot(aes(y = distance))

# Flag distance outliers with the 1.5 * IQR rule: a row is an outlier when its
# distance lies below Q1 - 1.5 * IQR or above Q3 + 1.5 * IQR.
# The spread is stored as `distance_iqr` rather than `IQR`, so the name `IQR`
# keeps referring to stats::IQR() when it is used as a value (for example when
# passed to across() or map()).
Q1 <- quantile(flights$distance, 0.25, na.rm = TRUE)
Q3 <- quantile(flights$distance, 0.75, na.rm = TRUE)
distance_iqr <- Q3 - Q1
distance_lower <- Q1 - 1.5 * distance_iqr
distance_upper <- Q3 + 1.5 * distance_iqr
outliers <- flights %>%
  filter(distance < distance_lower | distance > distance_upper)
###Visualization of Departure Delay (Before Cleaning)
# Histogram of departure delay on the raw data, restricted to [-50, 300].
# The limits are set on the x scale, so rows outside that range (and NAs) are
# discarded before binning; the warnings below report how many.
flights %>%
  ggplot(aes(dep_delay)) +
  geom_histogram(bins = 50, fill = "red", colour = "black") +
  scale_x_continuous(limits = c(-50, 300)) +
  ggtitle("Departure Delay Distribution (Before Cleaning)") +
  xlab("Departure Delay (minutes)") +
  ylab("Count")
## Warning: Removed 8865 rows containing non-finite outside the scale range
## (`stat_bin()`).
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_bar()`).

###Visualization of Arrival Delay (Before Cleaning)
# Histogram of arrival delay on the raw data, restricted to [-50, 300].
# The limits are set on the x scale, so rows outside that range (and NAs) are
# discarded before binning; the warnings below report how many.
flights %>%
  ggplot(aes(arr_delay)) +
  geom_histogram(bins = 50, fill = "yellow", colour = "black") +
  scale_x_continuous(limits = c(-50, 300)) +
  ggtitle("Arrival Delay Distribution (Before Cleaning)") +
  xlab("Arrival Delay (minutes)") +
  ylab("Count")
## Warning: Removed 11152 rows containing non-finite outside the scale range
## (`stat_bin()`).
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_bar()`).

###Visualization of Departure Time VS Arrival Time (Before Cleaning)
# Scatter of departure time against arrival time on the raw data; points are
# drawn mostly transparent (alpha = 0.1) so dense regions stand out.
flights %>%
  ggplot(aes(dep_time, arr_time)) +
  geom_point(colour = "orange", alpha = 0.1) +
  ggtitle("Departure Time vs Arrival Time (Before Cleaning)") +
  xlab("Departure Time") +
  ylab("Arrival Time")
## Warning: Removed 8713 rows containing missing values or values outside the scale range
## (`geom_point()`).

###Visualization of Departure Delay VS Arrival Delay (Before Cleaning)
# Scatter of departure delay against arrival delay on the raw data.
flights %>%
  ggplot(aes(dep_delay, arr_delay)) +
  geom_point(colour = "blue", alpha = 0.1) +
  ggtitle("Departure Delay vs Arrival Delay (Before Cleaning)") +
  xlab("Departure Delay") +
  ylab("Arrival Delay")
## Warning: Removed 9430 rows containing missing values or values outside the scale range
## (`geom_point()`).

###Visualization of Distance VS Air Time (Before Cleaning)
# Scatter of distance against air time on the raw data.
flights %>%
  ggplot(aes(distance, air_time)) +
  geom_point(colour = "brown", alpha = 0.1) +
  ggtitle("Distance vs Air Time (Before Cleaning)") +
  xlab("Distance (miles)") +
  ylab("Air Time (minutes)")
## Warning: Removed 9430 rows containing missing values or values outside the scale range
## (`geom_point()`).

###Visualization of Flight Count by Carrier (Before Cleaning)
# Tally flights per carrier and draw the tallies as horizontal bars ordered by
# count.
flights %>%
  count(carrier, name = "n") %>%
  ggplot(aes(reorder(carrier, n), n)) +
  geom_col(fill = "maroon") +
  coord_flip() +
  ggtitle("Flight Count by Carrier (Before Cleaning)") +
  xlab("Carrier") +
  ylab("Flights")

# Per-column summary statistics of the raw data.
flights %>%
  summary()
##       year          month             day           dep_time    sched_dep_time
##  Min.   :2013   Min.   : 1.000   Min.   : 1.00   Min.   :   1   Min.   : 106  
##  1st Qu.:2013   1st Qu.: 4.000   1st Qu.: 8.00   1st Qu.: 907   1st Qu.: 906  
##  Median :2013   Median : 7.000   Median :16.00   Median :1401   Median :1359  
##  Mean   :2013   Mean   : 6.549   Mean   :15.71   Mean   :1349   Mean   :1344  
##  3rd Qu.:2013   3rd Qu.:10.000   3rd Qu.:23.00   3rd Qu.:1744   3rd Qu.:1729  
##  Max.   :2013   Max.   :12.000   Max.   :31.00   Max.   :2400   Max.   :2359  
##                                                  NA's   :8255                 
##    dep_delay          arr_time    sched_arr_time   arr_delay       
##  Min.   : -43.00   Min.   :   1   Min.   :   1   Min.   : -86.000  
##  1st Qu.:  -5.00   1st Qu.:1104   1st Qu.:1124   1st Qu.: -17.000  
##  Median :  -2.00   Median :1535   Median :1556   Median :  -5.000  
##  Mean   :  12.64   Mean   :1502   Mean   :1536   Mean   :   6.895  
##  3rd Qu.:  11.00   3rd Qu.:1940   3rd Qu.:1945   3rd Qu.:  14.000  
##  Max.   :1301.00   Max.   :2400   Max.   :2359   Max.   :1272.000  
##  NA's   :8255      NA's   :8713                  NA's   :9430      
##    carrier              flight       tailnum             origin         
##  Length:336776      Min.   :   1   Length:336776      Length:336776     
##  Class :character   1st Qu.: 553   Class :character   Class :character  
##  Mode  :character   Median :1496   Mode  :character   Mode  :character  
##                     Mean   :1972                                        
##                     3rd Qu.:3465                                        
##                     Max.   :8500                                        
##                                                                         
##      dest              air_time        distance         hour      
##  Length:336776      Min.   : 20.0   Min.   :  17   Min.   : 1.00  
##  Class :character   1st Qu.: 82.0   1st Qu.: 502   1st Qu.: 9.00  
##  Mode  :character   Median :129.0   Median : 872   Median :13.00  
##                     Mean   :150.7   Mean   :1040   Mean   :13.18  
##                     3rd Qu.:192.0   3rd Qu.:1389   3rd Qu.:17.00  
##                     Max.   :695.0   Max.   :4983   Max.   :23.00  
##                     NA's   :9430                                  
##      minute        time_hour                     
##  Min.   : 0.00   Min.   :2013-01-01 05:00:00.00  
##  1st Qu.: 8.00   1st Qu.:2013-04-04 13:00:00.00  
##  Median :29.00   Median :2013-07-03 10:00:00.00  
##  Mean   :26.23   Mean   :2013-07-03 05:22:54.64  
##  3rd Qu.:44.00   3rd Qu.:2013-10-01 07:00:00.00  
##  Max.   :59.00   Max.   :2013-12-31 23:00:00.00  
## 
###Data Cleaning
# Build the cleaned table step by step:
#   1. drop rows with an NA in any time, delay, air-time or distance column;
#   2. keep rows with positive air time and both delays within [-5, 1000];
#   3. drop exact duplicate rows;
#   4. add speed_mph = distance / (air_time / 60), i.e. distance per hour given
#      air_time in minutes.
# NOTE(review): the lower bound of -5 on both delays removes every flight that
# left or arrived more than 5 minutes early. In the summaries in this file the
# raw median arr_delay is -5 and the row count falls from 336776 to 150664, so
# this filter discards over half of the data -- confirm that is intended.
flights_clean <- drop_na(
  flights,
  dep_time, arr_time, air_time, distance, dep_delay, arr_delay
)
flights_clean <- filter(
  flights_clean,
  air_time > 0,
  dep_delay >= -5, dep_delay <= 1000,
  arr_delay >= -5, arr_delay <= 1000
)
flights_clean <- distinct(flights_clean)
flights_clean <- mutate(flights_clean, speed_mph = distance / (air_time / 60))
###Visualization of Missing Values per Column (After Cleaning)
# Same NA-count chart as for the raw data, now computed on flights_clean.
flights_clean %>%
  summarise(across(everything(), function(col) sum(is.na(col)))) %>%
  pivot_longer(
    cols = everything(),
    names_to = "variable",
    values_to = "missing"
  ) %>%
  ggplot(aes(reorder(variable, -missing), missing)) +
  geom_col(fill = "purple") +
  coord_flip() +
  ggtitle("Missing Values Per Column (After Cleaning)") +
  xlab("Variables") +
  ylab("Missing Count")

###Checking Outliers in Column "distance" (After Cleaning)
# Boxplot of flight distance on the cleaned data.
ggplot(flights_clean) +
  geom_boxplot(aes(y = distance))

# Recompute the distance quartiles on the cleaned data and list the rows that
# fall outside the 1.5 * IQR fences.
# NOTE(review): the variable `IQR` shadows stats::IQR(); the name is kept here
# in case later code reads it.
Q1 <- quantile(flights_clean[["distance"]], probs = 0.25, na.rm = TRUE)
Q3 <- quantile(flights_clean[["distance"]], probs = 0.75, na.rm = TRUE)
IQR <- Q3 - Q1
flights_clean %>%
  filter(!(distance >= (Q1 - 1.5 * IQR) & distance <= (Q3 + 1.5 * IQR)))
## # A tibble: 292 × 20
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1     1344           1344         0     2005           1944
##  2  2013     1     2      909            900         9     1525           1530
##  3  2013     1     2     1344           1344         0     1940           1944
##  4  2013     1     3     1418           1341        37     2006           1935
##  5  2013     1     4     1343           1341         2     1932           1935
##  6  2013     1     6     1019            900        79     1558           1530
##  7  2013     1     7     1042            900       102     1620           1530
##  8  2013     1     8     1344           1341         3     1951           1935
##  9  2013     1     9     1340           1341        -1     2019           1935
## 10  2013     1    10     1342           1341         1     1935           1935
## # ℹ 282 more rows
## # ℹ 12 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## #   hour <dbl>, minute <dbl>, time_hour <dttm>, speed_mph <dbl>
###Visualization of Departure Delay (After Cleaning)
# Histogram of departure delay on the cleaned data, restricted to [-50, 300].
# The limits are set on the x scale, so rows outside that range are discarded
# before binning; the warnings below report how many.
flights_clean %>%
  ggplot(aes(dep_delay)) +
  geom_histogram(bins = 50, fill = "lightblue", colour = "blue") +
  scale_x_continuous(limits = c(-50, 300)) +
  ggtitle("Departure Delay Distribution (After Cleaning)") +
  xlab("Departure Delay (minutes)") +
  ylab("Count")
## Warning: Removed 596 rows containing non-finite outside the scale range
## (`stat_bin()`).
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_bar()`).

###Visualization of Arrival Delay (After Cleaning)
# Histogram of arrival delay on the cleaned data, restricted to [-50, 300].
# The limits are set on the x scale, so rows outside that range are discarded
# before binning; the warnings below report how many.
flights_clean %>%
  ggplot(aes(arr_delay)) +
  geom_histogram(bins = 50, fill = "orange", colour = "red") +
  scale_x_continuous(limits = c(-50, 300)) +
  ggtitle("Arrival Delay Distribution (After Cleaning)") +
  xlab("Arrival Delay (minutes)") +
  ylab("Count")
## Warning: Removed 606 rows containing non-finite outside the scale range
## (`stat_bin()`).
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_bar()`).

###Visualization of Departure Time VS Arrival Time (After Cleaning)
# Scatter of departure time against arrival time on the cleaned data.
flights_clean %>%
  ggplot(aes(dep_time, arr_time)) +
  geom_point(colour = "blue", alpha = 0.1) +
  ggtitle("Departure Time vs Arrival Time (After Cleaning)") +
  xlab("Departure Time") +
  ylab("Arrival Time")

###Visualization of Departure Delay VS Arrival Delay (After Cleaning)
# Scatter of departure delay against arrival delay on the cleaned data.
flights_clean %>%
  ggplot(aes(dep_delay, arr_delay)) +
  geom_point(colour = "darkorange", alpha = 0.1) +
  ggtitle("Departure Delay vs Arrival Delay (After Cleaning)") +
  xlab("Departure Delay") +
  ylab("Arrival Delay")

###Visualization of Distance VS Air Time (After Cleaning)
# Scatter of distance against air time on the cleaned data.
flights_clean %>%
  ggplot(aes(distance, air_time)) +
  geom_point(colour = "cyan", alpha = 0.1) +
  ggtitle("Distance vs Air Time (After Cleaning)") +
  xlab("Distance (miles)") +
  ylab("Air Time (minutes)")

###Visualization of Flight Count by Carrier (After Cleaning)
# Tally cleaned flights per carrier and draw the tallies as horizontal bars
# ordered by count.
flights_clean %>%
  count(carrier, name = "n") %>%
  ggplot(aes(reorder(carrier, n), n)) +
  geom_col(fill = "blue") +
  coord_flip() +
  ggtitle("Flight Count by Carrier (After Cleaning)") +
  xlab("Carrier") +
  ylab("Flights")

# Per-column summary statistics of the cleaned data.
summary(flights_clean)
##       year          month             day           dep_time    sched_dep_time
##  Min.   :2013   Min.   : 1.000   Min.   : 1.00   Min.   :   1   Min.   : 500  
##  1st Qu.:2013   1st Qu.: 4.000   1st Qu.: 9.00   1st Qu.:1027   1st Qu.:1015  
##  Median :2013   Median : 6.000   Median :16.00   Median :1520   Median :1459  
##  Mean   :2013   Mean   : 6.519   Mean   :15.73   Mean   :1446   Mean   :1415  
##  3rd Qu.:2013   3rd Qu.:10.000   3rd Qu.:23.00   3rd Qu.:1836   3rd Qu.:1800  
##  Max.   :2013   Max.   :12.000   Max.   :31.00   Max.   :2400   Max.   :2359  
##    dep_delay         arr_time    sched_arr_time   arr_delay     
##  Min.   : -5.00   Min.   :   1   Min.   :   1   Min.   : -5.00  
##  1st Qu.: -1.00   1st Qu.:1147   1st Qu.:1214   1st Qu.:  3.00  
##  Median : 12.00   Median :1650   Median :1657   Median : 15.00  
##  Mean   : 31.39   Mean   :1548   Mean   :1593   Mean   : 34.25  
##  3rd Qu.: 42.00   3rd Qu.:2023   3rd Qu.:2015   3rd Qu.: 44.00  
##  Max.   :960.00   Max.   :2400   Max.   :2359   Max.   :931.00  
##    carrier              flight       tailnum             origin         
##  Length:150664      Min.   :   1   Length:150664      Length:150664     
##  Class :character   1st Qu.: 553   Class :character   Class :character  
##  Mode  :character   Median :1485   Mode  :character   Mode  :character  
##                     Mean   :2004                                        
##                     3rd Qu.:3572                                        
##                     Max.   :8500                                        
##      dest              air_time        distance         hour      
##  Length:150664      Min.   : 20.0   Min.   :  80   Min.   : 5.00  
##  Class :character   1st Qu.: 85.0   1st Qu.: 529   1st Qu.:10.00  
##  Mode  :character   Median :130.0   Median : 866   Median :14.00  
##                     Mean   :152.2   Mean   :1034   Mean   :13.88  
##                     3rd Qu.:192.0   3rd Qu.:1389   3rd Qu.:18.00  
##                     Max.   :695.0   Max.   :4983   Max.   :23.00  
##      minute        time_hour                        speed_mph    
##  Min.   : 0.00   Min.   :2013-01-01 05:00:00.00   Min.   : 76.8  
##  1st Qu.:10.00   1st Qu.:2013-04-03 13:00:00.00   1st Qu.:351.1  
##  Median :29.00   Median :2013-06-30 07:00:00.00   Median :394.4  
##  Mean   :26.89   Mean   :2013-07-02 08:34:34.77   Mean   :386.0  
##  3rd Qu.:45.00   3rd Qu.:2013-10-03 19:00:00.00   3rd Qu.:429.1  
##  Max.   :59.00   Max.   :2013-12-31 23:00:00.00   Max.   :650.3