Read the Dataset

library(readxl)
flights = read_excel("C:/Users/togid/Downloads/1657873325_flightdelays.xlsx")

Understand the Data

The dataset includes flights from January to May 2004, with origins like BWI, DCA, IAD, JFK, LGA, and EWR, and destinations like JFK, DCA, IAD, and BWI. It captures scheduled and actual departure times, allowing us to calculate delays.

## tibble [2,201 × 13] (S3: tbl_df/tbl/data.frame)
##  $ schedtime   : num [1:2201] 1455 1640 1245 1715 1039 ...
##  $ carrier     : chr [1:2201] "OH" "DH" "DH" "DH" ...
##  $ deptime     : num [1:2201] 1455 1640 1245 1709 1035 ...
##  $ dest        : chr [1:2201] "JFK" "JFK" "LGA" "LGA" ...
##  $ distance    : num [1:2201] 184 213 229 229 229 228 228 228 228 228 ...
##  $ date        : chr [1:2201] "37987" "37987" "37987" "37987" ...
##  $ flightnumber: num [1:2201] 5935 6155 7208 7215 7792 ...
##  $ origin      : chr [1:2201] "BWI" "DCA" "IAD" "IAD" ...
##  $ weather     : num [1:2201] 0 0 0 0 0 0 0 0 0 0 ...
##  $ dayweek     : num [1:2201] 4 4 4 4 4 4 4 4 4 4 ...
##  $ daymonth    : num [1:2201] 1 1 1 1 1 1 1 1 1 1 ...
##  $ tailnu      : chr [1:2201] "N940CA" "N405FJ" "N695BR" "N662BR" ...
##  $ delay       : chr [1:2201] "ontime" "ontime" "ontime" "ontime" ...
##    schedtime      carrier             deptime         dest          
##  Min.   : 600   Length:2201        Min.   :  10   Length:2201       
##  1st Qu.:1000   Class :character   1st Qu.:1004   Class :character  
##  Median :1455   Mode  :character   Median :1450   Mode  :character  
##  Mean   :1372                      Mean   :1369                     
##  3rd Qu.:1710                      3rd Qu.:1709                     
##  Max.   :2130                      Max.   :2330                     
##     distance         date            flightnumber     origin         
##  Min.   :169.0   Length:2201        Min.   : 746   Length:2201       
##  1st Qu.:213.0   Class :character   1st Qu.:2156   Class :character  
##  Median :214.0   Mode  :character   Median :2385   Mode  :character  
##  Mean   :211.9                      Mean   :3815                     
##  3rd Qu.:214.0                      3rd Qu.:6155                     
##  Max.   :229.0                      Max.   :7924                     
##     weather           dayweek         daymonth        tailnu         
##  Min.   :0.00000   Min.   :1.000   Min.   : 1.00   Length:2201       
##  1st Qu.:0.00000   1st Qu.:2.000   1st Qu.: 8.00   Class :character  
##  Median :0.00000   Median :4.000   Median :16.00   Mode  :character  
##  Mean   :0.01454   Mean   :3.905   Mean   :16.02                     
##  3rd Qu.:0.00000   3rd Qu.:5.000   3rd Qu.:23.00                     
##  Max.   :1.00000   Max.   :7.000   Max.   :31.00                     
##     delay          
##  Length:2201       
##  Class :character  
##  Mode  :character  
##                    
##                    
## 

Find Null Values

# Check for missing values
colSums(is.na(flights))
##    schedtime      carrier      deptime         dest     distance         date 
##            0            0            0            0            0            0 
## flightnumber       origin      weather      dayweek     daymonth       tailnu 
##            0            0            0            0            0            0 
##        delay 
##            0

Install Required Packages

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(lubridate)
## Warning: package 'lubridate' was built under R version 4.4.3
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union

Summary of Descriptive Statistics

# Numeric columns
summary(flights[, c("schedtime", "deptime", "distance", "weather", "dayweek", "daymonth")])
##    schedtime       deptime        distance        weather       
##  Min.   : 600   Min.   :  10   Min.   :169.0   Min.   :0.00000  
##  1st Qu.:1000   1st Qu.:1004   1st Qu.:213.0   1st Qu.:0.00000  
##  Median :1455   Median :1450   Median :214.0   Median :0.00000  
##  Mean   :1372   Mean   :1369   Mean   :211.9   Mean   :0.01454  
##  3rd Qu.:1710   3rd Qu.:1709   3rd Qu.:214.0   3rd Qu.:0.00000  
##  Max.   :2130   Max.   :2330   Max.   :229.0   Max.   :1.00000  
##     dayweek         daymonth    
##  Min.   :1.000   Min.   : 1.00  
##  1st Qu.:2.000   1st Qu.: 8.00  
##  Median :4.000   Median :16.00  
##  Mean   :3.905   Mean   :16.02  
##  3rd Qu.:5.000   3rd Qu.:23.00  
##  Max.   :7.000   Max.   :31.00
# Categorical columns
table(flights$carrier)
## 
##  CO  DH  DL  MQ  OH  RU  UA  US 
##  94 551 388 295  30 408  31 404
table(flights$dest)
## 
##  EWR  JFK  LGA 
##  665  386 1150
table(flights$origin)
## 
##  BWI  DCA  IAD 
##  145 1370  686
table(flights$delay)
## 
## delayed  ontime 
##     428    1773

Histograms to Understand Relationships

# Histogram of scheduled time
ggplot(flights, aes(x = schedtime)) +
  geom_histogram(binwidth = 100) +
  labs(title = "Scheduled Departure Time Distribution")

# Bar plot of carrier
ggplot(flights, aes(x = carrier)) +
  geom_bar() +
  labs(title = "Flights by Carrier")

# Bar plot of destination
ggplot(flights, aes(x = dest)) +
  geom_bar() +
  labs(title = "Flights by Destination")

# Bar plot of origin
ggplot(flights, aes(x = origin)) +
  geom_bar() +
  labs(title = "Flights by Origin")

# Bar plot of weather
ggplot(flights, aes(x = weather)) +
  geom_bar() +
  labs(title = "Weather Conditions")

# Bar plot of day of week
ggplot(flights, aes(x = dayweek)) +
  geom_bar() +
  labs(title = "Flights by Day of Week")

Scatter Plot for Flights On-Time and Delayed

# Calculate delay in minutes
flights$delay_minutes <- with(flights, (deptime %/% 100 * 60 + deptime %% 100) - 
                              (schedtime %/% 100 * 60 + schedtime %% 100))
flights$delay_minutes <- ifelse(flights$delay_minutes < 0, 
                                flights$delay_minutes + 1440, 
                                flights$delay_minutes)

# Scatter plot
ggplot(flights, aes(x = schedtime, y = deptime, color = delay)) +
  geom_point() +
  geom_abline(slope = 1, intercept = 0, linetype = "dashed") +
  labs(title = "Scheduled vs Actual Departure Time", x = "Scheduled Time", y = "Actual Time")

Box Plot for Delay by Day of Month

# Box plot
ggplot(flights, aes(x = factor(daymonth), y = delay_minutes)) +
  geom_boxplot() +
  labs(title = "Delay Duration by Day of Month", x = "Day of Month", y = "Delay (minutes)")

Define Hours of Departure

# Extract hour from schedtime
flights$sched_hour <- flights$schedtime %/% 100

# Categorize into time periods
flights$time_period <- cut(flights$sched_hour, 
                           breaks = c(5, 11, 17, 21), 
                           labels = c("Morning", "Afternoon", "Evening"), 
                           include.lowest = TRUE)
# Contingency table of carrier vs delay
table_carrier_delay <- table(flights$carrier, flights$delay)
print(table_carrier_delay)
##     
##      delayed ontime
##   CO      26     68
##   DH     137    414
##   DL      47    341
##   MQ      80    215
##   OH       4     26
##   RU      94    314
##   UA       5     26
##   US      35    369
# Proportion table
prop.table(table_carrier_delay, 1)
##     
##         delayed     ontime
##   CO 0.27659574 0.72340426
##   DH 0.24863884 0.75136116
##   DL 0.12113402 0.87886598
##   MQ 0.27118644 0.72881356
##   OH 0.13333333 0.86666667
##   RU 0.23039216 0.76960784
##   UA 0.16129032 0.83870968
##   US 0.08663366 0.91336634

Redefine Delay Variables

# Categorize delay_minutes
flights$delay_category <- cut(flights$delay_minutes, 
                              breaks = c(-Inf, 15, 60, Inf), 
                              labels = c("On-time", "Minor Delay", "Major Delay"))

Summary of Major Variables

# Summary of key variables
summary(flights$delay_minutes)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     0.0    19.0  1434.0   896.4  1437.0  1439.0
table(flights$sched_hour)
## 
##   6   7   8   9  10  11  12  13  14  15  16  17  18  19  20  21 
## 126 135 162 108 100  68 142 157 236 154 178 241  85 119  53 137
table(flights$carrier)
## 
##  CO  DH  DL  MQ  OH  RU  UA  US 
##  94 551 388 295  30 408  31 404
table(flights$delay_category)
## 
##     On-time Minor Delay Major Delay 
##         518         207        1476

Histograms of Major Variables

# Histogram of delay minutes
ggplot(flights, aes(x = delay_minutes)) +
  geom_histogram(binwidth = 10) +
  labs(title = "Delay Duration Distribution")

# Histogram of scheduled hour
ggplot(flights, aes(x = sched_hour)) +
  geom_histogram(binwidth = 1) +
  labs(title = "Scheduled Hour Distribution")

# Bar plot of carrier
ggplot(flights, aes(x = carrier)) +
  geom_bar() +
  labs(title = "Flights by Carrier")

Pie Chart for Delayed Flights

# Calculate delay proportions
delay_counts <- table(flights$delay)
delay_df <- as.data.frame(delay_counts)

# Pie chart
ggplot(delay_df, aes(x = "", y = Freq, fill = Var1)) +
  geom_bar(stat = "identity", width = 1) +
  coord_polar("y") +
  labs(title = "Proportion of Flights Delayed") +
  theme_void()

Conclusion

Most flights (~80%) are on-time, with delays more frequent in the afternoon/evening. Carriers DH and RU have higher delay rates; DL and US are more punctual. Weather rarely impacts delays (mostly 0). Delays peak on specific days (e.g., Feb 1, Apr 1) with outliers >60 minutes.