library(readxl)
flights = read_excel("C:/Users/togid/Downloads/1657873325_flightdelays.xlsx")
The dataset includes flights from January to May 2004, with origins like BWI, DCA, IAD, JFK, LGA, and EWR, and destinations like JFK, DCA, IAD, and BWI. It captures scheduled and actual departure times, allowing us to calculate delays.
## tibble [2,201 × 13] (S3: tbl_df/tbl/data.frame)
## $ schedtime : num [1:2201] 1455 1640 1245 1715 1039 ...
## $ carrier : chr [1:2201] "OH" "DH" "DH" "DH" ...
## $ deptime : num [1:2201] 1455 1640 1245 1709 1035 ...
## $ dest : chr [1:2201] "JFK" "JFK" "LGA" "LGA" ...
## $ distance : num [1:2201] 184 213 229 229 229 228 228 228 228 228 ...
## $ date : chr [1:2201] "37987" "37987" "37987" "37987" ...
## $ flightnumber: num [1:2201] 5935 6155 7208 7215 7792 ...
## $ origin : chr [1:2201] "BWI" "DCA" "IAD" "IAD" ...
## $ weather : num [1:2201] 0 0 0 0 0 0 0 0 0 0 ...
## $ dayweek : num [1:2201] 4 4 4 4 4 4 4 4 4 4 ...
## $ daymonth : num [1:2201] 1 1 1 1 1 1 1 1 1 1 ...
## $ tailnu : chr [1:2201] "N940CA" "N405FJ" "N695BR" "N662BR" ...
## $ delay : chr [1:2201] "ontime" "ontime" "ontime" "ontime" ...
## schedtime carrier deptime dest
## Min. : 600 Length:2201 Min. : 10 Length:2201
## 1st Qu.:1000 Class :character 1st Qu.:1004 Class :character
## Median :1455 Mode :character Median :1450 Mode :character
## Mean :1372 Mean :1369
## 3rd Qu.:1710 3rd Qu.:1709
## Max. :2130 Max. :2330
## distance date flightnumber origin
## Min. :169.0 Length:2201 Min. : 746 Length:2201
## 1st Qu.:213.0 Class :character 1st Qu.:2156 Class :character
## Median :214.0 Mode :character Median :2385 Mode :character
## Mean :211.9 Mean :3815
## 3rd Qu.:214.0 3rd Qu.:6155
## Max. :229.0 Max. :7924
## weather dayweek daymonth tailnu
## Min. :0.00000 Min. :1.000 Min. : 1.00 Length:2201
## 1st Qu.:0.00000 1st Qu.:2.000 1st Qu.: 8.00 Class :character
## Median :0.00000 Median :4.000 Median :16.00 Mode :character
## Mean :0.01454 Mean :3.905 Mean :16.02
## 3rd Qu.:0.00000 3rd Qu.:5.000 3rd Qu.:23.00
## Max. :1.00000 Max. :7.000 Max. :31.00
## delay
## Length:2201
## Class :character
## Mode :character
##
##
##
# Check for missing values
colSums(is.na(flights))
## schedtime carrier deptime dest distance date
## 0 0 0 0 0 0
## flightnumber origin weather dayweek daymonth tailnu
## 0 0 0 0 0 0
## delay
## 0
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(lubridate)
## Warning: package 'lubridate' was built under R version 4.4.3
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
# Numeric columns
summary(flights[, c("schedtime", "deptime", "distance", "weather", "dayweek", "daymonth")])
## schedtime deptime distance weather
## Min. : 600 Min. : 10 Min. :169.0 Min. :0.00000
## 1st Qu.:1000 1st Qu.:1004 1st Qu.:213.0 1st Qu.:0.00000
## Median :1455 Median :1450 Median :214.0 Median :0.00000
## Mean :1372 Mean :1369 Mean :211.9 Mean :0.01454
## 3rd Qu.:1710 3rd Qu.:1709 3rd Qu.:214.0 3rd Qu.:0.00000
## Max. :2130 Max. :2330 Max. :229.0 Max. :1.00000
## dayweek daymonth
## Min. :1.000 Min. : 1.00
## 1st Qu.:2.000 1st Qu.: 8.00
## Median :4.000 Median :16.00
## Mean :3.905 Mean :16.02
## 3rd Qu.:5.000 3rd Qu.:23.00
## Max. :7.000 Max. :31.00
# Categorical columns
table(flights$carrier)
##
## CO DH DL MQ OH RU UA US
## 94 551 388 295 30 408 31 404
table(flights$dest)
##
## EWR JFK LGA
## 665 386 1150
table(flights$origin)
##
## BWI DCA IAD
## 145 1370 686
table(flights$delay)
##
## delayed ontime
## 428 1773
# Histogram of scheduled time
ggplot(flights, aes(x = schedtime)) +
geom_histogram(binwidth = 100) +
labs(title = "Scheduled Departure Time Distribution")
# Bar plot of carrier
ggplot(flights, aes(x = carrier)) +
geom_bar() +
labs(title = "Flights by Carrier")
# Bar plot of destination
ggplot(flights, aes(x = dest)) +
geom_bar() +
labs(title = "Flights by Destination")
# Bar plot of origin
ggplot(flights, aes(x = origin)) +
geom_bar() +
labs(title = "Flights by Origin")
# Bar plot of weather
ggplot(flights, aes(x = weather)) +
geom_bar() +
labs(title = "Weather Conditions")
# Bar plot of day of week
ggplot(flights, aes(x = dayweek)) +
geom_bar() +
labs(title = "Flights by Day of Week")
# Calculate delay in minutes
flights$delay_minutes <- with(flights, (deptime %/% 100 * 60 + deptime %% 100) -
(schedtime %/% 100 * 60 + schedtime %% 100))
flights$delay_minutes <- ifelse(flights$delay_minutes < 0,
flights$delay_minutes + 1440,
flights$delay_minutes)
# Scatter plot
ggplot(flights, aes(x = schedtime, y = deptime, color = delay)) +
geom_point() +
geom_abline(slope = 1, intercept = 0, linetype = "dashed") +
labs(title = "Scheduled vs Actual Departure Time", x = "Scheduled Time", y = "Actual Time")
# Box plot
ggplot(flights, aes(x = factor(daymonth), y = delay_minutes)) +
geom_boxplot() +
labs(title = "Delay Duration by Day of Month", x = "Day of Month", y = "Delay (minutes)")
# Extract hour from schedtime
flights$sched_hour <- flights$schedtime %/% 100
# Categorize into time periods
flights$time_period <- cut(flights$sched_hour,
breaks = c(5, 11, 17, 21),
labels = c("Morning", "Afternoon", "Evening"),
include.lowest = TRUE)
# Contingency table of carrier vs delay
table_carrier_delay <- table(flights$carrier, flights$delay)
print(table_carrier_delay)
##
## delayed ontime
## CO 26 68
## DH 137 414
## DL 47 341
## MQ 80 215
## OH 4 26
## RU 94 314
## UA 5 26
## US 35 369
# Proportion table
prop.table(table_carrier_delay, 1)
##
## delayed ontime
## CO 0.27659574 0.72340426
## DH 0.24863884 0.75136116
## DL 0.12113402 0.87886598
## MQ 0.27118644 0.72881356
## OH 0.13333333 0.86666667
## RU 0.23039216 0.76960784
## UA 0.16129032 0.83870968
## US 0.08663366 0.91336634
# Categorize delay_minutes
flights$delay_category <- cut(flights$delay_minutes,
breaks = c(-Inf, 15, 60, Inf),
labels = c("On-time", "Minor Delay", "Major Delay"))
# Summary of key variables
summary(flights$delay_minutes)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0 19.0 1434.0 896.4 1437.0 1439.0
table(flights$sched_hour)
##
## 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21
## 126 135 162 108 100 68 142 157 236 154 178 241 85 119 53 137
table(flights$carrier)
##
## CO DH DL MQ OH RU UA US
## 94 551 388 295 30 408 31 404
table(flights$delay_category)
##
## On-time Minor Delay Major Delay
## 518 207 1476
# Histogram of delay minutes
ggplot(flights, aes(x = delay_minutes)) +
geom_histogram(binwidth = 10) +
labs(title = "Delay Duration Distribution")
# Histogram of scheduled hour
ggplot(flights, aes(x = sched_hour)) +
geom_histogram(binwidth = 1) +
labs(title = "Scheduled Hour Distribution")
# Bar plot of carrier
ggplot(flights, aes(x = carrier)) +
geom_bar() +
labs(title = "Flights by Carrier")
# Calculate delay proportions
delay_counts <- table(flights$delay)
delay_df <- as.data.frame(delay_counts)
# Pie chart
ggplot(delay_df, aes(x = "", y = Freq, fill = Var1)) +
geom_bar(stat = "identity", width = 1) +
coord_polar("y") +
labs(title = "Proportion of Flights Delayed") +
theme_void()
Most flights (~80%) are on-time, with delays more frequent in the afternoon/evening. Carriers DH and RU have higher delay rates; DL and US are more punctual. Weather rarely impacts delays (mostly 0). Delays peak on specific days (e.g., Feb 1, Apr 1) with outliers >60 minutes.