Load the library function
library(tidyverse)
library(lubridate)
library(caTools)
Read the data from csv
delay = read.csv("Delay_v3.csv")
Understand and explore the data
names(delay)
## [1] "DEP_TIME" "Airline" "Destination" "Distance"
## [5] "Flight_DATE" "Flight_NUM" "ORIGIN" "Bad_Weather"
## [9] "DAY_WEEK" "Flight.Status"
dim(delay)
## [1] 2205 10
summary(delay)
## DEP_TIME Airline Destination Distance Flight_DATE
## Min. : 600 DH :553 EWR : 668 Min. :169.0 14/1/2004: 86
## 1st Qu.:1000 RU :410 JFK : 386 1st Qu.:213.0 22/1/2004: 86
## Median :1455 US :404 LGA :1150 Median :214.0 13/1/2004: 85
## Mean :1372 DL :388 NA's: 1 Mean :211.9 20/1/2004: 85
## 3rd Qu.:1710 MQ :295 3rd Qu.:214.0 21/1/2004: 85
## Max. :2130 CO : 94 Max. :229.0 6/1/2004 : 85
## (Other): 61 (Other) :1693
## Flight_NUM ORIGIN Bad_Weather DAY_WEEK
## Min. : 746 BWI: 145 Min. :0.00000 Min. :1.000
## 1st Qu.:2156 DCA:1372 1st Qu.:0.00000 1st Qu.:2.000
## Median :2385 IAD: 688 Median :0.00000 Median :4.000
## Mean :3817 Mean :0.01451 Mean :3.907
## 3rd Qu.:6155 3rd Qu.:0.00000 3rd Qu.:5.000
## Max. :7924 Max. :1.00000 Max. :7.000
##
## Flight.Status
## delayed: 428
## ontime :1777
##
##
##
##
##
sapply(delay, class)
## DEP_TIME Airline Destination Distance Flight_DATE
## "integer" "factor" "factor" "integer" "factor"
## Flight_NUM ORIGIN Bad_Weather DAY_WEEK Flight.Status
## "integer" "factor" "integer" "integer" "factor"
head(delay)
## DEP_TIME Airline Destination Distance Flight_DATE Flight_NUM ORIGIN
## 1 1455 OH JFK 184 1/1/2004 5935 BWI
## 2 1640 DH JFK 213 1/1/2004 6155 DCA
## 3 1245 DH LGA 229 1/1/2004 7208 IAD
## 4 1715 DH LGA 229 1/1/2004 7215 IAD
## 5 1039 DH LGA 229 1/1/2004 7792 IAD
## 6 840 DH JFK 228 1/1/2004 7800 IAD
## Bad_Weather DAY_WEEK Flight.Status
## 1 0 4 ontime
## 2 0 4 ontime
## 3 0 4 ontime
## 4 0 4 ontime
## 5 0 4 ontime
## 6 0 4 ontime
Check the missing value
apply(delay, 2, function(col)sum(is.na(col)))
## DEP_TIME Airline Destination Distance Flight_DATE
## 0 0 1 0 0
## Flight_NUM ORIGIN Bad_Weather DAY_WEEK Flight.Status
## 0 0 0 0 0
# There is one missing value under the "Destination" variable
Remove the missing value from the data
delay_no_missing <- delay[rowSums(is.na(delay)) == 0,]
apply(delay_no_missing, 2, function(col)sum(is.na(col)))
## DEP_TIME Airline Destination Distance Flight_DATE
## 0 0 0 0 0
## Flight_NUM ORIGIN Bad_Weather DAY_WEEK Flight.Status
## 0 0 0 0 0
Remove duplicate from the data
delay_no_duplicate <- unique(delay_no_missing)
dim(delay_no_duplicate)
## [1] 2201 10
Convert the data type
a. Convert to factor: Weather, Day_week
Convert to date: Flight_DATE
sapply(delay_no_duplicate, class)
## DEP_TIME Airline Destination Distance Flight_DATE
## "integer" "factor" "factor" "integer" "factor"
## Flight_NUM ORIGIN Bad_Weather DAY_WEEK Flight.Status
## "integer" "factor" "integer" "integer" "factor"
delay_no_duplicate$Bad_Weather = as.factor(delay_no_duplicate$Bad_Weather)
delay_no_duplicate$DAY_WEEK = as.factor(delay_no_duplicate$DAY_WEEK)
delay_no_duplicate$Flight_DATE = dmy(delay_no_duplicate$Flight_DATE)
sapply(delay_no_duplicate, class)
## DEP_TIME Airline Destination Distance Flight_DATE
## "integer" "factor" "factor" "integer" "Date"
## Flight_NUM ORIGIN Bad_Weather DAY_WEEK Flight.Status
## "integer" "factor" "factor" "factor" "factor"
summary(delay_no_duplicate)
## DEP_TIME Airline Destination Distance
## Min. : 600 DH :551 EWR: 665 Min. :169.0
## 1st Qu.:1000 RU :408 JFK: 386 1st Qu.:213.0
## Median :1455 US :404 LGA:1150 Median :214.0
## Mean :1372 DL :388 Mean :211.9
## 3rd Qu.:1710 MQ :295 3rd Qu.:214.0
## Max. :2130 CO : 94 Max. :229.0
## (Other): 61
## Flight_DATE Flight_NUM ORIGIN Bad_Weather DAY_WEEK
## Min. :2004-01-01 Min. : 746 BWI: 145 0:2169 1:308
## 1st Qu.:2004-01-08 1st Qu.:2156 DCA:1370 1: 32 2:307
## Median :2004-01-16 Median :2385 IAD: 686 3:320
## Mean :2004-01-16 Mean :3815 4:372
## 3rd Qu.:2004-01-23 3rd Qu.:6155 5:391
## Max. :2004-01-31 Max. :7924 6:250
## 7:253
## Flight.Status
## delayed: 428
## ontime :1773
##
##
##
##
##
b. Convert Scheduled “DEP_TIME” to Date data type
head(delay_no_duplicate)
## DEP_TIME Airline Destination Distance Flight_DATE Flight_NUM ORIGIN
## 1 1455 OH JFK 184 2004-01-01 5935 BWI
## 2 1640 DH JFK 213 2004-01-01 6155 DCA
## 3 1245 DH LGA 229 2004-01-01 7208 IAD
## 4 1715 DH LGA 229 2004-01-01 7215 IAD
## 5 1039 DH LGA 229 2004-01-01 7792 IAD
## 6 840 DH JFK 228 2004-01-01 7800 IAD
## Bad_Weather DAY_WEEK Flight.Status
## 1 0 4 ontime
## 2 0 4 ontime
## 3 0 4 ontime
## 4 0 4 ontime
## 5 0 4 ontime
## 6 0 4 ontime
sapply(delay_no_duplicate, class)
## DEP_TIME Airline Destination Distance Flight_DATE
## "integer" "factor" "factor" "integer" "Date"
## Flight_NUM ORIGIN Bad_Weather DAY_WEEK Flight.Status
## "integer" "factor" "factor" "factor" "factor"
Convert the time to HR and Min
Combine the Flight_DATE with DEP_TIME
delay_no_duplicate$date <- paste(delay_no_duplicate$Flight_DATE, delay_no_duplicate$DEP_TIME, sep = " ", collapse = NULL)
head(delay_no_duplicate$date)
## [1] "2004-01-01 14:55" "2004-01-01 16:40" "2004-01-01 12:45"
## [4] "2004-01-01 17:15" "2004-01-01 10:39" "2004-01-01 08:40"
class(delay_no_duplicate$date)
## [1] "character"
delay_no_duplicate$date <- as.Date(delay_no_duplicate$date, format = "%Y-%m-%d %H:%M", tz = "Asia/Kuala_Lumpur")
class(delay_no_duplicate$date)
## [1] "Date"
delay_no_duplicate$DEP_TIME = hm(delay_no_duplicate$DEP_TIME)
sapply(delay_no_duplicate, class)
## DEP_TIME Airline Destination Distance Flight_DATE
## "Period" "factor" "factor" "integer" "Date"
## Flight_NUM ORIGIN Bad_Weather DAY_WEEK Flight.Status
## "integer" "factor" "factor" "factor" "factor"
## date
## "Date"
Which day of the week sees the most delays?
delay_no_duplicate %>%
group_by(DAY_WEEK) %>%
filter(Flight.Status == "delayed") %>%
summarise(total_delay = n()) %>%
ggplot(aes(x= DAY_WEEK, y = total_delay)) +
geom_bar(stat = 'identity')

Which Destination sees the most delay?
delay_no_duplicate %>%
filter(Flight.Status == "delayed") %>%
group_by(Destination) %>%
summarise(total_delay = n()) %>%
ggplot(aes(x=Destination, y = total_delay)) +
geom_bar(stat = "identity")

Split the data to Train and test based on the 70% trainset of Flight.Status.
split <- sample.split(delay_no_duplicate$Flight.Status, SplitRatio = 0.7)
trainset <- subset(delay_no_duplicate, split == TRUE)
testset <- subset(delay_no_duplicate, split == FALSE)
dim(trainset)
## [1] 1541 11
dim(testset)
## [1] 660 11
Create model on Trainset, Remove the flight date as we use day week for prediction.
attach(trainset)
model <- glm(Flight.Status~ .-Flight_DATE, data = trainset, family = binomial)
summary(model)
##
## Call:
## glm(formula = Flight.Status ~ . - Flight_DATE, family = binomial,
## data = trainset)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.4468 0.3635 0.5099 0.6982 1.1529
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 3.055e+02 1.827e+02 1.672 0.094469 .
## DEP_TIME NA NA NA NA
## AirlineDH -6.050e+00 2.041e+00 -2.964 0.003038 **
## AirlineDL -5.600e-01 1.040e+00 -0.538 0.590397
## AirlineMQ -4.823e+00 1.800e+00 -2.680 0.007360 **
## AirlineOH -3.331e+00 1.637e+00 -2.034 0.041909 *
## AirlineRU -1.127e+00 6.065e-01 -1.858 0.063156 .
## AirlineUA 1.380e+00 8.340e-01 1.655 0.097872 .
## AirlineUS -6.026e-01 1.136e+00 -0.530 0.595920
## DestinationJFK 1.692e+01 1.317e+01 1.285 0.198930
## DestinationLGA 1.788e+01 1.408e+01 1.270 0.204084
## Distance -1.132e+00 8.842e-01 -1.281 0.200311
## Flight_NUM 1.008e-03 3.082e-04 3.271 0.001071 **
## ORIGINDCA 3.457e+01 2.649e+01 1.305 0.191921
## ORIGINIAD 5.014e+01 3.899e+01 1.286 0.198536
## Bad_Weather1 -1.763e+01 4.879e+02 -0.036 0.971182
## DAY_WEEK2 2.793e-01 2.575e-01 1.085 0.278014
## DAY_WEEK3 4.218e-01 2.561e-01 1.647 0.099502 .
## DAY_WEEK4 5.504e-01 2.461e-01 2.236 0.025353 *
## DAY_WEEK5 2.467e-01 2.323e-01 1.062 0.288319
## DAY_WEEK6 1.087e+00 3.074e-01 3.535 0.000408 ***
## DAY_WEEK7 -3.030e-01 2.420e-01 -1.252 0.210510
## date -9.250e-03 8.072e-03 -1.146 0.251794
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1519.2 on 1540 degrees of freedom
## Residual deviance: 1348.4 on 1519 degrees of freedom
## AIC: 1392.4
##
## Number of Fisher Scoring iterations: 15
Predict the outcome by using the model generated by trainset
Set the threshold to the proportion of ontime flight status
x <- predict(model, type = 'response')
thresholdx <- sum(trainset$Flight.Status == 'ontime')/length(trainset$Flight.Status)
predicx <- ifelse(x>thresholdx, 'ontime', 'delayed')
table(trainset$Flight.Status, predicx)
## predicx
## delayed ontime
## delayed 205 95
## ontime 460 781
accuracyx <- mean(predicx == trainset$Flight.Status)
print(paste('Trainset Accuracy =', accuracyx))
## [1] "Trainset Accuracy = 0.63984425697599"
Remove the variable which is not significant
model2 <- glm(Flight.Status~ .-Flight_DATE -Bad_Weather -DEP_TIME, data = trainset, family = binomial)
probTrainset <- predict(model2, type = 'response')
threshold <- sum(trainset$Flight.Status == "ontime")/length(trainset$Flight.Status)
predictTrainset <- ifelse(probTrainset>threshold, "ontime", "delayed")
table(trainset$Flight.Status, predictTrainset)
## predictTrainset
## delayed ontime
## delayed 206 94
## ontime 473 768
accuracytrain <- mean(predictTrainset == trainset$Flight.Status)
print(paste('Trainset Accuracy = ', accuracytrain))
## [1] "Trainset Accuracy = 0.63205710577547"
Confusion matrix on Testset
Verify the testset by using the model generated by the trainset
probTestset <- predict(model2, newdata = testset, type = 'response')
predictTestset <- ifelse(probTestset>threshold, "ontime", "delayed")
table(testset$Flight.Status, predictTestset)
## predictTestset
## delayed ontime
## delayed 94 34
## ontime 218 314
accuracytest <- mean(predictTestset == testset$Flight.Status)
print(paste('Testset Accuracy= ', accuracytest))
## [1] "Testset Accuracy= 0.618181818181818"