Load the library function

library(tidyverse)
library(lubridate)
library(caTools)

Read the data from csv

delay = read.csv("Delay_v3.csv")

Understand and explore the data

names(delay)
##  [1] "DEP_TIME"      "Airline"       "Destination"   "Distance"     
##  [5] "Flight_DATE"   "Flight_NUM"    "ORIGIN"        "Bad_Weather"  
##  [9] "DAY_WEEK"      "Flight.Status"
dim(delay)
## [1] 2205   10
summary(delay)
##     DEP_TIME       Airline    Destination    Distance        Flight_DATE  
##  Min.   : 600   DH     :553   EWR : 668   Min.   :169.0   14/1/2004:  86  
##  1st Qu.:1000   RU     :410   JFK : 386   1st Qu.:213.0   22/1/2004:  86  
##  Median :1455   US     :404   LGA :1150   Median :214.0   13/1/2004:  85  
##  Mean   :1372   DL     :388   NA's:   1   Mean   :211.9   20/1/2004:  85  
##  3rd Qu.:1710   MQ     :295               3rd Qu.:214.0   21/1/2004:  85  
##  Max.   :2130   CO     : 94               Max.   :229.0   6/1/2004 :  85  
##                 (Other): 61                               (Other)  :1693  
##    Flight_NUM   ORIGIN      Bad_Weather         DAY_WEEK    
##  Min.   : 746   BWI: 145   Min.   :0.00000   Min.   :1.000  
##  1st Qu.:2156   DCA:1372   1st Qu.:0.00000   1st Qu.:2.000  
##  Median :2385   IAD: 688   Median :0.00000   Median :4.000  
##  Mean   :3817              Mean   :0.01451   Mean   :3.907  
##  3rd Qu.:6155              3rd Qu.:0.00000   3rd Qu.:5.000  
##  Max.   :7924              Max.   :1.00000   Max.   :7.000  
##                                                             
##  Flight.Status 
##  delayed: 428  
##  ontime :1777  
##                
##                
##                
##                
## 
sapply(delay, class)
##      DEP_TIME       Airline   Destination      Distance   Flight_DATE 
##     "integer"      "factor"      "factor"     "integer"      "factor" 
##    Flight_NUM        ORIGIN   Bad_Weather      DAY_WEEK Flight.Status 
##     "integer"      "factor"     "integer"     "integer"      "factor"
head(delay)
##   DEP_TIME Airline Destination Distance Flight_DATE Flight_NUM ORIGIN
## 1     1455      OH         JFK      184    1/1/2004       5935    BWI
## 2     1640      DH         JFK      213    1/1/2004       6155    DCA
## 3     1245      DH         LGA      229    1/1/2004       7208    IAD
## 4     1715      DH         LGA      229    1/1/2004       7215    IAD
## 5     1039      DH         LGA      229    1/1/2004       7792    IAD
## 6      840      DH         JFK      228    1/1/2004       7800    IAD
##   Bad_Weather DAY_WEEK Flight.Status
## 1           0        4        ontime
## 2           0        4        ontime
## 3           0        4        ontime
## 4           0        4        ontime
## 5           0        4        ontime
## 6           0        4        ontime

Check the missing value

apply(delay, 2, function(col)sum(is.na(col)))
##      DEP_TIME       Airline   Destination      Distance   Flight_DATE 
##             0             0             1             0             0 
##    Flight_NUM        ORIGIN   Bad_Weather      DAY_WEEK Flight.Status 
##             0             0             0             0             0
  # There is one missing value under the "Destination" variable

Remove the missing value from the data

delay_no_missing <- delay[rowSums(is.na(delay)) == 0,]
apply(delay_no_missing, 2, function(col)sum(is.na(col)))
##      DEP_TIME       Airline   Destination      Distance   Flight_DATE 
##             0             0             0             0             0 
##    Flight_NUM        ORIGIN   Bad_Weather      DAY_WEEK Flight.Status 
##             0             0             0             0             0

Remove duplicate from the data

delay_no_duplicate <- unique(delay_no_missing)
dim(delay_no_duplicate)
## [1] 2201   10

Convert the data type

a. Convert to factor: Weather, Day_week

Convert to date: Flight_DATE

sapply(delay_no_duplicate, class)
##      DEP_TIME       Airline   Destination      Distance   Flight_DATE 
##     "integer"      "factor"      "factor"     "integer"      "factor" 
##    Flight_NUM        ORIGIN   Bad_Weather      DAY_WEEK Flight.Status 
##     "integer"      "factor"     "integer"     "integer"      "factor"
delay_no_duplicate$Bad_Weather = as.factor(delay_no_duplicate$Bad_Weather)
delay_no_duplicate$DAY_WEEK = as.factor(delay_no_duplicate$DAY_WEEK)
delay_no_duplicate$Flight_DATE = dmy(delay_no_duplicate$Flight_DATE)
sapply(delay_no_duplicate, class)
##      DEP_TIME       Airline   Destination      Distance   Flight_DATE 
##     "integer"      "factor"      "factor"     "integer"        "Date" 
##    Flight_NUM        ORIGIN   Bad_Weather      DAY_WEEK Flight.Status 
##     "integer"      "factor"      "factor"      "factor"      "factor"
summary(delay_no_duplicate)
##     DEP_TIME       Airline    Destination    Distance    
##  Min.   : 600   DH     :551   EWR: 665    Min.   :169.0  
##  1st Qu.:1000   RU     :408   JFK: 386    1st Qu.:213.0  
##  Median :1455   US     :404   LGA:1150    Median :214.0  
##  Mean   :1372   DL     :388               Mean   :211.9  
##  3rd Qu.:1710   MQ     :295               3rd Qu.:214.0  
##  Max.   :2130   CO     : 94               Max.   :229.0  
##                 (Other): 61                              
##   Flight_DATE           Flight_NUM   ORIGIN     Bad_Weather DAY_WEEK
##  Min.   :2004-01-01   Min.   : 746   BWI: 145   0:2169      1:308   
##  1st Qu.:2004-01-08   1st Qu.:2156   DCA:1370   1:  32      2:307   
##  Median :2004-01-16   Median :2385   IAD: 686               3:320   
##  Mean   :2004-01-16   Mean   :3815                          4:372   
##  3rd Qu.:2004-01-23   3rd Qu.:6155                          5:391   
##  Max.   :2004-01-31   Max.   :7924                          6:250   
##                                                             7:253   
##  Flight.Status 
##  delayed: 428  
##  ontime :1773  
##                
##                
##                
##                
## 

b. Convert Scheduled “DEP_TIME” to Date data type

head(delay_no_duplicate)
##   DEP_TIME Airline Destination Distance Flight_DATE Flight_NUM ORIGIN
## 1     1455      OH         JFK      184  2004-01-01       5935    BWI
## 2     1640      DH         JFK      213  2004-01-01       6155    DCA
## 3     1245      DH         LGA      229  2004-01-01       7208    IAD
## 4     1715      DH         LGA      229  2004-01-01       7215    IAD
## 5     1039      DH         LGA      229  2004-01-01       7792    IAD
## 6      840      DH         JFK      228  2004-01-01       7800    IAD
##   Bad_Weather DAY_WEEK Flight.Status
## 1           0        4        ontime
## 2           0        4        ontime
## 3           0        4        ontime
## 4           0        4        ontime
## 5           0        4        ontime
## 6           0        4        ontime
sapply(delay_no_duplicate, class)
##      DEP_TIME       Airline   Destination      Distance   Flight_DATE 
##     "integer"      "factor"      "factor"     "integer"        "Date" 
##    Flight_NUM        ORIGIN   Bad_Weather      DAY_WEEK Flight.Status 
##     "integer"      "factor"      "factor"      "factor"      "factor"

Convert the time to correct format. 840 to 0840 format then to 08:40 format

delay_no_duplicate$DEP_TIME <- sprintf("%04d",delay_no_duplicate$DEP_TIME)
head(delay_no_duplicate$DEP_TIME)
## [1] "1455" "1640" "1245" "1715" "1039" "0840"
class(delay_no_duplicate$DEP_TIME)
## [1] "character"
delay_no_duplicate$DEP_TIME = paste0(substr(delay_no_duplicate$DEP_TIME,1,2),":",substr(delay_no_duplicate$DEP_TIME,3,4))
head(delay_no_duplicate$DEP_TIME)
## [1] "14:55" "16:40" "12:45" "17:15" "10:39" "08:40"
class(delay_no_duplicate$DEP_TIME)
## [1] "character"

Convert the time to HR and Min

Combine the Flight_DATE with DEP_TIME

delay_no_duplicate$date <- paste(delay_no_duplicate$Flight_DATE, delay_no_duplicate$DEP_TIME, sep = " ", collapse = NULL)
head(delay_no_duplicate$date)
## [1] "2004-01-01 14:55" "2004-01-01 16:40" "2004-01-01 12:45"
## [4] "2004-01-01 17:15" "2004-01-01 10:39" "2004-01-01 08:40"
class(delay_no_duplicate$date)
## [1] "character"
delay_no_duplicate$date <- as.Date(delay_no_duplicate$date, format = "%Y-%m-%d %H:%M", tz = "Asia/Kuala_Lumpur")
class(delay_no_duplicate$date)
## [1] "Date"
delay_no_duplicate$DEP_TIME = hm(delay_no_duplicate$DEP_TIME)
sapply(delay_no_duplicate, class)
##      DEP_TIME       Airline   Destination      Distance   Flight_DATE 
##      "Period"      "factor"      "factor"     "integer"        "Date" 
##    Flight_NUM        ORIGIN   Bad_Weather      DAY_WEEK Flight.Status 
##     "integer"      "factor"      "factor"      "factor"      "factor" 
##          date 
##        "Date"

Which day of the week sees the most delays?

delay_no_duplicate %>%
  group_by(DAY_WEEK) %>%
  filter(Flight.Status == "delayed") %>%
  summarise(total_delay = n()) %>%
  ggplot(aes(x= DAY_WEEK, y = total_delay)) +
  geom_bar(stat = 'identity')

Which Destination sees the most delay?

delay_no_duplicate %>%
  filter(Flight.Status == "delayed") %>%
  group_by(Destination) %>%
  summarise(total_delay = n()) %>%
  ggplot(aes(x=Destination, y = total_delay)) +
  geom_bar(stat = "identity")

Predict flights delay by using logistic regression and evaluate model performance

head(delay_no_duplicate)
##     DEP_TIME Airline Destination Distance Flight_DATE Flight_NUM ORIGIN
## 1 14H 55M 0S      OH         JFK      184  2004-01-01       5935    BWI
## 2 16H 40M 0S      DH         JFK      213  2004-01-01       6155    DCA
## 3 12H 45M 0S      DH         LGA      229  2004-01-01       7208    IAD
## 4 17H 15M 0S      DH         LGA      229  2004-01-01       7215    IAD
## 5 10H 39M 0S      DH         LGA      229  2004-01-01       7792    IAD
## 6  8H 40M 0S      DH         JFK      228  2004-01-01       7800    IAD
##   Bad_Weather DAY_WEEK Flight.Status       date
## 1           0        4        ontime 2004-01-01
## 2           0        4        ontime 2004-01-01
## 3           0        4        ontime 2004-01-01
## 4           0        4        ontime 2004-01-01
## 5           0        4        ontime 2004-01-01
## 6           0        4        ontime 2004-01-01
summary(delay_no_duplicate)
##     DEP_TIME                            Airline    Destination
##  Min.   :6H 0M 0S                    DH     :551   EWR: 665   
##  1st Qu.:10H 0M 0S                   RU     :408   JFK: 386   
##  Median :14H 55M 0S                  US     :404   LGA:1150   
##  Mean   :13H 51M 17.6101771921822S   DL     :388              
##  3rd Qu.:17H 10M 0S                  MQ     :295              
##  Max.   :21H 30M 0S                  CO     : 94              
##                                      (Other): 61              
##     Distance      Flight_DATE           Flight_NUM   ORIGIN    
##  Min.   :169.0   Min.   :2004-01-01   Min.   : 746   BWI: 145  
##  1st Qu.:213.0   1st Qu.:2004-01-08   1st Qu.:2156   DCA:1370  
##  Median :214.0   Median :2004-01-16   Median :2385   IAD: 686  
##  Mean   :211.9   Mean   :2004-01-16   Mean   :3815             
##  3rd Qu.:214.0   3rd Qu.:2004-01-23   3rd Qu.:6155             
##  Max.   :229.0   Max.   :2004-01-31   Max.   :7924             
##                                                                
##  Bad_Weather DAY_WEEK Flight.Status       date           
##  0:2169      1:308    delayed: 428   Min.   :2004-01-01  
##  1:  32      2:307    ontime :1773   1st Qu.:2004-01-08  
##              3:320                   Median :2004-01-16  
##              4:372                   Mean   :2004-01-16  
##              5:391                   3rd Qu.:2004-01-23  
##              6:250                   Max.   :2004-01-31  
##              7:253

Split the data to Train and test based on the 70% trainset of Flight.Status.

split <- sample.split(delay_no_duplicate$Flight.Status, SplitRatio = 0.7)
trainset <- subset(delay_no_duplicate, split == TRUE)
testset <- subset(delay_no_duplicate, split == FALSE)
dim(trainset)
## [1] 1541   11
dim(testset)
## [1] 660  11

Create model on Trainset, Remove the flight date as we use day week for prediction.

attach(trainset)
model <- glm(Flight.Status~ .-Flight_DATE,  data = trainset, family = binomial)
summary(model)
## 
## Call:
## glm(formula = Flight.Status ~ . - Flight_DATE, family = binomial, 
##     data = trainset)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.4468   0.3635   0.5099   0.6982   1.1529  
## 
## Coefficients: (1 not defined because of singularities)
##                  Estimate Std. Error z value Pr(>|z|)    
## (Intercept)     3.055e+02  1.827e+02   1.672 0.094469 .  
## DEP_TIME               NA         NA      NA       NA    
## AirlineDH      -6.050e+00  2.041e+00  -2.964 0.003038 ** 
## AirlineDL      -5.600e-01  1.040e+00  -0.538 0.590397    
## AirlineMQ      -4.823e+00  1.800e+00  -2.680 0.007360 ** 
## AirlineOH      -3.331e+00  1.637e+00  -2.034 0.041909 *  
## AirlineRU      -1.127e+00  6.065e-01  -1.858 0.063156 .  
## AirlineUA       1.380e+00  8.340e-01   1.655 0.097872 .  
## AirlineUS      -6.026e-01  1.136e+00  -0.530 0.595920    
## DestinationJFK  1.692e+01  1.317e+01   1.285 0.198930    
## DestinationLGA  1.788e+01  1.408e+01   1.270 0.204084    
## Distance       -1.132e+00  8.842e-01  -1.281 0.200311    
## Flight_NUM      1.008e-03  3.082e-04   3.271 0.001071 ** 
## ORIGINDCA       3.457e+01  2.649e+01   1.305 0.191921    
## ORIGINIAD       5.014e+01  3.899e+01   1.286 0.198536    
## Bad_Weather1   -1.763e+01  4.879e+02  -0.036 0.971182    
## DAY_WEEK2       2.793e-01  2.575e-01   1.085 0.278014    
## DAY_WEEK3       4.218e-01  2.561e-01   1.647 0.099502 .  
## DAY_WEEK4       5.504e-01  2.461e-01   2.236 0.025353 *  
## DAY_WEEK5       2.467e-01  2.323e-01   1.062 0.288319    
## DAY_WEEK6       1.087e+00  3.074e-01   3.535 0.000408 ***
## DAY_WEEK7      -3.030e-01  2.420e-01  -1.252 0.210510    
## date           -9.250e-03  8.072e-03  -1.146 0.251794    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1519.2  on 1540  degrees of freedom
## Residual deviance: 1348.4  on 1519  degrees of freedom
## AIC: 1392.4
## 
## Number of Fisher Scoring iterations: 15

Predict the outcome by using the model generated by trainset

Set the threshold to the proportion of ontime flight status

x <- predict(model, type = 'response')
thresholdx <- sum(trainset$Flight.Status == 'ontime')/length(trainset$Flight.Status)
predicx <- ifelse(x>thresholdx, 'ontime', 'delayed')
table(trainset$Flight.Status, predicx)
##          predicx
##           delayed ontime
##   delayed     205     95
##   ontime      460    781
accuracyx <- mean(predicx == trainset$Flight.Status)
print(paste('Trainset Accuracy =', accuracyx))
## [1] "Trainset Accuracy = 0.63984425697599"

Remove the variable which is not significant

model2 <- glm(Flight.Status~ .-Flight_DATE -Bad_Weather -DEP_TIME, data = trainset, family = binomial)
probTrainset <- predict(model2, type = 'response')
threshold <- sum(trainset$Flight.Status == "ontime")/length(trainset$Flight.Status)
predictTrainset <- ifelse(probTrainset>threshold, "ontime", "delayed")
table(trainset$Flight.Status, predictTrainset)
##          predictTrainset
##           delayed ontime
##   delayed     206     94
##   ontime      473    768
accuracytrain <- mean(predictTrainset == trainset$Flight.Status)
print(paste('Trainset Accuracy = ', accuracytrain))
## [1] "Trainset Accuracy =  0.63205710577547"

Confusion matrix on Testset

Verify the testset by using the model generated by the trainset

probTestset <- predict(model2, newdata = testset, type = 'response')
predictTestset <- ifelse(probTestset>threshold, "ontime", "delayed")
table(testset$Flight.Status, predictTestset)
##          predictTestset
##           delayed ontime
##   delayed      94     34
##   ontime      218    314
accuracytest <- mean(predictTestset == testset$Flight.Status)
print(paste('Testset Accuracy= ', accuracytest))
## [1] "Testset Accuracy=  0.618181818181818"