Part 1: Read the data..

# Read the external flight-delay extract into the data frame "airline.df".
# stringsAsFactors = TRUE pins the pre-R-4.0 default so that text columns
# (Origin, Dest, Reporting_Airline, ...) are factors on every R version,
# which is what the str() output recorded later in this report shows.
airline.df <- read.csv("Fdelay.csv", stringsAsFactors = TRUE)
# Working copy without the two leading columns "X" and "V1" (presumably
# row-index artifacts of an earlier export -- TODO confirm). They are dropped
# by name so that a re-exported file lacking them does not lose real columns.
air_line.df <- airline.df[, setdiff(names(airline.df), c("X", "V1")),
                          drop = FALSE]

Part 2: Column names

# List every column name of the raw data frame
names(airline.df)
##  [1] "X"                    "V1"                   "FlightDate"          
##  [4] "Reporting_Airline"    "Origin"               "Dest"                
##  [7] "CRSDepTime"           "DepTime"              "DepDelay"            
## [10] "DepDelayMinutes"      "DepDel15"             "DepartureDelayGroups"
## [13] "DepTimeBlk"           "CRSArrTime"           "ArrTime"             
## [16] "ArrDelay"             "ArrDelayMinutes"      "ArrDel15"            
## [19] "ArrivalDelayGroups"   "ArrTimeBlk"           "CRSElapsedTime"      
## [22] "ActualElapsedTime"    "AirTime"              "Distance"            
## [25] "WEEKEND"              "PMDEP"                "PMARR"               
## [28] "Southwest"            "American"             "Delta"               
## [31] "United"               "Alaska"               "Jetblue"             
## [34] "Skywest"              "Others"               "DepStatus"           
## [37] "ArrStatus"            "airline"              "timewindowdep"       
## [40] "timewindowarr"        "daywindow"

Part 3: Data Dimensions

# Report the number of rows followed by the number of columns
c(nrow(airline.df), ncol(airline.df))
## [1] 1774818      41

Part 3A: Check missing values in data

# Missing-value check: count the NA cells of each column, then total them.
# A total of 0 means no missing-value treatment is needed.
sum(colSums(is.na(airline.df)))
## [1] 0
# Sum of missing values is 0 so no missing values

Part 4: Descriptive Statistics of the dataframe

Below is a descriptive analysis for numerical or binary variables.

We identify that outlier treatment is needed in the data. For example, the minimum scheduled elapsed time (CRSElapsedTime) is -99 minutes, which is impossible, and the minimum air time (7 minutes) and minimum distance (31 miles) look implausibly small.

The data shows that, among the individually identified carriers, Southwest Airlines has the largest share of flights in this sample (19%), followed by American and Delta (13% each).

Data shows that majority flights arrive and depart b/w 12 PM-11:59 PM (70% AND 58% arrival and departure)

library(psych)
# Descriptive statistics for the numeric and 0/1 indicator columns.
# Columns are selected by name so the summary does not depend on column order.
summary_cols <- c(
  "DepDelayMinutes", "DepDel15", "DepartureDelayGroups",
  "ArrDelayMinutes", "ArrDel15", "ArrivalDelayGroups",
  "CRSElapsedTime", "ActualElapsedTime", "AirTime", "Distance",
  "WEEKEND", "PMDEP", "PMARR",
  "Southwest", "American", "Delta", "United",
  "Alaska", "Jetblue", "Skywest", "Others"
)
# fast = TRUE is requested explicitly: describe() only switches to its reduced
# output (n, mean, sd, min, max, ...) by itself for very large inputs, so the
# former positional slice [, 2:6] would return median/trimmed columns on a
# smaller sample. The statistics are therefore also picked by name.
describe(air_line.df[, summary_cols], fast = TRUE)[
  , c("n", "mean", "sd", "min", "max")
]
##                            n   mean     sd min  max
## DepDelayMinutes      1774818  11.69  41.45   0 2109
## DepDel15             1774818   0.17   0.38   0    1
## DepartureDelayGroups 1774818  -0.06   2.03  -2   12
## ArrDelayMinutes      1774818  12.14  41.51   0 2153
## ArrDel15             1774818   0.18   0.39   0    1
## ArrivalDelayGroups   1774818  -0.26   2.18  -2   12
## CRSElapsedTime       1774818 140.85  72.91 -99  703
## ActualElapsedTime    1774818 136.25  72.50  14  723
## AirTime              1774818 111.31  70.74   7  695
## Distance             1774818 797.11 593.98  31 4983
## WEEKEND              1774818   0.27   0.44   0    1
## PMDEP                1774818   0.58   0.49   0    1
## PMARR                1774818   0.70   0.46   0    1
## Southwest            1774818   0.19   0.39   0    1
## American             1774818   0.13   0.33   0    1
## Delta                1774818   0.13   0.34   0    1
## United               1774818   0.09   0.28   0    1
## Alaska               1774818   0.04   0.18   0    1
## Jetblue              1774818   0.04   0.20   0    1
## Skywest              1774818   0.11   0.31   0    1
## Others               1774818   0.28   0.45   0    1

Part 5: Percentage of the flights

This shows that 18% of flights get delayed

Histogram shows that majority of flights get delayed by 15-30 minutes

# Put the columns of air_line.df on the search path; later chunks of this
# report refer to columns by bare name and rely on this call.
attach(air_line.df)
# Percentage of flights by arrival-delay flag (0 = not delayed, 1 = delayed)
tab <- table(ArrDel15 = air_line.df$ArrDel15)
PercentProportion <- round(100 * prop.table(tab), 1)
PercentProportion
## ArrDel15
##    0    1 
## 81.9 18.1
# Histogram of arrival delays below the 99th percentile; the x axis is
# cropped to 15-180 minutes.
hist(
  with(
    air_line.df,
    ArrDelayMinutes[ArrDelayMinutes < quantile(ArrDelayMinutes, .99)]
  ),
  breaks = 15,
  xlim = c(15, 180),
  ylim = c(0, 200000),
  main = "Distribution of flight delay frequency by no. of minutes (>15 min. delay cases)",
  xlab = "Minutes of delay in arrival"
)

Part 6: Bar Chart for % of the delayed flights

# Bar chart of the delayed / not-delayed percentages; each value is printed
# just above the baseline of its bar.
bar <- barplot(
  PercentProportion,
  main = "% Of flights delayed / Not delayed",
  xlab = "Delay",
  ylab = "Percentage (%)",
  col = c("skyblue", "pink"),
  legend.text = rownames(PercentProportion)
)
text(x = bar, y = 0, labels = PercentProportion, pos = 3, cex = 1)

Part 7: Percentage of flights delayed / not delayed, split by airline

Flights with carrier codes F9 (28.2%) and EV (24.9%) have the highest share of delayed arrivals, closely followed by B6 (24.6%).

# Row percentages: share of non-delayed (0) and delayed (1) arrivals
# within each reporting airline
tab <- table(
  Reporting_Airline = air_line.df$Reporting_Airline,
  ArrDel15 = air_line.df$ArrDel15
)
PercentProportion <- round(100 * prop.table(tab, margin = 1), 1)
PercentProportion
##                  ArrDel15
## Reporting_Airline    0    1
##                9E 82.0 18.0
##                AA 80.9 19.1
##                AS 82.0 18.0
##                B6 75.4 24.6
##                DL 87.7 12.3
##                EV 75.1 24.9
##                F9 71.8 28.2
##                G4 80.9 19.1
##                HA 90.6  9.4
##                MQ 80.8 19.2
##                NK 85.6 14.4
##                OH 83.6 16.4
##                OO 81.3 18.7
##                UA 80.7 19.3
##                WN 82.1 17.9
##                YV 76.9 23.1
##                YX 82.1 17.9

Part 8: Bar Chart for % Of flights, which got delayed split by carrier

# Delayed-arrival percentage per carrier (the "1" column of the table above).
# No legend is requested: rownames() of this named vector is NULL, so the
# chart is drawn without one.
temp <- PercentProportion[, "1"]
bar <- barplot(
  temp,
  col = "pink",
  main = "Bar chart for % Of flights, which got delayed split by carrier",
  xlab = "Flight carrier code",
  ylab = "Percentage (%)"
)
text(x = bar, y = 0, labels = temp, pos = 3, cex = 1)

Part 9:

# Delayed (1) vs. non-delayed (0) flight counts for each reporting airline.
# Columns are referenced explicitly rather than through attach(airline.df):
# that second attach() only re-masked the 39 columns already on the search
# path and produced a long "objects are masked" message.
counts <- table(
  ArrDel15 = airline.df$ArrDel15,
  Reporting_Airline = airline.df$Reporting_Airline
)
# Grouped bar plot; the y axis shows flight counts, the legend the delay flag
barplot(counts, col = c("white", "black"),
        xlab = "Reporting_Airline", ylab = "Number of flights",
        main = "Delay Vs Airline", beside = TRUE,
        legend.text = rownames(counts),
        args.legend = list(title = "ArrDel15"))

# In absolute counts WN has the most delayed flights and HA the fewest; this
# reflects flight volume and is not the same as the delay rate per carrier.

Part 10:

# Delayed (1) vs. non-delayed (0) flight counts for weekday (0) and weekend (1)
counts <- table(ArrDel15 = airline.df$ArrDel15, WEEKEND = airline.df$WEEKEND)
# Grouped bar plot; the y axis shows flight counts (it was previously
# labelled with the grouping variable), the legend shows the delay flag
barplot(counts, col = c("white", "black"),
        xlab = "WEEKEND", ylab = "Number of flights",
        main = "Delay vs Weekend", beside = TRUE,
        legend.text = rownames(counts),
        args.legend = list(title = "ArrDel15"))

#Delay in Weekday is higher than Weekend

Part 11:

# Delayed (1) vs. non-delayed (0) flight counts by AM (0) / PM (1) departure.
# The table previously crossed PMDEP with WEEKEND, so the chart titled
# "Delay vs Departure time" contained no delay information at all.
counts <- table(ArrDel15 = airline.df$ArrDel15, PMDEP = airline.df$PMDEP)
# Grouped bar plot of flight counts, legend = delay flag
barplot(counts, col = c("white", "black"),
        xlab = "PMDEP", ylab = "Number of flights",
        main = "Delay vs Departure time", beside = TRUE,
        legend.text = rownames(counts),
        args.legend = list(title = "ArrDel15"))

# NOTE(review): the earlier remark "delay is higher when the flight departs in
# AM" was read off the PMDEP x WEEKEND table. Re-knit and re-read this chart;
# the CRS departure-time density plot in Part 19 indicates that the share of
# delayed flights grows through the day.

Part 12:

# Delayed (1) vs. non-delayed (0) flight counts by AM (0) / PM (1) arrival.
# The table previously crossed PMARR with WEEKEND, so the chart titled
# "Delay vs Arrival time" contained no delay information at all.
counts <- table(ArrDel15 = airline.df$ArrDel15, PMARR = airline.df$PMARR)
# Grouped bar plot of flight counts, legend = delay flag
barplot(counts, col = c("white", "black"),
        xlab = "PMARR", ylab = "Number of flights",
        main = "Delay vs Arrival time", beside = TRUE,
        legend.text = rownames(counts),
        args.legend = list(title = "ArrDel15"))

# NOTE(review): the earlier remark "delay is higher when the flight arrives in
# AM" was read off the PMARR x WEEKEND table. Re-knit and re-read this chart
# before drawing a conclusion about AM vs. PM arrivals.

Part 13:

# Air-time distribution for non-delayed (0) and delayed (1) arrivals
boxplot(
  AirTime ~ ArrDel15,
  data = airline.df,
  col = c("red", "white"),
  main = "AirTime vs Delay"
)

#The number of outliers is high in the above comparison. Only conclusion that can be inferred is when the airtime is very high i.e. above 600, delay is lesser as there are lesser points at that region

Part 14:

Creating the data table

library(data.table)
# data.table copy of the raw data; the arrival-delay flag is converted to a
# factor in place so that it can serve as a grouping / response category.
airline.dt <- as.data.table(airline.df)
airline.dt[, ArrDel15 := as.factor(ArrDel15)]

Part 15:

Barplot of delayed departure leading to delayed/non -delayed arrival

# Build the departure-delay x arrival-delay table here. At this point in the
# report `tab` still holds the carrier x arrival-delay table, so tab[2, ]
# would select one carrier's row rather than the "departed late" row.
dep_arr_tab <- table(DepartureDelay = airline.df$DepDel15,
                     ArrivalDelay = airline.df$ArrDel15)
# Of the flights that departed late (row 2), the percentage arriving on time
# (0) and late (1)
late_dep_pct <- prop.table(dep_arr_tab[2, ]) * 100
# ylim spans 0-100: the former c(0, 50) cut off the ~80% bar. The unused
# args.legend (no legend text was supplied) has been dropped.
bpa <- barplot(late_dep_pct, col = "lightblue", xlab = "Arrival delay",
               ylab = "Percentage of delayed departure",
               main = "Barplot of delayed departure leading to delayed/non -delayed arrival",
               ylim = c(0, 100))

text(bpa, 0, round(late_dep_pct, 1), cex = 1, pos = 3)

#Insight: 
# 79.7% of flight departing with delay resulted in delayed arrival
# 20.3% of flights departing with delay arrived on time

Part 16:

Barplot of delayed arrival split by delayed/non-delayed departure

# Build the departure-delay x arrival-delay table here. At this point in the
# report `tab` still holds the carrier x arrival-delay table, so tab[, 2]
# would give per-carrier delayed counts instead of the split by departure
# status.
dep_arr_tab <- table(DepartureDelay = airline.df$DepDel15,
                     ArrivalDelay = airline.df$ArrDel15)
# Of the flights that arrived late (column 2), the percentage that departed
# on time (0) and late (1)
late_arr_pct <- prop.table(dep_arr_tab[, 2]) * 100
# Capture this plot's own bar midpoints for the labels (the labels used to
# borrow the positions of the previous chart), and let ylim span 0-100 so the
# ~74% bar is not cut off.
bpa <- barplot(late_arr_pct, col = "lightblue", xlab = "Departure delay",
               ylab = "Percentage of delayed arrival",
               main = "Barplot of delayed arrival split by delayed/non-delayed departure",
               ylim = c(0, 100))

text(bpa, 0, round(late_arr_pct, 1), cex = 1, pos = 3)

#Insight: 74.4% of delayed arrival was preceeded by delayed departure
# 25.6% of delayed arrival was preceeded by non-delayed departed

Part 17:

Average flight distance,departure delay, arrival delay; split by delay/non-delay flight status

# Flight count and mean distance / departure delay / arrival delay for each
# arrival-delay status
dt1 <- airline.dt[, .(
  N = .N,
  AverageDistance = mean(Distance),
  AverageDepartureDelay = mean(DepDelayMinutes),
  AverageArrivalDelay = mean(ArrDelayMinutes)
), by = .(ArrivalDelay = ArrDel15)]
dt1
##    ArrivalDelay       N AverageDistance AverageDepartureDelay
## 1:            0 1452702        794.4130              1.887418
## 2:            1  322116        809.2458             55.921997
##    AverageArrivalDelay
## 1:            1.345451
## 2:           60.826413

Average difference between actual and CRS elapsed time split by arrival delay/non-delay flight status

# Flight count and mean absolute gap between scheduled (CRS) and actual
# elapsed time for each arrival-delay status
dt2 <- airline.dt[, .(
  N = .N,
  MeanDifferenceActualAndElapsedTime =
    mean(abs(CRSElapsedTime - ActualElapsedTime))
), by = .(ArrivalDelay = ArrDel15)]
dt2
##    ArrivalDelay       N MeanDifferenceActualAndElapsedTime
## 1:            0 1452702                           9.816577
## 2:            1  322116                          14.716056

Delayed arrival and delayed departure

# Departure-delay flag (rows) against arrival-delay flag (columns),
# printed with row and column totals
tab <- table(
  DepartureDelay = airline.df$DepDel15,
  ArrivalDelay = airline.df$ArrDel15
)
addmargins(tab)
##               ArrivalDelay
## DepartureDelay       0       1     Sum
##            0   1391657   82554 1474211
##            1     61045  239562  300607
##            Sum 1452702  322116 1774818

Part 18:

Percentage of delayed arrival & delayed departure

# Each cell of the departure x arrival table as a percentage of all flights
proptable <- round(100 * prop.table(tab), digits = 2)
proptable
##               ArrivalDelay
## DepartureDelay     0     1
##              0 78.41  4.65
##              1  3.44 13.50
#Insight: 
# 78.41% flights departed and arrived without delay.
# 4.65% flights departed without delay but arrived with delay
# 3.44% flights departed with delay but arrived on time
# 13.50% flights departed and arrived with delay

Percentage of delayed departure leading to delayed/non -delayed arrival

# Within delayed departures (row "1"): % arriving on time (0) vs. late (1)
round(100 * prop.table(tab["1", ]), digits = 2)
##     0     1 
## 20.31 79.69
#Insight: 
# 79.69% of flight departing with delay resulted in delayed arrival
# 20.31% of flights departing with delay arrived on time

Percentage of delayed arrival preceeded by delayed/non -delayed departure

# Within delayed arrivals (column "1"): % that departed on time (0) vs. late (1)
round(100 * prop.table(tab[, "1"]), digits = 2)
##     0     1 
## 25.63 74.37
#Insight: 74.37% of delayed arrival was preceeded by delayed departure
# 25.63% of delayed arrival was preceeded by non-delayed departed

Part 19:

Density plot of flights delay/non-delay status w.r.t CRS Departure time

# Conditional density of arrival-delay status over the scheduled departure
# hour. CRSDepTime is treated as an hhmm clock value (the axis is labelled
# "(hh)"), so the hour is the integer quotient by 100. round() used to push
# minutes 50-59 into the following hour (e.g. 1755 -> 18, 2359 -> 24).
cdplot(airline.dt$CRSDepTime %/% 100, airline.dt$ArrDel15,
       xlab = "CRS Departure time (hh)", ylab = "Arrival Delay",
       main = "Density plot of flights delay/non-delay status w.r.t CRS Departure time",
       col = c("lightblue", "red"))

#Insight: The share of delayed flights increases steadily for flights departing from the early morning hours until the early night hours. It is lowest for flights departing around 5 am.

Density plot of flights delay/non-delay status w.r.t CRS Arrival time

# Conditional density of arrival-delay status over the scheduled arrival
# hour. CRSArrTime is treated as an hhmm clock value (the axis is labelled
# "(hh)"), so the hour is the integer quotient by 100. round() used to push
# minutes 50-59 into the following hour (e.g. 1755 -> 18, 2359 -> 24).
cdplot(airline.dt$CRSArrTime %/% 100, airline.dt$ArrDel15,
       xlab = "CRS Arrival time (hh)", ylab = "Arrival Delay",
       main = "Density plot of flights delay/non-delay status w.r.t CRS Arrival time",
       col = c("lightblue", "red"))

#Insight : Density of delayed flight at any point of time increases steadily for flight arriving after morning hours till early morning of next day and decrease in trend occurs around 2am. Density of delayed flight is minimum for those flights arriving around 8 am.

Part 20:

# Column percentages: arrival-delay status within each
# carrier code x departure-delay combination
round(
  100 * prop.table(
    ftable(Reporting_Airline + DepDel15 ~ ArrDel15, data = airline.df),
    margin = 2
  ),
  digits = 2
)
##          Reporting_Airline    9E          AA          AS          B6          DL          EV          F9          G4          HA          MQ          NK          OH          OO          UA          WN          YV          YX      
##          DepDel15              0     1     0     1     0     1     0     1     0     1     0     1     0     1     0     1     0     1     0     1     0     1     0     1     0     1     0     1     0     1     0     1     0     1
## ArrDel15                                                                                                                                                                                                                              
## 0                          95.20 15.84 92.94 19.74 92.38 22.07 93.31 15.86 95.87 28.87 91.99  9.44 92.20 19.42 95.28 16.10 95.85 18.63 93.21 15.63 95.53 21.11 95.82 19.65 93.88 14.92 93.00 16.83 96.54 26.98 92.50 10.15 93.20 16.23
## 1                           4.80 84.16  7.06 80.26  7.62 77.93  6.69 84.14  4.13 71.13  8.01 90.56  7.80 80.58  4.72 83.90  4.15 81.37  6.79 84.37  4.47 78.89  4.18 80.35  6.12 85.08  7.00 83.17  3.46 73.02  7.50 89.85  6.80 83.77
#IF departure is on time then arrival is probably on time
#If departure is delayed then arrival is delayed
#For EV model Arrival is delayed highest even if departure is on time
# Column percentages: arrival-delay status within each
# airline x departure-delay combination
round(
  100 * prop.table(
    ftable(airline + DepDel15 ~ ArrDel15, data = airline.df),
    margin = 2
  ),
  digits = 2
)
##          airline  Alaska       American       Delta       Frontier       Hawaiian       Jetblue       Others       Skywest       Southwest       Spirit       United      
##          DepDel15      0     1        0     1     0     1        0     1        0     1       0     1      0     1       0     1         0     1      0     1      0     1
## ArrDel15                                                                                                                                                                  
## 0                  92.38 22.07    92.94 19.74 95.87 28.87    92.20 19.42    95.85 18.63   93.31 15.86  93.86 14.80   93.88 14.92     96.54 26.98  95.53 21.11  93.00 16.83
## 1                   7.62 77.93     7.06 80.26  4.13 71.13     7.80 80.58     4.15 81.37    6.69 84.14   6.14 85.20    6.12 85.08      3.46 73.02   4.47 78.89   7.00 83.17
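#Southwest performs best when departure is on time: 96.54% of its on-time departures also arrive on time
#For Delta and Southwest, delayed departures most often still arrive on time (28.87% and 26.98% respectively)
#For Skywest, a delayed departure has the highest probability of arriving late among the named airlines (85.08%); only the pooled "Others" group is higher (85.20%)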
# Column percentages: arrival-delay status within each
# day-window x departure-delay combination
round(
  100 * prop.table(
    ftable(daywindow + DepDel15 ~ ArrDel15, data = airline.df),
    margin = 2
  ),
  digits = 2
)
##          daywindow Weekday       Weekend      
##          DepDel15        0     1       0     1
## ArrDel15                                      
## 0                    94.12 19.54   95.17 22.61
## 1                     5.88 80.46    4.83 77.39
# Flights recover time lost to a departure delay slightly more often on weekends: 22.61% of delayed departures still arrive on time, against 19.54% on weekdays
# Row percentages: how each arrival-delay status is distributed over the
# day-window x departure-delay combinations
round(
  100 * prop.table(
    ftable(daywindow + DepDel15 ~ ArrDel15, data = airline.df),
    margin = 1
  ),
  digits = 2
)
##          daywindow Weekday       Weekend      
##          DepDel15        0     1       0     1
## ArrDel15                                      
## 0                    69.78  3.03   26.02  1.17
## 1                    19.67 56.36    5.96 18.02
#On weekdays, delays are highest
# Column percentages: arrival-delay status within each
# day-window x airline combination
round(
  100 * prop.table(
    ftable(daywindow + airline ~ ArrDel15, data = airline.df),
    margin = 2
  ),
  digits = 2
)
##          daywindow Weekday                                                                                 Weekend                                                                                
##          airline    Alaska American Delta Frontier Hawaiian Jetblue Others Skywest Southwest Spirit United  Alaska American Delta Frontier Hawaiian Jetblue Others Skywest Southwest Spirit United
## ArrDel15                                                                                                                                                                                          
## 0                    80.67    80.13 87.10    72.43    90.31   74.55  79.57   81.21     81.62  84.63  80.05   85.65    82.81 89.58    70.32    91.27   77.63  83.40   81.54     83.36  87.70  82.50
## 1                    19.33    19.87 12.90    27.57     9.69   25.45  20.43   18.79     18.38  15.37  19.95   14.35    17.19 10.42    29.68     8.73   22.37  16.60   18.46     16.64  12.30  17.50
#on an avg, Weekend has less arrival delays

Part 21:

Mean plot for flight distance (miles), split by delay/non-delay flight status

library(gplots)
## 
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
## 
##     lowess
# Mean flight distance (with confidence bars) for non-delayed vs. delayed
# arrivals. bty = "n" suppresses the plot box; the former `frame = FALSE` was
# forwarded to text(), axis() and the point/line calls, each of which warned
# that "frame" is not a graphical parameter.
plotmeans(Distance ~ ArrDel15, data = airline.df, mean.labels = TRUE,
          col = "RED", bty = "n",
          main = "Mean plot for flight distance(in miles) , split by delay/non-delay status",
          xlab = "Arrival Delay", ylab = "Distance (in miles)")

#Insight: Mean distance is almost similar for delayed and non-delayed flights

Mean plot for flight arrival delay (in minutes), split by delay/non-delay flight status

library(gplots)
# Mean arrival delay (with confidence bars) for non-delayed vs. delayed
# arrivals. bty = "n" suppresses the plot box; the former `frame = FALSE` was
# forwarded to text(), axis() and the point/line calls, each of which warned
# that "frame" is not a graphical parameter.
plotmeans(ArrDelayMinutes ~ ArrDel15, data = airline.df, mean.labels = TRUE,
          col = "RED", bty = "n",
          main = "Mean plot for flight arrival delay(in minutes) , split by delay/non-delay flight status",
          xlab = "Arrival Delay", ylab = "Arrival delay (in minutes)")
## Warning in arrows(x, li, x, pmax(y - gap, li), col = barcol, lwd = lwd, :
## zero-length arrow is of indeterminate angle and so skipped

## Warning in arrows(x, li, x, pmax(y - gap, li), col = barcol, lwd = lwd, :
## zero-length arrow is of indeterminate angle and so skipped
## Warning in arrows(x, ui, x, pmin(y + gap, ui), col = barcol, lwd = lwd, :
## zero-length arrow is of indeterminate angle and so skipped

## Warning in arrows(x, ui, x, pmin(y + gap, ui), col = barcol, lwd = lwd, :
## zero-length arrow is of indeterminate angle and so skipped

#Insight: Flight delayed are arriving late by a mean of around  60 minutes

Mean plot for flight departure delay (in minutes), split by delay/non-delay flight status

library(gplots)
# Mean departure delay (with confidence bars) for non-delayed vs. delayed
# arrivals. bty = "n" suppresses the plot box; the former `frame = FALSE` was
# forwarded to text(), axis() and the point/line calls, each of which warned
# that "frame" is not a graphical parameter.
plotmeans(DepDelayMinutes ~ ArrDel15, data = airline.df, mean.labels = TRUE,
          col = "RED", bty = "n",
          main = "Mean plot for flight Departure delay(in minutes) , split by delay/non-delay flight status",
          xlab = "Arrival Delay", ylab = "Departure delay (in minutes)")
## Warning in arrows(x, li, x, pmax(y - gap, li), col = barcol, lwd = lwd, :
## zero-length arrow is of indeterminate angle and so skipped

## Warning in arrows(x, li, x, pmax(y - gap, li), col = barcol, lwd = lwd, :
## zero-length arrow is of indeterminate angle and so skipped
## Warning in arrows(x, ui, x, pmin(y + gap, ui), col = barcol, lwd = lwd, :
## zero-length arrow is of indeterminate angle and so skipped

## Warning in arrows(x, ui, x, pmin(y + gap, ui), col = barcol, lwd = lwd, :
## zero-length arrow is of indeterminate angle and so skipped

#Insight: Flight arriving late are delayed in departure with a mean of around 55 minutes

Mean plot for difference between actual and CRS elapsed time split by arrival delay/non-delay flight status

library(gplots)
# Mean absolute gap between scheduled (CRS) and actual elapsed time, for
# non-delayed vs. delayed arrivals. bty = "n" suppresses the plot box; the
# former `frame = FALSE` was forwarded to text(), axis() and the point/line
# calls, each of which warned that "frame" is not a graphical parameter.
# The y-axis label also gains its missing closing parenthesis.
plotmeans(abs(CRSElapsedTime - ActualElapsedTime) ~ ArrDel15,
          data = airline.df, mean.labels = TRUE, col = "RED", bty = "n",
          main = "Mean plot for difference between actual and CRS elapsed time split by arrival delay/non-delay flight status",
          xlab = "Arrival Delay",
          ylab = "Difference between actual and CRS elapsed time(in minutes)")
## Warning in arrows(x, li, x, pmax(y - gap, li), col = barcol, lwd = lwd, :
## zero-length arrow is of indeterminate angle and so skipped

## Warning in arrows(x, li, x, pmax(y - gap, li), col = barcol, lwd = lwd, :
## zero-length arrow is of indeterminate angle and so skipped
## Warning in arrows(x, ui, x, pmin(y + gap, ui), col = barcol, lwd = lwd, :
## zero-length arrow is of indeterminate angle and so skipped

## Warning in arrows(x, ui, x, pmin(y + gap, ui), col = barcol, lwd = lwd, :
## zero-length arrow is of indeterminate angle and so skipped

#Insight: Mean difference between actual and CRS elapsed time is more for delayed flights by a quantum 5 minutes 

Part 22:

Analyzing Delay Distribution

# Quartiles of the arrival delay. The argument is spelled `probs`; the former
# `robs` was silently swallowed by `...`, so the quartiles only appeared
# because they happen to be quantile()'s default.
quantile(airline.df$ArrDelayMinutes, probs = seq(0, 1, 0.25))
##   0%  25%  50%  75% 100% 
##    0    0    0    7 2153
# Upper tail: 99th and 99.9th percentiles
quantile(airline.df$ArrDelayMinutes, probs = c(0.99, 0.999))
##   99% 99.9% 
##   169   503
# Box plot of all arrival delays, extremes included
boxplot(airline.df$ArrDelayMinutes)

#As we can see, the arrival-delay distribution has a long right tail with many outliers. The 75th percentile is 7 minutes, the 99th percentile is 169 minutes and the 99.9th percentile is 503 minutes. We therefore treat delays of up to 169 minutes as normal and remove any delay above 169 minutes as an outlier.

Part 23:

Outlier Treatment

# Outlier treatment: arrival delays above the 99th percentile (169 minutes)
# are set aside as extreme. A single cutoff drives both subsets so that they
# partition the data. The former "> 170" / "< 170" pair left every flight
# delayed exactly 170 minutes out of both sets
# (1774818 - 17431 - 1757175 = 212 rows) and did not match the stated
# "> 169" rule.
delay_cutoff <- 169
extreme_delay <- airline.df[which(airline.df$ArrDelayMinutes > delay_cutoff), ]
message("No of outliers: ", nrow(extreme_delay))
# NOTE(review): re-knit to refresh this count; the previously recorded 17431
# came from the "> 170" filter and excluded the 212 rows at exactly 170.
non_extreme_delay <- airline.df[which(airline.df$ArrDelayMinutes <= delay_cutoff), ]
message("Actual Working Set length: ", nrow(non_extreme_delay))
# NOTE(review): previously recorded as 1757175 under the "< 170" filter;
# expected to be unchanged if delays are whole minutes -- confirm on re-knit.
boxplot(non_extreme_delay$ArrDelayMinutes)

Part 24:

Plotting Departure Delay time vs Arrival Delay Time

# Scatter of arrival delay against departure delay (extreme delays removed)
with(
  non_extreme_delay,
  plot(
    DepDelayMinutes, ArrDelayMinutes,
    xlab = "Departure Delay Mins",
    ylab = "Arrival Delay Mins"
  )
)

#As expected, from graph it can be seen that if there is a delay in departure of flight, there will be delay in arrival of flight. And it is more or less a linear trend.  

Plot for distance vs delay time

# Scatter of arrival delay against flight distance (extreme delays removed)
with(
  non_extreme_delay,
  plot(
    Distance, ArrDelayMinutes,
    xlab = "Distance",
    ylab = "Delay Minutes"
  )
)

#No clear pattern between distance and delay. The low-distance region looks darker because most flights have a distance below 3000 miles.

# Make the departure-delay flag a factor in the working set.
# NOTE(review): the box plot below groups by ArrDel15, not DepDel15 -- confirm
# which of the two flags was meant to be converted here.
non_extreme_delay[["DepDel15"]] <- factor(non_extreme_delay[["DepDel15"]])
# Distance distribution for arrivals delayed less than / more than 15 minutes
boxplot(
  Distance ~ ArrDel15,
  data = non_extreme_delay,
  col = c("gray", "lightblue"),
  main = "Boxplot for Distance grouped by Delay greater than 15 mins/Delay less than 15"
)

#Distance distribution seems to be same for both delay > 15 mins and delay < 15 mins. So there is not much impact of distance on delays

Part 25:

Boxplot for delay vs origin airport

# Inspect the origin-airport column before plotting
str(non_extreme_delay[["Origin"]])
##  Factor w/ 350 levels "ABE","ABI","ABQ",..: 183 183 183 183 183 183 183 215 215 215 ...
# With 350 origin levels the box plot cannot label every airport, so no clear
# insight can be drawn from it.
# Draw axis labels perpendicular to the axes
par(las = 2)
boxplot(
  ArrDelayMinutes ~ Origin,
  data = non_extreme_delay,
  col = c("white", "red", "gray", "lightblue"),
  main = "Boxplot for Delay grouped by origin airport"
)

Boxplot for delay vs destination airport

# Draw axis labels perpendicular to the axes, then plot arrival delay
# per destination airport
par(las = 2)
boxplot(
  ArrDelayMinutes ~ Dest,
  data = non_extreme_delay,
  col = c("white", "red", "gray", "lightblue"),
  main = "Boxplot for Delay grouped by dest airport"
)

Boxplot for delay vs airlines

# Draw axis labels perpendicular to the axes, then plot arrival delay
# per reporting airline
par(las = 2)
boxplot(
  ArrDelayMinutes ~ Reporting_Airline,
  data = non_extreme_delay,
  col = c("white", "red", "gray", "lightblue"),
  main = "Boxplot for Delay grouped by airlines"
)

Boxplot for distance vs airlines

# Draw axis labels perpendicular to the axes, then plot flight distance
# per reporting airline
par(las = 2)
boxplot(
  Distance ~ Reporting_Airline,
  data = non_extreme_delay,
  col = c("white", "red", "gray", "lightblue"),
  main = "Boxplot for Distance grouped by airlines"
)

#It seems from the graph that Alaska Airlines (AS) and United Airlines (UA) have the more long distance flights as compared to others. 
#Hence they seems to be long haul carriers

Boxplot for flight distance (in miles) , split by delay/non-delay flight status

# Flight distance for non-delayed (0) vs. delayed (1) arrivals, drawn from
# the full data set (extreme delays included)
boxplot(
  Distance ~ ArrDel15,
  data = airline.df,
  col = c("lightblue", "red"),
  main = "Flight distance(in miles)  , split by delay/non-delay status",
  xlab = "Arrival Delay",
  ylab = "Distance(in miles)"
)

#Insight: No major difference in distances travelled by delayed & non-delayed flights

Part 26:

#Correlation Matrix for all the Continuous Variable

# Pairwise Pearson correlations of the continuous variables, selected by name
cor(airline.df[, c(
  "DepDelayMinutes", "ArrDelayMinutes",
  "ActualElapsedTime", "AirTime", "Distance"
)])
##                   DepDelayMinutes ArrDelayMinutes ActualElapsedTime
## DepDelayMinutes       1.000000000    0.9749606618        0.01216335
## ArrDelayMinutes       0.974960662    1.0000000000        0.03899857
## ActualElapsedTime     0.012163348    0.0389985688        1.00000000
## AirTime               0.004144378    0.0102041639        0.98788870
## Distance              0.004538660   -0.0005893142        0.96674226
##                       AirTime      Distance
## DepDelayMinutes   0.004144378  0.0045386600
## ArrDelayMinutes   0.010204164 -0.0005893142
## ActualElapsedTime 0.987888697  0.9667422574
## AirTime           1.000000000  0.9812425334
## Distance          0.981242533  1.0000000000

Part 27:

#Plotting Correlation Matrix

#install.packages("PerformanceAnalytics")

library(PerformanceAnalytics)
## Warning: package 'PerformanceAnalytics' was built under R version 3.6.1
## Loading required package: xts
## Warning: package 'xts' was built under R version 3.6.1
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## Registered S3 method overwritten by 'xts':
##   method     from
##   as.zoo.xts zoo
## 
## Attaching package: 'xts'
## The following objects are masked from 'package:data.table':
## 
##     first, last
## 
## Attaching package: 'PerformanceAnalytics'
## The following object is masked from 'package:gplots':
## 
##     textplot
## The following object is masked from 'package:graphics':
## 
##     legend
# Scatter-plot matrix with histograms on the diagonal for the same
# continuous variables, selected by name
chart.Correlation(
  airline.df[, c(
    "DepDelayMinutes", "ArrDelayMinutes",
    "ActualElapsedTime", "AirTime", "Distance"
  )],
  histogram = TRUE,
  pch = 19
)

#Arrival delay & Departure delay show a strong correlation as is expected among the two variables.

Part 28: Univariate assessment of raw variables without outlier treatment

Below analysis helps us in identifying the key predictive variables for predicting delays

# Train data 75% of the sample size
library(pROC)
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
smp_size <- floor(0.75 * nrow(air_line.df))

## set the seed to make your partition reproducible
set.seed(123)
train_ind <- sample(seq_len(nrow(air_line.df)), size = smp_size)

# Test and train data
data_train <- air_line.df[train_ind, ]
data_test <- air_line.df[-train_ind, ]

# Univariate assessment on the numeric / binary candidate predictors only,
# named explicitly instead of by column position. Other categorical variables
# such as airline would first need binary indicators or a log-odds
# transformation.
univ_predictors <- c(
  "CRSElapsedTime", "ActualElapsedTime", "AirTime", "Distance",
  "WEEKEND", "PMDEP", "PMARR",
  "Southwest", "American", "Delta", "United",
  "Alaska", "Jetblue", "Skywest", "Others"
)

# Fit ArrDel15 ~ <one predictor> on `data` and return the in-sample AUC, the
# Gini coefficient (2 * AUC - 1) and the p-value of the predictor's slope.
#  - family = binomial: ArrDel15 is a 0/1 outcome, so a logistic fit gives
#    valid standard errors (the earlier gaussian fit did not).
#  - the formula names the column, so predict() scores the model's own
#    training rows; the former `newdata` had a mismatched column name and was
#    effectively ignored.
#  - levels/direction are fixed to what pROC previously auto-detected
#    (control = 0, case = 1, controls < cases); quiet = TRUE silences the
#    per-model messages.
assess_predictor <- function(var, data) {
  fit <- glm(reformulate(var, response = "ArrDel15"),
             family = binomial, data = data)
  scores <- predict(fit, type = "response")
  auc_value <- as.numeric(auc(roc(data$ArrDel15, scores,
                                  levels = c(0, 1), direction = "<",
                                  quiet = TRUE)))
  c(AUC = auc_value,
    Gini = 2 * auc_value - 1,
    P = summary(fit)$coefficients[2, 4])
}

# One column of statistics per predictor. rev() keeps the row order of the
# summary identical to the earlier version, which built its result vectors
# by prepending.
univ_stats <- vapply(rev(univ_predictors), assess_predictor,
                     numeric(3), data = data_train)

name <- colnames(univ_stats)
area_curve <- unname(univ_stats["AUC", ])
univ_gini <- unname(univ_stats["Gini", ])
P_value <- unname(univ_stats["P", ])

# NOTE(review): P values recorded below this chunk came from the gaussian
# fit; AUC and Gini are rank-based and are expected to be unchanged. Re-knit
# to refresh the table.
Univ_summary <- data.frame(name, area_curve, univ_gini, P_value)
colnames(Univ_summary) <- c("Variable", "AUC", "Gini", "P value")
Univ_summary
##             Variable       AUC         Gini       P value
## 1             Others 0.5090174 0.0180347354  1.271778e-71
## 2            Skywest 0.5020255 0.0040509007  6.071772e-09
## 3            Jetblue 0.5093462 0.0186923365  0.000000e+00
## 4             Alaska 0.5002999 0.0005997697  1.475974e-01
## 5             United 0.5035779 0.0071557219  5.053364e-29
## 6              Delta 0.5259673 0.0519345980  0.000000e+00
## 7           American 0.5042746 0.0085491679  4.751915e-30
## 8          Southwest 0.5019742 0.0039484947  7.665062e-06
## 9              PMARR 0.5506132 0.1012263755  0.000000e+00
## 10             PMDEP 0.5629169 0.1258338424  0.000000e+00
## 11           WEEKEND 0.5159882 0.0319764157 2.608621e-227
## 12          Distance 0.5116839 0.0233678503  1.744667e-27
## 13           AirTime 0.5266220 0.0532439737 4.072977e-282
## 14 ActualElapsedTime 0.5663549 0.1327098172  0.000000e+00
## 15    CRSElapsedTime 0.5120729 0.0241458889  1.257214e-31
# This analysis gives an initial indication that "ActualElapsedTime", "PMDEP", "PMARR" and "Delta" carry the most univariate signal for flight delay (Gini 0.05-0.13). All four are statistically significant, but the AUC values (0.53-0.57) are only slightly above 0.5, so none is a strong predictor on its own, and ActualElapsedTime is only known after the flight. We will probably need to create new variables or transformations to develop higher predictive power in the model.