# Load the flight-delay extract; the full table is kept as "airline.df".
airline.df <- read.csv(file = "Fdelay.csv")
# Working copy without the first two columns ("X" and "V1" in the listing
# printed below).
air_line.df <- airline.df[, -(1:2)]
# Show the column names of the full table
names(airline.df)
## [1] "X" "V1" "FlightDate"
## [4] "Reporting_Airline" "Origin" "Dest"
## [7] "CRSDepTime" "DepTime" "DepDelay"
## [10] "DepDelayMinutes" "DepDel15" "DepartureDelayGroups"
## [13] "DepTimeBlk" "CRSArrTime" "ArrTime"
## [16] "ArrDelay" "ArrDelayMinutes" "ArrDel15"
## [19] "ArrivalDelayGroups" "ArrTimeBlk" "CRSElapsedTime"
## [22] "ActualElapsedTime" "AirTime" "Distance"
## [25] "WEEKEND" "PMDEP" "PMARR"
## [28] "Southwest" "American" "Delta"
## [31] "United" "Alaska" "Jetblue"
## [34] "Skywest" "Others" "DepStatus"
## [37] "ArrStatus" "airline" "timewindowdep"
## [40] "timewindowarr" "daywindow"
# Data dimensions: number of rows, number of columns
c(nrow(airline.df), ncol(airline.df))
## [1] 1774818 41
# Missing-value check: count the NA cells column by column and add them up.
sum(colSums(is.na(airline.df)))
## [1] 0
# Sum of missing values is 0 so no missing values
library(psych)
# Summary statistics (n, mean, sd, min, max) for the delay, duration, distance
# and indicator columns of the working copy.
# The statistics are picked by name rather than by position: describe()
# returns a different column layout in its full and "fast" modes, so a
# positional 2:6 slice only yields min/max while fast mode is active.
describe(air_line.df[, c(8:10, 15:17, 19:33)])[, c("n", "mean", "sd", "min", "max")]
## n mean sd min max
## DepDelayMinutes 1774818 11.69 41.45 0 2109
## DepDel15 1774818 0.17 0.38 0 1
## DepartureDelayGroups 1774818 -0.06 2.03 -2 12
## ArrDelayMinutes 1774818 12.14 41.51 0 2153
## ArrDel15 1774818 0.18 0.39 0 1
## ArrivalDelayGroups 1774818 -0.26 2.18 -2 12
## CRSElapsedTime 1774818 140.85 72.91 -99 703
## ActualElapsedTime 1774818 136.25 72.50 14 723
## AirTime 1774818 111.31 70.74 7 695
## Distance 1774818 797.11 593.98 31 4983
## WEEKEND 1774818 0.27 0.44 0 1
## PMDEP 1774818 0.58 0.49 0 1
## PMARR 1774818 0.70 0.46 0 1
## Southwest 1774818 0.19 0.39 0 1
## American 1774818 0.13 0.33 0 1
## Delta 1774818 0.13 0.34 0 1
## United 1774818 0.09 0.28 0 1
## Alaska 1774818 0.04 0.18 0 1
## Jetblue 1774818 0.04 0.20 0 1
## Skywest 1774818 0.11 0.31 0 1
## Others 1774818 0.28 0.45 0 1
# Put the columns of the working copy on the search path; the bare column
# names used below (e.g. ArrDel15) are resolved through this attach().
attach(air_line.df)
# Share of flights (in %) for each value of the arrival-delay flag ArrDel15
tab <- table(ArrDel15)
PercentProportion <- round((tab / sum(tab)) * 100, digits = 1)
PercentProportion
## ArrDel15
## 0 1
## 81.9 18.1
# Histogram of arrival delay (minutes) for values below the 99th percentile;
# the x-axis is cropped to the 15-180 minute range.
hist(
  ArrDelayMinutes[which(ArrDelayMinutes < quantile(ArrDelayMinutes, probs = 0.99))],
  breaks = 15,
  xlim = c(15, 180),
  ylim = c(0, 200000),
  main = "Distribution of flight delay frequency by no. of minutes (>15 min. delay cases)",
  xlab = "Minutes of delay in arrival"
)
# Bar chart of the percentage of flights per ArrDel15 value, with the
# percentages written at the base of each bar.
bar <- barplot(
  PercentProportion,
  col = c("skyblue", "pink"),
  main = "% Of flights delayed / Not delayed",
  xlab = "Delay",
  ylab = "Percentage (%)",
  legend.text = rownames(PercentProportion)
)
text(x = bar, y = 0, labels = PercentProportion, pos = 3, cex = 1)
# Row percentages: for each reporting carrier, the share of its flights with
# ArrDel15 = 0 and with ArrDel15 = 1.
tab <- table(Reporting_Airline, ArrDel15)
PercentProportion <- round(prop.table(tab, margin = 1) * 100, digits = 1)
PercentProportion
## ArrDel15
## Reporting_Airline 0 1
## 9E 82.0 18.0
## AA 80.9 19.1
## AS 82.0 18.0
## B6 75.4 24.6
## DL 87.7 12.3
## EV 75.1 24.9
## F9 71.8 28.2
## G4 80.9 19.1
## HA 90.6 9.4
## MQ 80.8 19.2
## NK 85.6 14.4
## OH 83.6 16.4
## OO 81.3 18.7
## UA 80.7 19.3
## WN 82.1 17.9
## YV 76.9 23.1
## YX 82.1 17.9
# Percentage of delayed flights (second column, ArrDel15 = 1) per carrier.
temp <- PercentProportion[, 2]
# No legend argument is passed: `temp` is a plain named vector, so
# rownames(temp) is NULL and never produced a legend. The bars are already
# labelled with the carrier codes on the x-axis.
bar <- barplot(
  temp,
  col = "pink",
  xlab = "Flight carrier code",
  ylab = "Percentage (%)",
  main = "Bar chart for % Of flights, which got delayed split by carrier"
)
# Write each percentage at the base of its bar
text(bar, 0, temp, cex = 1, pos = 3)
# Attach the full table as well. It is attached after air_line.df, so its
# columns mask the identically named ones from air_line.df (see the message
# printed below). The bare column names used further down rely on this call.
attach(airline.df)
## The following objects are masked from air_line.df:
##
## ActualElapsedTime, airline, AirTime, Alaska, American,
## ArrDel15, ArrDelay, ArrDelayMinutes, ArrivalDelayGroups,
## ArrStatus, ArrTime, ArrTimeBlk, CRSArrTime, CRSDepTime,
## CRSElapsedTime, daywindow, Delta, DepartureDelayGroups,
## DepDel15, DepDelay, DepDelayMinutes, DepStatus, DepTime,
## DepTimeBlk, Dest, Distance, FlightDate, Jetblue, Origin,
## Others, PMARR, PMDEP, Reporting_Airline, Skywest, Southwest,
## timewindowarr, timewindowdep, United, WEEKEND
# Number of flights per reporting carrier, split by the arrival-delay flag
counts <- with(airline.df, table(ArrDel15, Reporting_Airline))
# Grouped bar plot; the y-axis shows flight counts, not the flag itself
barplot(counts, col = c("white", "black"),
        xlab = "Reporting_Airline", ylab = "Number of flights",
        main = "Delay Vs Airline", beside = TRUE,
        legend.text = rownames(counts))
# WN has the largest number of delayed flights and HA the smallest. These are
# raw counts and mostly reflect carrier size; by delay rate (see the
# per-carrier percentage table above) F9 is highest at 28.2% and HA lowest
# at 9.4%.
# Number of flights by the WEEKEND indicator, split by the arrival-delay flag
counts <- with(airline.df, table(ArrDel15, WEEKEND))
# Grouped bar plot; the y-axis shows flight counts, not the flag itself
barplot(counts, col = c("white", "black"),
        xlab = "WEEKEND", ylab = "Number of flights",
        main = "Delay vs Weekend", beside = TRUE,
        legend.text = rownames(counts))
# In absolute numbers more flights are delayed on weekdays than on weekends.
# Weekdays also carry most of the traffic (the mean of WEEKEND is 0.27), so
# compare rates before drawing conclusions.
# Number of flights by the PMDEP indicator, split by the arrival-delay flag.
# The table must include ArrDel15; tabulating PMDEP against WEEKEND would
# show no delay information at all.
counts <- with(airline.df, table(ArrDel15, PMDEP))
# Grouped bar plot; the y-axis shows flight counts
barplot(counts, col = c("white", "black"),
        xlab = "PMDEP", ylab = "Number of flights",
        main = "Delay vs Departure time", beside = TRUE,
        legend.text = rownames(counts))
# NOTE(review): the earlier conclusion here ("delay is higher when the flight
# departs in AM than in PM") was read off a PMDEP-by-WEEKEND table that did
# not contain ArrDel15. Re-read the AM/PM comparison from this corrected plot.
# Number of flights by the PMARR indicator, split by the arrival-delay flag.
# The table must include ArrDel15; tabulating PMARR against WEEKEND would
# show no delay information at all.
counts <- with(airline.df, table(ArrDel15, PMARR))
# Grouped bar plot; the y-axis shows flight counts
barplot(counts, col = c("white", "black"),
        xlab = "PMARR", ylab = "Number of flights",
        main = "Delay vs Arrival time", beside = TRUE,
        legend.text = rownames(counts))
# NOTE(review): the earlier conclusion here ("delay is higher when the flight
# arrives in AM than in PM") was read off a PMARR-by-WEEKEND table that did
# not contain ArrDel15. Re-read the AM/PM comparison from this corrected plot.
# Box plot of air time for each value of the arrival-delay flag
boxplot(
  AirTime ~ ArrDel15,
  col = c("red", "white"),
  main = "AirTime vs Delay"
)
# Many points are drawn as outliers in this comparison. The only conclusion
# that can be drawn is that for very long air times (above 600) there are
# fewer delayed points.
library(data.table)
# data.table copy of the full table, with the arrival-delay flag stored as a
# factor (updated by reference with `:=`).
airline.dt <- as.data.table(airline.df)
airline.dt[, ArrDel15 := as.factor(ArrDel15)]
# Cross-tabulate departure delay against arrival delay locally. The `tab`
# object in scope at this point is the carrier-by-ArrDel15 table, so indexing
# it here would plot a single carrier's row instead of the intended split.
dep_arr_tab <- with(
  airline.df,
  table(DepartureDelay = DepDel15, ArrivalDelay = ArrDel15)
)

# Among flights that departed late (row "1"): % arriving on time vs. late.
# ylim spans 0-100 so that bars above 50% are not cut off.
pct_arr_given_dep <- prop.table(dep_arr_tab[2, ]) * 100
bpa <- barplot(pct_arr_given_dep, col = "lightblue", xlab = "Arrival delay", ylab = "Percentage of delayed departure", main = "Barplot of delayed departure leading to delayed/non -delayed arrival", ylim = c(0, 100))
text(bpa, 0, round(pct_arr_given_dep, 1), cex = 1, pos = 3)
#Insight:
# 79.7% of flights departing with delay resulted in delayed arrival
# 20.3% of flights departing with delay arrived on time

# Among flights that arrived late (column "1"): % that departed on time vs.
# late. The bar positions are taken from this plot's own return value.
pct_dep_given_arr <- prop.table(dep_arr_tab[, 2]) * 100
bpa <- barplot(pct_dep_given_arr, col = "lightblue", xlab = "Departure delay", ylab = "Percentage of delayed arrival", main = "Barplot of delayed arrival split by delayed/non-delayed departure", ylim = c(0, 100))
text(bpa, 0, round(pct_dep_given_arr, 1), cex = 1, pos = 3)
#Insight: 74.4% of delayed arrivals were preceded by a delayed departure
# 25.6% of delayed arrivals were preceded by a non-delayed departure
# Per arrival-delay group: number of flights and the mean distance, mean
# departure delay and mean arrival delay.
dt1 <- airline.dt[
  ,
  .(
    N = .N,
    AverageDistance = mean(Distance),
    AverageDepartureDelay = mean(DepDelayMinutes),
    AverageArrivalDelay = mean(ArrDelayMinutes)
  ),
  by = .(ArrivalDelay = ArrDel15)
]
dt1
## ArrivalDelay N AverageDistance AverageDepartureDelay
## 1: 0 1452702 794.4130 1.887418
## 2: 1 322116 809.2458 55.921997
## AverageArrivalDelay
## 1: 1.345451
## 2: 60.826413
# Per arrival-delay group: number of flights and the mean absolute gap
# between the CRS elapsed time and the actual elapsed time.
dt2 <- airline.dt[
  ,
  .(
    N = .N,
    MeanDifferenceActualAndElapsedTime = mean(abs(CRSElapsedTime - ActualElapsedTime))
  ),
  by = .(ArrivalDelay = ArrDel15)
]
dt2
## ArrivalDelay N MeanDifferenceActualAndElapsedTime
## 1: 0 1452702 9.816577
## 2: 1 322116 14.716056
# Departure-delay flag (rows) against arrival-delay flag (columns), printed
# with row and column totals.
tab <- table(DepartureDelay = DepDel15, ArrivalDelay = ArrDel15)
addmargins(tab)
## ArrivalDelay
## DepartureDelay 0 1 Sum
## 0 1391657 82554 1474211
## 1 61045 239562 300607
## Sum 1452702 322116 1774818
# Each cell as a percentage of all flights
proptable <- round((tab / sum(tab)) * 100, digits = 2)
proptable
## ArrivalDelay
## DepartureDelay 0 1
## 0 78.41 4.65
## 1 3.44 13.50
#Insight:
# 78.41% flights departed and arrived without delay.
# 4.65% flights departed without delay but arrived with delay
# 3.44% flights departed with delay but arrived on time
# 13.50% flights departed and arrived with delay
# Second row of `tab` (delayed departures): % arriving on time / late
round(100 * prop.table(tab[2, ]), digits = 2)
## 0 1
## 20.31 79.69
#Insight:
# 79.69% of flights departing with delay resulted in delayed arrival
# 20.31% of flights departing with delay arrived on time
# Second column of `tab` (delayed arrivals): % that departed on time / late
round(100 * prop.table(tab[, 2]), digits = 2)
## 0 1
## 25.63 74.37
#Insight: 74.37% of delayed arrivals were preceded by a delayed departure
# 25.63% of delayed arrivals were preceded by a non-delayed departure
# Conditional density of the arrival-delay flag against the scheduled (CRS)
# departure hour. The CRS time is divided by 100 with integer division to get
# the hour: rounding the quotient would move times from hh:51 to hh:59 into
# the next hour.
cdplot(airline.dt$CRSDepTime %/% 100, airline.dt$ArrDel15, xlab = "CRS Departure time (hh)", ylab = "Arrival Delay", main = "Density plot of flights delay/non-delay status w.r.t CRS Departure time", col = c("lightblue", "red"))
#Insight : Density of delayed flights increases steadily for flights departing after the early morning hours until the early night hours. Density of delayed flights is lowest for flights departing around 5 am.
# Same plot against the scheduled (CRS) arrival hour
cdplot(airline.dt$CRSArrTime %/% 100, airline.dt$ArrDel15, xlab = "CRS Arrival time (hh)", ylab = "Arrival Delay", main = "Density plot of flights delay/non-delay status w.r.t CRS Arrival time", col = c("lightblue", "red"))
#Insight : Density of delayed flights increases steadily for flights arriving after the morning hours until the early morning of the next day, and starts to decrease around 2 am. Density of delayed flights is lowest for flights arriving around 8 am.
# Column percentages: within each (carrier code, DepDel15) column, the share
# of flights with ArrDel15 = 0 and ArrDel15 = 1.
round(
  100 * prop.table(
    ftable(Reporting_Airline + DepDel15 ~ ArrDel15, data = airline.df),
    margin = 2
  ),
  digits = 2
)
## Reporting_Airline 9E AA AS B6 DL EV F9 G4 HA MQ NK OH OO UA WN YV YX
## DepDel15 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1
## ArrDel15
## 0 95.20 15.84 92.94 19.74 92.38 22.07 93.31 15.86 95.87 28.87 91.99 9.44 92.20 19.42 95.28 16.10 95.85 18.63 93.21 15.63 95.53 21.11 95.82 19.65 93.88 14.92 93.00 16.83 96.54 26.98 92.50 10.15 93.20 16.23
## 1 4.80 84.16 7.06 80.26 7.62 77.93 6.69 84.14 4.13 71.13 8.01 90.56 7.80 80.58 4.72 83.90 4.15 81.37 6.79 84.37 4.47 78.89 4.18 80.35 6.12 85.08 7.00 83.17 3.46 73.02 7.50 89.85 6.80 83.77
#If departure is on time, arrival is most likely on time as well
#If departure is delayed, arrival is most likely delayed as well
#For carrier EV, the share of delayed arrivals after an on-time departure is the highest (8.01%)
# Column percentages: within each (airline, DepDel15) column, the share of
# flights with ArrDel15 = 0 and ArrDel15 = 1.
round(
  100 * prop.table(
    ftable(airline + DepDel15 ~ ArrDel15, data = airline.df),
    margin = 2
  ),
  digits = 2
)
## airline Alaska American Delta Frontier Hawaiian Jetblue Others Skywest Southwest Spirit United
## DepDel15 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1
## ArrDel15
## 0 92.38 22.07 92.94 19.74 95.87 28.87 92.20 19.42 95.85 18.63 93.31 15.86 93.86 14.80 93.88 14.92 96.54 26.98 95.53 21.11 93.00 16.83
## 1 7.62 77.93 7.06 80.26 4.13 71.13 7.80 80.58 4.15 81.37 6.69 84.14 6.14 85.20 6.12 85.08 3.46 73.02 4.47 78.89 7.00 83.17
#Southwest performs best when departure is on time: 96.54% of those flights also arrive on time
#Delta and Southwest recover best from a delayed departure (28.87% and 26.98% of those flights still arrive on time)
#Among the named carriers, Skywest has the highest probability of arriving late after a delayed departure (85.08%)
# Column percentages: within each (daywindow, DepDel15) column, the share of
# flights with ArrDel15 = 0 and ArrDel15 = 1.
round(
  100 * prop.table(
    ftable(daywindow + DepDel15 ~ ArrDel15, data = airline.df),
    margin = 2
  ),
  digits = 2
)
## daywindow Weekday Weekend
## DepDel15 0 1 0 1
## ArrDel15
## 0 94.12 19.54 95.17 22.61
## 1 5.88 80.46 4.83 77.39
# Flights that depart late make up the lost time slightly more often on weekends (22.61% still arrive on time) than on weekdays (19.54%)
# Row percentages: for each ArrDel15 value, how its flights are distributed
# over the (daywindow, DepDel15) combinations.
round(
  100 * prop.table(
    ftable(daywindow + DepDel15 ~ ArrDel15, data = airline.df),
    margin = 1
  ),
  digits = 2
)
## daywindow Weekday Weekend
## DepDel15 0 1 0 1
## ArrDel15
## 0 69.78 3.03 26.02 1.17
## 1 19.67 56.36 5.96 18.02
#On weekdays, delays are highest
# Column percentages: within each (daywindow, airline) column, the share of
# flights with ArrDel15 = 0 and ArrDel15 = 1.
round(
  100 * prop.table(
    ftable(daywindow + airline ~ ArrDel15, data = airline.df),
    margin = 2
  ),
  digits = 2
)
## daywindow Weekday Weekend
## airline Alaska American Delta Frontier Hawaiian Jetblue Others Skywest Southwest Spirit United Alaska American Delta Frontier Hawaiian Jetblue Others Skywest Southwest Spirit United
## ArrDel15
## 0 80.67 80.13 87.10 72.43 90.31 74.55 79.57 81.21 81.62 84.63 80.05 85.65 82.81 89.58 70.32 91.27 77.63 83.40 81.54 83.36 87.70 82.50
## 1 19.33 19.87 12.90 27.57 9.69 25.45 20.43 18.79 18.38 15.37 19.95 14.35 17.19 10.42 29.68 8.73 22.37 16.60 18.46 16.64 12.30 17.50
#on an avg, Weekend has less arrival delays
library(gplots)
##
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
##
## lowess
# Mean flight distance for each value of the arrival-delay flag
plotmeans(
  Distance ~ ArrDel15,
  data = airline.df,
  mean.labels = TRUE,
  col = "RED",
  frame = FALSE,
  main = "Mean plot for flight distance(in miles) , split by delay/non-delay status",
  xlab = "Arrival Delay",
  ylab = "Distance (in miles)"
)
## Warning in text.default(x, y, label = labels, col = col, ...): "frame" is
## not a graphical parameter
## Warning in plot.xy(xy.coords(x, y), type = type, ...): "frame" is not a
## graphical parameter
## Warning in axis(1, at = 1:length(means), labels = legends, ...): "frame" is
## not a graphical parameter
## Warning in plot.xy(xy.coords(x, y), type = type, ...): "frame" is not a
## graphical parameter
#Insight: Mean distance is almost similar for delayed and non-delayed flights
library(gplots)
# Mean arrival delay (minutes) for each value of the arrival-delay flag
plotmeans(
  ArrDelayMinutes ~ ArrDel15,
  data = airline.df,
  mean.labels = TRUE,
  col = "RED",
  frame = FALSE,
  main = "Mean plot for flight arrival delay(in minutes) , split by delay/non-delay flight status",
  xlab = "Arrival Delay",
  ylab = "Arrival delay (in minutes)"
)
## Warning in text.default(x, y, label = labels, col = col, ...): "frame" is
## not a graphical parameter
## Warning in arrows(x, li, x, pmax(y - gap, li), col = barcol, lwd = lwd, :
## zero-length arrow is of indeterminate angle and so skipped
## Warning in arrows(x, li, x, pmax(y - gap, li), col = barcol, lwd = lwd, :
## zero-length arrow is of indeterminate angle and so skipped
## Warning in arrows(x, ui, x, pmin(y + gap, ui), col = barcol, lwd = lwd, :
## zero-length arrow is of indeterminate angle and so skipped
## Warning in arrows(x, ui, x, pmin(y + gap, ui), col = barcol, lwd = lwd, :
## zero-length arrow is of indeterminate angle and so skipped
## Warning in plot.xy(xy.coords(x, y), type = type, ...): "frame" is not a
## graphical parameter
## Warning in axis(1, at = 1:length(means), labels = legends, ...): "frame" is
## not a graphical parameter
## Warning in plot.xy(xy.coords(x, y), type = type, ...): "frame" is not a
## graphical parameter
#Insight: Flight delayed are arriving late by a mean of around 60 minutes
library(gplots)
# Mean departure delay (minutes) for each value of the arrival-delay flag
plotmeans(
  DepDelayMinutes ~ ArrDel15,
  data = airline.df,
  mean.labels = TRUE,
  col = "RED",
  frame = FALSE,
  main = "Mean plot for flight Departure delay(in minutes) , split by delay/non-delay flight status",
  xlab = "Arrival Delay",
  ylab = "Departure delay (in minutes)"
)
## Warning in text.default(x, y, label = labels, col = col, ...): "frame" is
## not a graphical parameter
## Warning in arrows(x, li, x, pmax(y - gap, li), col = barcol, lwd = lwd, :
## zero-length arrow is of indeterminate angle and so skipped
## Warning in arrows(x, li, x, pmax(y - gap, li), col = barcol, lwd = lwd, :
## zero-length arrow is of indeterminate angle and so skipped
## Warning in arrows(x, ui, x, pmin(y + gap, ui), col = barcol, lwd = lwd, :
## zero-length arrow is of indeterminate angle and so skipped
## Warning in arrows(x, ui, x, pmin(y + gap, ui), col = barcol, lwd = lwd, :
## zero-length arrow is of indeterminate angle and so skipped
## Warning in plot.xy(xy.coords(x, y), type = type, ...): "frame" is not a
## graphical parameter
## Warning in axis(1, at = 1:length(means), labels = legends, ...): "frame" is
## not a graphical parameter
## Warning in plot.xy(xy.coords(x, y), type = type, ...): "frame" is not a
## graphical parameter
#Insight: Flight arriving late are delayed in departure with a mean of around 55 minutes
library(gplots)
# Mean absolute gap between CRS and actual elapsed time for each value of the
# arrival-delay flag. The y-axis label now closes its parenthesis.
plotmeans(
  abs(CRSElapsedTime - ActualElapsedTime) ~ ArrDel15,
  data = airline.df,
  mean.labels = TRUE,
  col = "RED",
  frame = FALSE,
  main = "Mean plot for difference between actual and CRS elapsed time split by arrival delay/non-delay flight status",
  xlab = "Arrival Delay",
  ylab = "Difference between actual and CRS elapsed time (in minutes)"
)
## Warning in text.default(x, y, label = labels, col = col, ...): "frame" is
## not a graphical parameter
## Warning in arrows(x, li, x, pmax(y - gap, li), col = barcol, lwd = lwd, :
## zero-length arrow is of indeterminate angle and so skipped
## Warning in arrows(x, li, x, pmax(y - gap, li), col = barcol, lwd = lwd, :
## zero-length arrow is of indeterminate angle and so skipped
## Warning in arrows(x, ui, x, pmin(y + gap, ui), col = barcol, lwd = lwd, :
## zero-length arrow is of indeterminate angle and so skipped
## Warning in arrows(x, ui, x, pmin(y + gap, ui), col = barcol, lwd = lwd, :
## zero-length arrow is of indeterminate angle and so skipped
## Warning in plot.xy(xy.coords(x, y), type = type, ...): "frame" is not a
## graphical parameter
## Warning in axis(1, at = 1:length(means), labels = legends, ...): "frame" is
## not a graphical parameter
## Warning in plot.xy(xy.coords(x, y), type = type, ...): "frame" is not a
## graphical parameter
#Insight: Mean difference between actual and CRS elapsed time is more for delayed flights by a quantum 5 minutes
# Quartiles of arrival delay (minutes). The argument is spelled `probs`; a
# misspelled name is silently absorbed by `...` and the default is used.
quantile(airline.df$ArrDelayMinutes, probs = seq(0, 1, 0.25))
## 0% 25% 50% 75% 100%
## 0 0 0 7 2153
# Upper tail: 99th and 99.9th percentiles
quantile(airline.df$ArrDelayMinutes, probs = c(0.99, 0.999))
## 99% 99.9%
## 169 503
# Box plot of the full arrival-delay distribution
boxplot(airline.df$ArrDelayMinutes)
# The arrival-delay distribution has a long right tail: the 75th percentile
# is 7 minutes, the 99th is 169 and the 99.9th is 503. Delays up to the 99th
# percentile (169 minutes) are treated as normal and anything above it as an
# outlier.
# Both subsets are derived from the same cut-off so that together they cover
# every row.
delay_cutoff <- 169
extreme_delay <- airline.df[which(airline.df$ArrDelayMinutes > delay_cutoff), ]
message("No of outliers: ", nrow(extreme_delay))
## (the earlier `> 170` filter printed: No of outliers: 17431)
non_extreme_delay <- airline.df[which(airline.df$ArrDelayMinutes <= delay_cutoff), ]
message("Actual Working Set length: ", nrow(non_extreme_delay))
## (the earlier `< 170` filter printed: Actual Working Set length: 1757175;
## with those two filters, 212 rows at exactly 170 minutes were in neither
## subset)
boxplot(non_extreme_delay$ArrDelayMinutes)
# Arrival delay against departure delay (outliers removed)
plot(
  x = non_extreme_delay$DepDelayMinutes,
  y = non_extreme_delay$ArrDelayMinutes,
  xlab = "Departure Delay Mins",
  ylab = "Arrival Delay Mins"
)
# As expected, a delayed departure goes together with a delayed arrival, and
# the trend is more or less linear.
# Arrival delay against flight distance (outliers removed)
plot(
  x = non_extreme_delay$Distance,
  y = non_extreme_delay$ArrDelayMinutes,
  xlab = "Distance",
  ylab = "Delay Minutes"
)
# No clear pattern between distance and delay. The low-distance area looks
# darker because most flights have a distance below 3000 (Distance is labelled
# in miles elsewhere in this script).
# Store the departure-delay flag as a factor in the working set.
# NOTE(review): the plot below groups by ArrDel15, not DepDel15 - confirm
# which column was meant to be converted.
non_extreme_delay[["DepDel15"]] <- factor(non_extreme_delay[["DepDel15"]])
# Distance distribution for each value of the arrival-delay flag
boxplot(
  Distance ~ ArrDel15,
  data = non_extreme_delay,
  col = c("gray", "lightblue"),
  main = "Boxplot for Distance grouped by Delay greater than 15 mins/Delay less than 15"
)
# The distance distribution looks the same for both groups, so distance does
# not seem to have much impact on delays.
# Structure of the origin-airport column
str(non_extreme_delay[["Origin"]])
## Factor w/ 350 levels "ABE","ABI","ABQ",..: 183 183 183 183 183 183 183 215 215 215 ...
# Origin has 350 levels, so the box plot cannot label every airport and no
# clear insight can be drawn from it.
# las = 2 is set once here and stays in effect for all four plots below.
par(las = 2)
boxplot(
  ArrDelayMinutes ~ Origin,
  data = non_extreme_delay,
  col = c("white", "red", "gray", "lightblue"),
  main = "Boxplot for Delay grouped by origin airport"
)
boxplot(
  ArrDelayMinutes ~ Dest,
  data = non_extreme_delay,
  col = c("white", "red", "gray", "lightblue"),
  main = "Boxplot for Delay grouped by dest airport"
)
boxplot(
  ArrDelayMinutes ~ Reporting_Airline,
  data = non_extreme_delay,
  col = c("white", "red", "gray", "lightblue"),
  main = "Boxplot for Delay grouped by airlines"
)
boxplot(
  Distance ~ Reporting_Airline,
  data = non_extreme_delay,
  col = c("white", "red", "gray", "lightblue"),
  main = "Boxplot for Distance grouped by airlines"
)
# From the graph, Alaska Airlines (AS) and United Airlines (UA) appear to have
# more long-distance flights than the other carriers.
# Hence they seem to be long-haul carriers.
# Flight distance for each value of the arrival-delay flag (all flights)
boxplot(
  Distance ~ ArrDel15,
  col = c("lightblue", "red"),
  main = "Flight distance(in miles) , split by delay/non-delay status",
  xlab = "Arrival Delay",
  ylab = "Distance(in miles)"
)
#Insight: No major difference in the distances travelled by delayed and non-delayed flights
# Correlation matrix of the continuous variables, selected by column name
cor(airline.df[, c("DepDelayMinutes", "ArrDelayMinutes", "ActualElapsedTime",
                   "AirTime", "Distance")])
## DepDelayMinutes ArrDelayMinutes ActualElapsedTime
## DepDelayMinutes 1.000000000 0.9749606618 0.01216335
## ArrDelayMinutes 0.974960662 1.0000000000 0.03899857
## ActualElapsedTime 0.012163348 0.0389985688 1.00000000
## AirTime 0.004144378 0.0102041639 0.98788870
## Distance 0.004538660 -0.0005893142 0.96674226
## AirTime Distance
## DepDelayMinutes 0.004144378 0.0045386600
## ArrDelayMinutes 0.010204164 -0.0005893142
## ActualElapsedTime 0.987888697 0.9667422574
## AirTime 1.000000000 0.9812425334
## Distance 0.981242533 1.0000000000
#Plotting Correlation Matrix
#install.packages("PerformanceAnalytics")
library(PerformanceAnalytics)
## Warning: package 'PerformanceAnalytics' was built under R version 3.6.1
## Loading required package: xts
## Warning: package 'xts' was built under R version 3.6.1
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
## Registered S3 method overwritten by 'xts':
## method from
## as.zoo.xts zoo
##
## Attaching package: 'xts'
## The following objects are masked from 'package:data.table':
##
## first, last
##
## Attaching package: 'PerformanceAnalytics'
## The following object is masked from 'package:gplots':
##
## textplot
## The following object is masked from 'package:graphics':
##
## legend
# Correlation chart (with histograms) for the same continuous variables
chart.Correlation(
  airline.df[, c("DepDelayMinutes", "ArrDelayMinutes", "ActualElapsedTime",
                 "AirTime", "Distance")],
  histogram = TRUE,
  pch = 19
)
# Arrival delay and departure delay are strongly correlated, as expected.
# Train data 75% of the sample size
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
# Size of the training sample: 75% of the rows, rounded down
smp_size <- floor(nrow(air_line.df) * 0.75)
## fix the seed so that the partition is reproducible
set.seed(123)
train_ind <- sample.int(nrow(air_line.df), size = smp_size)
# Training rows are the sampled indices; the test set is everything else
data_train <- air_line.df[train_ind, ]
data_test <- air_line.df[-train_ind, ]
# Univariate screening on the numeric / binary columns 19:33 of data_train.
# For each candidate a one-predictor model of ArrDel15 is fitted and its AUC,
# Gini (2 * AUC - 1) and the p-value of the slope are recorded. Other
# categorical variables such as airline would need binary indicators or a
# log-odds transformation first.
#
# The formula is built from the column name with reformulate(), so predict()
# really uses the data passed as `newdata`. With a hard-coded
# `data_train[, i]` term the `newdata` argument is ignored.
# NOTE(review): family = gaussian fits a linear probability model to a 0/1
# outcome; confirm whether binomial was intended (the p-values would change).
predictors <- colnames(data_train)[19:33]
area_curve <- numeric(length(predictors))
P_value <- numeric(length(predictors))
for (k in seq_along(predictors)) {
  i <- predictors[k]
  model_uni <- glm(reformulate(i, response = "ArrDel15"),
                   family = gaussian, data = data_train)
  y_glm_uni <- predict(model_uni, newdata = data_train)
  roc_model_uni <- roc(data_train$ArrDel15, y_glm_uni)
  area_curve[k] <- as.numeric(roc_model_uni$auc)
  # row 2 = the predictor's coefficient, column 4 = its p-value
  P_value[k] <- summary(model_uni)$coefficients[2, 4]
}
# Results are stored last-variable-first, the same order as before.
name <- rev(predictors)
area_curve <- rev(area_curve)
P_value <- rev(P_value)
univ_gini <- 2 * area_curve - 1
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
# Collect the screening results in one table and print it
Univ_summary <- setNames(
  data.frame(name, area_curve, univ_gini, P_value),
  c("Variable", "AUC", "Gini", "P value")
)
Univ_summary
## Variable AUC Gini P value
## 1 Others 0.5090174 0.0180347354 1.271778e-71
## 2 Skywest 0.5020255 0.0040509007 6.071772e-09
## 3 Jetblue 0.5093462 0.0186923365 0.000000e+00
## 4 Alaska 0.5002999 0.0005997697 1.475974e-01
## 5 United 0.5035779 0.0071557219 5.053364e-29
## 6 Delta 0.5259673 0.0519345980 0.000000e+00
## 7 American 0.5042746 0.0085491679 4.751915e-30
## 8 Southwest 0.5019742 0.0039484947 7.665062e-06
## 9 PMARR 0.5506132 0.1012263755 0.000000e+00
## 10 PMDEP 0.5629169 0.1258338424 0.000000e+00
## 11 WEEKEND 0.5159882 0.0319764157 2.608621e-227
## 12 Distance 0.5116839 0.0233678503 1.744667e-27
## 13 AirTime 0.5266220 0.0532439737 4.072977e-282
## 14 ActualElapsedTime 0.5663549 0.1327098172 0.000000e+00
## 15 CRSElapsedTime 0.5120729 0.0241458889 1.257214e-31
# This analysis gives an initial indication that the variables "PMARR", "PMDEP" and "Delta" can be potentially strong predictors of flight delay. All three are also statistically significant, which suggests the relationship is statistically valid. We will probably need to create new variables or transformations to develop higher predictive power in the model.