library("ggplot2")
library("hflights")
# Load the hflights dataset into the workspace and print its structure.
data("hflights")
str(object = hflights)
## 'data.frame': 227496 obs. of 21 variables:
## $ Year : int 2011 2011 2011 2011 2011 2011 2011 2011 2011 2011 ...
## $ Month : int 1 1 1 1 1 1 1 1 1 1 ...
## $ DayofMonth : int 1 2 3 4 5 6 7 8 9 10 ...
## $ DayOfWeek : int 6 7 1 2 3 4 5 6 7 1 ...
## $ DepTime : int 1400 1401 1352 1403 1405 1359 1359 1355 1443 1443 ...
## $ ArrTime : int 1500 1501 1502 1513 1507 1503 1509 1454 1554 1553 ...
## $ UniqueCarrier : chr "AA" "AA" "AA" "AA" ...
## $ FlightNum : int 428 428 428 428 428 428 428 428 428 428 ...
## $ TailNum : chr "N576AA" "N557AA" "N541AA" "N403AA" ...
## $ ActualElapsedTime: int 60 60 70 70 62 64 70 59 71 70 ...
## $ AirTime : int 40 45 48 39 44 45 43 40 41 45 ...
## $ ArrDelay : int -10 -9 -8 3 -3 -7 -1 -16 44 43 ...
## $ DepDelay : int 0 1 -8 3 5 -1 -1 -5 43 43 ...
## $ Origin : chr "IAH" "IAH" "IAH" "IAH" ...
## $ Dest : chr "DFW" "DFW" "DFW" "DFW" ...
## $ Distance : int 224 224 224 224 224 224 224 224 224 224 ...
## $ TaxiIn : int 7 6 5 9 9 6 12 7 8 6 ...
## $ TaxiOut : int 13 9 17 22 9 13 15 12 22 19 ...
## $ Cancelled : int 0 0 0 0 0 0 0 0 0 0 ...
## $ CancellationCode : chr "" "" "" "" ...
## $ Diverted : int 0 0 0 0 0 0 0 0 0 0 ...
# Preview the first ten rows of the dataset.
head(x = hflights, n = 10L)
## Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier FlightNum TailNum ActualElapsedTime AirTime ArrDelay DepDelay Origin Dest Distance TaxiIn TaxiOut Cancelled CancellationCode Diverted
## 5424 2011 1 1 6 1400 1500 AA 428 N576AA 60 40 -10 0 IAH DFW 224 7 13 0 0
## 5425 2011 1 2 7 1401 1501 AA 428 N557AA 60 45 -9 1 IAH DFW 224 6 9 0 0
## 5426 2011 1 3 1 1352 1502 AA 428 N541AA 70 48 -8 -8 IAH DFW 224 5 17 0 0
## 5427 2011 1 4 2 1403 1513 AA 428 N403AA 70 39 3 3 IAH DFW 224 9 22 0 0
## 5428 2011 1 5 3 1405 1507 AA 428 N492AA 62 44 -3 5 IAH DFW 224 9 9 0 0
## 5429 2011 1 6 4 1359 1503 AA 428 N262AA 64 45 -7 -1 IAH DFW 224 6 13 0 0
## 5430 2011 1 7 5 1359 1509 AA 428 N493AA 70 43 -1 -1 IAH DFW 224 12 15 0 0
## 5431 2011 1 8 6 1355 1454 AA 428 N477AA 59 40 -16 -5 IAH DFW 224 7 12 0 0
## 5432 2011 1 9 7 1443 1554 AA 428 N476AA 71 41 44 43 IAH DFW 224 8 22 0 0
## 5433 2011 1 10 1 1443 1553 AA 428 N504AA 70 45 43 43 IAH DFW 224 6 19 0 0
# Build the working data set: cancelled flights only (Cancelled == 1),
# restricted to the columns used in the rest of the analysis.
myData <- subset(
  data.frame(hflights),
  subset = Cancelled == 1,
  select = c(
    UniqueCarrier, FlightNum, Distance,
    Cancelled, CancellationCode, Dest
  )
)

# Carriers that appear among the cancelled flights.
unique(myData[["UniqueCarrier"]])
## [1] "AA" "B6" "CO" "DL" "OO" "UA" "US" "WN" "EV" "FL" "MQ" "XE" "F9" "YV"
# Number of carriers.
length(unique(myData[["UniqueCarrier"]]))
## [1] 14

# Cancellation codes that appear among the cancelled flights.
unique(myData[["CancellationCode"]])
## [1] "A" "B" "C" "D"
# Number of distinct cancellation reasons.
length(unique(myData[["CancellationCode"]]))
## [1] 4

# Replace each one-letter cancellation code with a readable label by
# indexing a named lookup vector with the code.
myData$CancellationCode <- c(
  A = "Carrier",
  B = "Weather",
  C = "Nat air sys",
  D = "Security"
)[as.character(myData$CancellationCode)]

# Display the modified data.
head(x = myData, n = 10L)
## UniqueCarrier FlightNum Distance Cancelled CancellationCode Dest
## 33074 AA 1700 964 1 Carrier MIA
## 35264 AA 1820 224 1 Weather DFW
## 63546 B6 624 1428 1 Weather JFK
## 67826 B6 624 1428 1 Carrier JFK
## 72078 B6 624 1428 1 Weather JFK
## 74874 CO 442 429 1 Weather TUL
## 74903 CO 500 845 1 Weather IND
## 75248 CO 1711 395 1 Weather OKC
## 76105 CO 408 1400 1 Carrier EWR
## 76550 CO 158 1208 1 Weather DCA
# Start by aggregating the data by cancellation reason. Every row of myData
# has Cancelled == 1, so summing Cancelled counts the flights per reason.
# The formula names the column directly (not myData$Cancelled) so that it is
# resolved through `data =` and the result column is simply `Cancelled`.
xReasons <- aggregate(Cancelled ~ CancellationCode, data = myData, FUN = sum)
head(xReasons, n = 5)
## CancellationCode Cancelled
## 1 Carrier 1202
## 2 Nat air sys 118
## 3 Security 1
## 4 Weather 1652

# Bar chart of cancellations per reason, with the count printed above each
# bar. Columns are referenced by bare name inside aes() so they are looked up
# in the data given to ggplot() instead of in the global xReasons object.
ggplot(xReasons, aes(x = CancellationCode, y = Cancelled)) +
  geom_bar(stat = "identity", fill = "steelblue") +
  geom_text(aes(label = Cancelled), vjust = -0.2) +
  theme(legend.position = "none") +
  ylab("") +
  xlab("") +
  ggtitle("Reasons for Cancelations")
# The plot reveals that the top cancellation reasons are weather and carrier
# related.
# Count cancelled flights per carrier. The formula names the column directly
# (not myData$Cancelled) so the result column is simply `Cancelled`.
Gcarrier <- aggregate(Cancelled ~ UniqueCarrier, data = myData, FUN = sum)

# Bar chart of cancellations per carrier, with the count printed above each
# bar. Columns are referenced by bare name inside aes() so they are looked up
# in the data given to ggplot() instead of in the global Gcarrier object.
ggplot(Gcarrier, aes(x = UniqueCarrier, y = Cancelled)) +
  geom_bar(stat = "identity", fill = "steelblue") +
  geom_text(aes(label = Cancelled), vjust = -0.2) +
  theme(legend.position = "none") +
  ylab("") +
  xlab("") +
  ggtitle("Cancelations by Carriers")
# Carriers XE, WN and CO account for the highest numbers of cancellations.
# I will proceed to investigate carrier XE.
# Subset the data to investigate carrier XE only.
myData <- subset(myData, UniqueCarrier == "XE")

# Count carrier XE's cancelled flights per cancellation reason. The formula
# names the column directly (not myData$Cancelled) so the result column is
# simply `Cancelled`.
xEC <- aggregate(Cancelled ~ CancellationCode, data = myData, FUN = sum)

# Bar chart of carrier XE's cancellations per reason. Columns are referenced
# by bare name inside aes() so they are looked up in the data given to
# ggplot() instead of in the global xEC object.
ggplot(xEC, aes(x = CancellationCode, y = Cancelled)) +
  geom_bar(stat = "identity", fill = "steelblue") +
  geom_text(aes(label = Cancelled), vjust = -0.2) +
  theme(legend.position = "none") +
  ylab("") +
  xlab("") +
  ggtitle("Cancelations for Carrier XE")
# After plotting carrier XE, we find that 66% of cancellations are due to
# weather.
# Filter the data in order to investigate flights cancelled due to weather
# for carrier XE.
myData <- subset(myData, CancellationCode == "Weather")

# Aggregate the weather cancellations by destination. The formula names the
# column directly (not myData$Cancelled) so the result column is simply
# `Cancelled`.
desT <- aggregate(Cancelled ~ Dest, data = myData, FUN = sum)
head(desT, n = 10)
## Dest Cancelled
## 1 ABQ 7
## 2 AEX 7
## 3 AMA 22
## 4 ATL 11
## 5 AUS 4
## 6 AVL 2
## 7 BHM 19
## 8 BNA 15
## 9 BRO 18
## 10 BTR 16
# Keep the top ten destinations by number of cancellations.
# NOTE(review): the destination list is hard-coded; presumably it was read
# off the per-destination counts in desT - confirm it still matches the data.
myData <- subset(
  myData,
  Dest %in% c(
    "DAL", "LFT", "CRP", "ICT", "AMA",
    "XNA", "HRL", "TUL", "MOB", "MCI"
  )
)

# Plot the top ten cancellation destinations due to weather, with one panel
# per flight distance. Columns are referenced by bare name inside aes() so
# that each facet panel uses its own rows of the data; referencing
# myData$Dest would bypass the data that ggplot2 hands to each layer.
ggplot(myData, aes(x = Dest, y = Cancelled)) +
  geom_bar(stat = "identity", fill = "steelblue") +
  facet_wrap(~Distance, ncol = 6) +
  theme(legend.position = "none") +
  ylab("") +
  xlab("") +
  ggtitle("Top Destinations Cancelations due to weather Group by Distance(M)")
# It was interesting to find that many of the flights cancelled by weather
# were within a distance of 201 miles. The hflights data was quite
# interesting. I would have liked to have a column indicating domestic and
# international flights.