Final Week5

Prepare the environment and Data

library("ggplot2")
library("hflights")

# Load the hflights data set into the workspace and print a compact summary
# of its structure (dimensions, column names, column types, first values).
data("hflights")
str(object = hflights)
## 'data.frame':    227496 obs. of  21 variables:
##  $ Year             : int  2011 2011 2011 2011 2011 2011 2011 2011 2011 2011 ...
##  $ Month            : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ DayofMonth       : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ DayOfWeek        : int  6 7 1 2 3 4 5 6 7 1 ...
##  $ DepTime          : int  1400 1401 1352 1403 1405 1359 1359 1355 1443 1443 ...
##  $ ArrTime          : int  1500 1501 1502 1513 1507 1503 1509 1454 1554 1553 ...
##  $ UniqueCarrier    : chr  "AA" "AA" "AA" "AA" ...
##  $ FlightNum        : int  428 428 428 428 428 428 428 428 428 428 ...
##  $ TailNum          : chr  "N576AA" "N557AA" "N541AA" "N403AA" ...
##  $ ActualElapsedTime: int  60 60 70 70 62 64 70 59 71 70 ...
##  $ AirTime          : int  40 45 48 39 44 45 43 40 41 45 ...
##  $ ArrDelay         : int  -10 -9 -8 3 -3 -7 -1 -16 44 43 ...
##  $ DepDelay         : int  0 1 -8 3 5 -1 -1 -5 43 43 ...
##  $ Origin           : chr  "IAH" "IAH" "IAH" "IAH" ...
##  $ Dest             : chr  "DFW" "DFW" "DFW" "DFW" ...
##  $ Distance         : int  224 224 224 224 224 224 224 224 224 224 ...
##  $ TaxiIn           : int  7 6 5 9 9 6 12 7 8 6 ...
##  $ TaxiOut          : int  13 9 17 22 9 13 15 12 22 19 ...
##  $ Cancelled        : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ CancellationCode : chr  "" "" "" "" ...
##  $ Diverted         : int  0 0 0 0 0 0 0 0 0 0 ...
# Preview the first ten rows of the raw data.
head(x = hflights, n = 10L)
##      Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier FlightNum TailNum ActualElapsedTime AirTime ArrDelay DepDelay Origin Dest Distance TaxiIn TaxiOut Cancelled CancellationCode Diverted
## 5424 2011     1          1         6    1400    1500            AA       428  N576AA                60      40      -10        0    IAH  DFW      224      7      13         0                         0
## 5425 2011     1          2         7    1401    1501            AA       428  N557AA                60      45       -9        1    IAH  DFW      224      6       9         0                         0
## 5426 2011     1          3         1    1352    1502            AA       428  N541AA                70      48       -8       -8    IAH  DFW      224      5      17         0                         0
## 5427 2011     1          4         2    1403    1513            AA       428  N403AA                70      39        3        3    IAH  DFW      224      9      22         0                         0
## 5428 2011     1          5         3    1405    1507            AA       428  N492AA                62      44       -3        5    IAH  DFW      224      9       9         0                         0
## 5429 2011     1          6         4    1359    1503            AA       428  N262AA                64      45       -7       -1    IAH  DFW      224      6      13         0                         0
## 5430 2011     1          7         5    1359    1509            AA       428  N493AA                70      43       -1       -1    IAH  DFW      224     12      15         0                         0
## 5431 2011     1          8         6    1355    1454            AA       428  N477AA                59      40      -16       -5    IAH  DFW      224      7      12         0                         0
## 5432 2011     1          9         7    1443    1554            AA       428  N476AA                71      41       44       43    IAH  DFW      224      8      22         0                         0
## 5433 2011     1         10         1    1443    1553            AA       428  N504AA                70      45       43       43    IAH  DFW      224      6      19         0                         0
# Build the working data set in a single step: copy hflights, keep only the
# rows for cancelled flights (Cancelled == 1), and keep only the six columns
# used by the rest of the analysis.
myData <- subset(
  data.frame(hflights),
  subset = Cancelled == 1,
  select = c(UniqueCarrier, FlightNum, Distance, Cancelled, CancellationCode, Dest)
)

# Distinct carriers that appear among the cancelled flights.
cancelled_carriers <- unique(myData[["UniqueCarrier"]])
cancelled_carriers
##  [1] "AA" "B6" "CO" "DL" "OO" "UA" "US" "WN" "EV" "FL" "MQ" "XE" "F9" "YV"
# Number of carriers
length(cancelled_carriers)
## [1] 14
# Distinct cancellation codes that appear among the cancelled flights.
cancellation_codes <- unique(myData[["CancellationCode"]])
cancellation_codes
## [1] "A" "B" "C" "D"
# Number of reasons for cancellations
length(cancellation_codes)
## [1] 4
# Replace each one-letter cancellation code with a readable label by looking
# the code up in a named character vector (code -> label).
cancel_reason_labels <- c(
  "A" = "Carrier",
  "B" = "Weather",
  "C" = "Nat air sys",
  "D" = "Security"
)
myData$CancellationCode <- cancel_reason_labels[as.character(myData$CancellationCode)]

# Display the modified data
head(x = myData, n = 10L)
##       UniqueCarrier FlightNum Distance Cancelled CancellationCode Dest
## 33074            AA      1700      964         1          Carrier  MIA
## 35264            AA      1820      224         1          Weather  DFW
## 63546            B6       624     1428         1          Weather  JFK
## 67826            B6       624     1428         1          Carrier  JFK
## 72078            B6       624     1428         1          Weather  JFK
## 74874            CO       442      429         1          Weather  TUL
## 74903            CO       500      845         1          Weather  IND
## 75248            CO      1711      395         1          Weather  OKC
## 76105            CO       408     1400         1          Carrier  EWR
## 76550            CO       158     1208         1          Weather  DCA

Data exploration Using plots and Subsetting

# Start by aggregating the data by cancellation reason.
# myData was filtered to Cancelled == 1 earlier in the script, so summing
# Cancelled per group gives the number of cancelled flights per reason.
# The formula uses the bare column name (resolved through `data`), so the
# result column is called "Cancelled" instead of "myData$Cancelled".
xReasons <- aggregate(Cancelled ~ CancellationCode, data = myData, FUN = sum)

head(xReasons, n = 5)
##   CancellationCode Cancelled
## 1          Carrier      1202
## 2      Nat air sys       118
## 3         Security         1
## 4          Weather      1652
# Aesthetics refer to columns of the plot's data by bare name; ggplot2
# discourages `df$col` inside aes() because it bypasses the data argument.
ggplot(xReasons, aes(x = CancellationCode, y = Cancelled)) +
  geom_bar(stat = "identity", fill = "steelblue") +
  # Print the count just above each bar.
  geom_text(aes(label = Cancelled), vjust = -0.2) +
  theme(legend.position = "none") +
  ylab("") +
  xlab("") +
  ggtitle("Reasons for Cancelations")

##The previous plot revealed that top cancelation reasons are weather and carrier related.
       
# Number of cancelled flights per carrier. Every row of myData is a cancelled
# flight (Cancelled == 1), so the per-group sum is a count.
# The bare column name in the formula keeps the result column named
# "Cancelled" rather than "myData$Cancelled".
Gcarrier <- aggregate(Cancelled ~ UniqueCarrier, data = myData, FUN = sum)

# Aesthetics refer to columns of the plot's data by bare name; ggplot2
# discourages `df$col` inside aes() because it bypasses the data argument.
ggplot(Gcarrier, aes(x = UniqueCarrier, y = Cancelled)) +
  geom_bar(stat = "identity", fill = "steelblue") +
  # Print the count just above each bar.
  geom_text(aes(label = Cancelled), vjust = -0.2) +
  theme(legend.position = "none") +
  ylab("") +
  xlab("") +
  ggtitle("Cancelations by Carriers")

# Carriers XE, WN, and CO account for the highest number of cancelations. I will proceed to investigate carrier XE.

# Subset the data to investigate carrier XE.
# NOTE: this overwrites myData, so every later step only sees XE's
# cancelled flights.
myData <- subset(myData, UniqueCarrier == "XE")

# Number of XE cancellations per reason. The bare column name in the formula
# keeps the result column named "Cancelled" rather than "myData$Cancelled".
xEC <- aggregate(Cancelled ~ CancellationCode, data = myData, FUN = sum)

# Aesthetics refer to columns of the plot's data by bare name; ggplot2
# discourages `df$col` inside aes() because it bypasses the data argument.
ggplot(xEC, aes(x = CancellationCode, y = Cancelled)) +
  geom_bar(stat = "identity", fill = "steelblue") +
  # Print the count just above each bar.
  geom_text(aes(label = Cancelled), vjust = -0.2) +
  theme(legend.position = "none") +
  ylab("") +
  xlab("") +
  ggtitle("Cancelations for Carrier XE")

  #After Plotting carrier XE, we find that 66% of cancelations are due to weather.

# Filter the data in order to investigate flights canceled due to weather
# for carrier XE.
# NOTE: this overwrites myData again; from here on it holds only XE flights
# whose cancellation reason is "Weather".
myData <- subset(myData, CancellationCode == "Weather")

# Aggregate the weather cancellations by destination. The bare column name in
# the formula keeps the result column named "Cancelled" rather than
# "myData$Cancelled".
desT <- aggregate(Cancelled ~ Dest, data = myData, FUN = sum)

# head() shows the first ten rows in the order aggregate() returns them
# (alphabetical by Dest in the output below), not the ten largest counts.
head(desT, n = 10)
##    Dest Cancelled
## 1   ABQ         7
## 2   AEX         7
## 3   AMA        22
## 4   ATL        11
## 5   AUS         4
## 6   AVL         2
## 7   BHM        19
## 8   BNA        15
## 9   BRO        18
## 10  BTR        16
# Keep only the top ten destinations by number of cancelations.
# The list is hard-coded; NOTE(review): presumably it was read off a sorted
# view of the per-destination counts - confirm if the data changes.
# %in% replaces the chain of `Dest == ... |` comparisons and selects the
# same rows.
top_destinations <- c(
  "DAL", "LFT", "CRP", "ICT", "AMA",
  "XNA", "HRL", "TUL", "MOB", "MCI"
)
myData <- subset(myData, Dest %in% top_destinations)

# Plot the top ten cancelation destinations due to weather, with one facet
# per flight distance.
# Aesthetics must use bare column names here: facet_wrap() splits the plot's
# data into panels, and vectors pulled in with `myData$...` bypass that data,
# so they are not guaranteed to stay aligned with the per-panel rows.
# Each row has Cancelled == 1, so stacking the identity bars yields the
# number of cancelled flights per destination within each facet.
ggplot(myData, aes(x = Dest, y = Cancelled)) +
  geom_bar(stat = "identity", fill = "steelblue") +
  facet_wrap(~Distance, ncol = 6) +
  theme(legend.position = "none") +
  ylab("") +
  xlab("") +
  ggtitle("Top Destinations Cancelations due to weather Group by Distance(M)")

Conclusion

It was interesting to find that many of the flights cancelled due to weather were within a distance of 201 miles. The hflights data was quite interesting. I would have liked to have a column indicating domestic and international flights.