Loading necessary libraries
require(hflights)
require(ggplot2)
require(plyr)
Creating a subset “subflights”
subflights = subset(hflights,select=c("Year","Month","DayofMonth","UniqueCarrier", "DepDelay", "Cancelled","CancellationCode"))
Creating a new column that will store Date
subflights$Date <- paste(subflights$Year, subflights$Month, subflights$DayofMonth, sep ="")
head(subflights)
## Year Month DayofMonth UniqueCarrier DepDelay Cancelled
## 5424 2011 1 1 AA 0 0
## 5425 2011 1 2 AA 1 0
## 5426 2011 1 3 AA -8 0
## 5427 2011 1 4 AA 3 0
## 5428 2011 1 5 AA 5 0
## 5429 2011 1 6 AA -1 0
## CancellationCode Date
## 5424 201111
## 5425 201112
## 5426 201113
## 5427 201114
## 5428 201115
## 5429 201116
Removing unneeded Columns
subflights$Year = NULL
subflights$DayofMonth = NULL
subflights$Month = NULL
head(subflights)
## UniqueCarrier DepDelay Cancelled CancellationCode Date
## 5424 AA 0 0 201111
## 5425 AA 1 0 201112
## 5426 AA -8 0 201113
## 5427 AA 3 0 201114
## 5428 AA 5 0 201115
## 5429 AA -1 0 201116
Looking at Cancelation data by Carriers in order to see which carrier has the highest flight cancealation rate due to Carrier Fault.
cancelations=subflights[subflights$CancellationCode =='A',]
cancel = count(cancelations$UniqueCarrier)
flights = count(subflights$UniqueCarrier)
Cleaning Dataset and binding the data
flights = flights[-c(2),]
rownames(flights) <- seq(length=nrow(flights))
data=cbind(flights,cancel$freq)
colnames(data) = c("carriers","flights","cancelations")
Calculating Cancelation rate per Airline and displaying final dataset
data$cancelrate=round(data$cancelations/data$flights, digits=4)
data
## carriers flights cancelations cancelrate
## 1 AA 3244 20 0.0062
## 2 B6 695 5 0.0072
## 3 CO 70032 37 0.0005
## 4 DL 2641 13 0.0049
## 5 EV 2204 60 0.0272
## 6 F9 838 2 0.0024
## 7 FL 2139 8 0.0037
## 8 MQ 4648 39 0.0084
## 9 OO 16061 121 0.0075
## 10 UA 2072 21 0.0101
## 11 US 4082 27 0.0066
## 12 WN 45343 517 0.0114
## 13 XE 73053 331 0.0045
## 14 YV 79 1 0.0127
Plotting Cancellation Rate by Airlines
ggplot(data, aes(x=carriers, y=cancelrate)) + geom_bar(stat="identity") + xlab('Airlines') +ylab('Cancelation Rate due to Airline')
Comparing Mean Departure Delay by Airlines
DepD = aggregate(DepDelay~UniqueCarrier, data=hflights,mean)
ggplot(DepD, aes(x=UniqueCarrier, y=DepDelay)) + geom_bar(stat="identity")
Comparing Mean Arrival Delay by Airlines
ArrD = aggregate(ArrDelay~UniqueCarrier, data=hflights,mean)
ggplot(ArrD, aes(x=UniqueCarrier, y=ArrDelay)) + geom_bar(stat="identity")
Comparing Mean Departure Delay Times in 2 Airports
OriginDepDelay = aggregate(DepDelay~Origin, data=hflights,mean)
OriginDepDelay
## Origin DepDelay
## 1 HOU 12.837873
## 2 IAH 8.436951
Comparng Mean Arrival Delay Times in 2 Airports
OriginArrDelay = aggregate(ArrDelay~Origin, data=hflights,mean)
OriginArrDelay
## Origin ArrDelay
## 1 HOU 7.487946
## 2 IAH 6.977301
Comparing Mean Taxin Times in 2 Airports
aggregate(TaxiIn~Origin, data=hflights,mean)
## Origin TaxiIn
## 1 HOU 5.150205
## 2 IAH 6.380680
Comparing Mean Taxiout Times in 2 Airports
aggregate(TaxiOut~Origin, data=hflights,mean)
## Origin TaxiOut
## 1 HOU 9.123702
## 2 IAH 16.864243