library(hflights)
library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
##
## The following objects are masked from 'package:stats':
##
## filter, lag
##
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
The days of the week were stored as numbers, so rather than having to remember which number corresponds to which day, I replaced the DayOfWeek values with the matching day names.
# Translate the numeric day-of-week codes (1 = Monday ... 7 = Sunday) into day
# names with a single exact-match lookup instead of seven regex passes.
day_names <- c(
  "Monday", "Tuesday", "Wednesday", "Thursday",
  "Friday", "Saturday", "Sunday"
)
day_codes <- as.character(hflights$DayOfWeek)
# match() compares whole values, so only the exact codes "1".."7" are
# translated; gsub() would also rewrite digits embedded in longer values.
day_idx <- match(day_codes, as.character(seq_along(day_names)))
is_known <- !is.na(day_idx)
# Anything that is not a known code (NA, or names from an earlier run of this
# chunk) is left as it is.
day_codes[is_known] <- day_names[day_idx[is_known]]
hflights$DayOfWeek <- day_codes
# Assemble a month/day/year string for every flight and parse it into a Date
# column. The string is parsed directly; storing it in hflights$Date first was
# a dead store that the parsed value immediately overwrote.
flight_date <- paste(
  hflights$Month, hflights$DayofMonth, hflights$Year,
  sep = "/"
)
hflights$Date <- as.Date(flight_date, format = "%m/%d/%Y")
Now that I have some new columns, and some existing columns that I’m not interested in, I selected just the columns I want into a new data frame.
# Keep only the columns used in the rest of the analysis, in this order.
keep_cols <- c(
  "Date", "DayOfWeek", "UniqueCarrier", "FlightNum", "TailNum",
  "TaxiIn", "TaxiOut", "AirTime", "ArrDelay", "DepDelay",
  "ActualElapsedTime", "Origin", "Dest", "Distance"
)
# drop = FALSE guarantees a data frame comes back, with row names intact.
myflights <- hflights[, keep_cols, drop = FALSE]
I created a combined field by dividing the total distance of the flight by the “Actual Elapsed Time” field. This gave me a number I could use to convey the range of time it took to fly the same distance.
# Distance covered per minute of actual elapsed time, computed row by row and
# stored both as a standalone vector and as a new column.
dis_per_min <- with(myflights, Distance / ActualElapsedTime)
myflights[["mi_per_min"]] <- dis_per_min
Here is a preview of my new data frame:
# Show the first six rows of the trimmed data frame.
head(myflights, n = 6L)
## Date DayOfWeek UniqueCarrier FlightNum TailNum TaxiIn TaxiOut
## 5424 2011-01-01 Saturday AA 428 N576AA 7 13
## 5425 2011-01-02 Sunday AA 428 N557AA 6 9
## 5426 2011-01-03 Monday AA 428 N541AA 5 17
## 5427 2011-01-04 Tuesday AA 428 N403AA 9 22
## 5428 2011-01-05 Wednesday AA 428 N492AA 9 9
## 5429 2011-01-06 Thursday AA 428 N262AA 6 13
## AirTime ArrDelay DepDelay ActualElapsedTime Origin Dest Distance
## 5424 40 -10 0 60 IAH DFW 224
## 5425 45 -9 1 60 IAH DFW 224
## 5426 48 -8 -8 70 IAH DFW 224
## 5427 39 3 3 70 IAH DFW 224
## 5428 44 -3 5 62 IAH DFW 224
## 5429 45 -7 -1 64 IAH DFW 224
## mi_per_min
## 5424 3.733333
## 5425 3.733333
## 5426 3.200000
## 5427 3.200000
## 5428 3.612903
## 5429 3.500000
I wanted to see if there was any pattern to the distance traveled, the flight time, and the day of the week.
# Scatter plot of elapsed time against distance, coloured by day of the week.
# Written with ggplot() because qplot() is deprecated in current ggplot2; the
# mappings, point geom and title are the ones qplot() produced.
ggplot(
  data = myflights,
  aes(x = Distance, y = ActualElapsedTime, colour = DayOfWeek)
) +
  geom_point() +
  ggtitle("Total Flight Time by Distance")
## Warning: Removed 3622 rows containing missing values (geom_point).
What about departure delays by day of the week? Looks like the days with greater amounts of delays simply have more flights.
# Histogram of departure delays in 5-minute bins, one panel per day of the
# week. Written with ggplot() because qplot() is deprecated in current ggplot2.
ggplot(data = myflights, aes(x = DepDelay)) +
  geom_histogram(binwidth = 5) +
  # Restrict the x axis to delays between -25 and 200; values outside this
  # range are dropped from the plot, as with qplot(xlim = ...).
  xlim(-25, 200) +
  ggtitle("Departure Delays by Day of Week") +
  facet_wrap(~DayOfWeek)
I decided to work with a smaller data set, so I looked at the airlines with the most flights, and the destinations with the most flights.
# Bar chart of the number of flights per carrier. geom_bar() counts the rows
# for each UniqueCarrier value; it has no `bin` parameter (the original
# `bin=5` was ignored with a warning), so that argument is removed.
ggplot(data = myflights) +
  geom_bar(aes(x = UniqueCarrier)) +
  ggtitle("Flights by Airline")
Looks like there are a lot more destinations to work with, so I will reduce the number to the destinations with the most flights.
# Bar chart of the number of flights per destination. The base plot is kept in
# `dests`; the title and rotated axis labels are added only for display.
dests <- ggplot(myflights) +
  geom_bar(mapping = aes(x = Dest))
# Rotate the x-axis labels so the destination codes stay readable.
rotated_labels <- theme(axis.text.x = element_text(angle = -90))
dests + rotated_labels + ggtitle('Flights by Destination')
I could have used `dplyr` to get this information, but I wanted to get familiar with various ways of selecting data. Here, I use `table` to count the flights for each value in the Destination column, and turn that into a data frame. I can then subset that to destinations with more than 5000 flights.
# Count flights per destination; converting the table yields a data frame with
# the destination in Var1 and its flight count in Freq.
dest_counts <- as.data.frame(table(myflights$Dest))
# Keep only destinations with more than 5000 flights (original row names kept).
is_busy <- dest_counts$Freq > 5000
top_dests <- dest_counts[which(is_busy), , drop = FALSE]
head(top_dests)
## Var1 Freq
## 7 ATL 7886
## 8 AUS 5022
## 29 DAL 9820
## 32 DEN 5920
## 33 DFW 6653
## 60 LAX 6064
Now that I know the top destinations, I will create a dataframe with the flights information, but only for those selected cities:
# Restrict the flights to the selected destination airports. A single %in%
# membership test replaces the chain of == / | comparisons; rows with a
# missing Dest are dropped either way.
selected_dests <- c(
  "ATL", "AUS", "DAL", "DEN", "DFW", "LAX", "MSY", "ORD", "PHX"
)
most_dest <- filter(myflights, Dest %in% selected_dests)
Destinations with highest frequency of flights, range of distance covered per minute (includes taxi time, flight, delays)
Destinations with miles per minute, by carrier - who is the best performer per destination?
# Density of miles per minute for each carrier, with one row of panels per
# destination. Written with ggplot() because qplot() is deprecated in current
# ggplot2.
ggplot(
  data = most_dest,
  aes(x = mi_per_min, colour = UniqueCarrier)
) +
  geom_density() +
  facet_grid(Dest ~ .)