# Recode hflights$DayOfWeek from day numbers to day names
# (1 = "Monday" through 7 = "Sunday").
# One vectorised lookup replaces seven successive regex substitutions over the
# column. The result is still a character vector.
day_names <- c(
  "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"
)
# Position of each value among the codes 1..7; NA where the value is not one
# of them (missing, or already a day name if this code is run a second time).
day_codes <- match(hflights$DayOfWeek, seq_along(day_names))
# Values that are not a known code are kept as they are (as text), so the
# result always has one entry per row and re-running the recode is harmless.
hflights$DayOfWeek <- ifelse(
  is.na(day_codes),
  as.character(hflights$DayOfWeek),
  day_names[day_codes]
)
# Build one calendar date per flight from the Month, DayofMonth and Year
# columns. `flight_date` keeps the intermediate "month/day/year" strings;
# hflights$Date receives the parsed Date values.
flight_date <- paste(
  hflights$Month, hflights$DayofMonth, hflights$Year,
  sep = "/"
)
# The column is assigned once, already parsed. Storing the raw strings in it
# first was a dead store: the very next statement overwrote them.
hflights$Date <- as.Date(flight_date, format = "%m/%d/%Y")
# Narrow hflights to the fourteen columns named here, in this order.
# Every row is kept.
myflights <- hflights[
  ,
  c(
    "Date", "DayOfWeek", "UniqueCarrier", "FlightNum", "TailNum",
    "TaxiIn", "TaxiOut", "AirTime", "ArrDelay", "DepDelay",
    "ActualElapsedTime", "Origin", "Dest", "Distance"
  ),
  drop = FALSE
]
I created a combined field by dividing the total distance of the flight by the “Actual Elapsed Time” field. This gave me a number I could use to convey the range of time it took to fly the same distance.
# Distance divided by actual elapsed time, one value per flight, stored on
# myflights as the `mi_per_min` column.
dis_per_min <- with(myflights, Distance / ActualElapsedTime)
myflights[["mi_per_min"]] <- dis_per_min
# Preview the first rows, including the new column.
head(myflights)
## Date DayOfWeek UniqueCarrier FlightNum TailNum TaxiIn TaxiOut
## 5424 2011-01-01 Saturday AA 428 N576AA 7 13
## 5425 2011-01-02 Sunday AA 428 N557AA 6 9
## 5426 2011-01-03 Monday AA 428 N541AA 5 17
## 5427 2011-01-04 Tuesday AA 428 N403AA 9 22
## 5428 2011-01-05 Wednesday AA 428 N492AA 9 9
## 5429 2011-01-06 Thursday AA 428 N262AA 6 13
## AirTime ArrDelay DepDelay ActualElapsedTime Origin Dest Distance
## 5424 40 -10 0 60 IAH DFW 224
## 5425 45 -9 1 60 IAH DFW 224
## 5426 48 -8 -8 70 IAH DFW 224
## 5427 39 3 3 70 IAH DFW 224
## 5428 44 -3 5 62 IAH DFW 224
## 5429 45 -7 -1 64 IAH DFW 224
## mi_per_min
## 5424 3.733333
## 5425 3.733333
## 5426 3.200000
## 5427 3.200000
## 5428 3.612903
## 5429 3.500000
# Scatter plot of actual elapsed time against distance, one point per flight,
# coloured by day of the week.
# Written with ggplot() instead of qplot(): qplot() is deprecated in
# ggplot2 3.4.0, and the other bar charts in this file already use ggplot().
ggplot(
  data = myflights,
  aes(x = Distance, y = ActualElapsedTime, color = DayOfWeek)
) +
  geom_point() +
  ggtitle("Total Flight Time by Distance")
## Warning: Removed 3622 rows containing missing values (geom_point).
# Histogram of departure delays in bins 5 units wide, drawn as one panel per
# day of the week.
# Written with ggplot() instead of qplot(): qplot() is deprecated in
# ggplot2 3.4.0, and the other bar charts in this file already use ggplot().
ggplot(data = myflights, aes(x = DepDelay)) +
  geom_histogram(binwidth = 5) +
  # Scale limits, as qplot(xlim = ) applied them: delays outside -25..200 are
  # dropped before binning rather than merely hidden.
  xlim(-25, 200) +
  ggtitle("Departure Delays by Day of Week") +
  facet_wrap(~DayOfWeek)
## Warning: Removed 3681 rows containing non-finite values (stat_bin).
## Warning: Removed 14 rows containing missing values (geom_bar).
# Bar chart of the number of flights per carrier.
# geom_bar() counts the rows in each UniqueCarrier category itself and has no
# `bin` parameter, so the stray `bin = 5` argument (which only triggered an
# "Ignoring unknown parameters: bin" warning) is dropped.
ggplot(data = myflights) +
  geom_bar(aes(x = UniqueCarrier)) +
  ggtitle("Flights by Airline")
# Bar chart of the number of flights per destination. The base plot is kept
# in `dests`; the title and the rotated x-axis labels are added only when it
# is printed.
dests <- ggplot(myflights, aes(x = Dest)) +
  geom_bar()
dests +
  theme(axis.text.x = element_text(angle = -90)) +
  ggtitle("Flights by Destination")
dplyr
Here, I use table to count the flights to each value in the Destination column, and turn that into a data frame. I can then subset destinations with more than 5000 flights.
top_dests <- data.frame(table(myflights$Dest))
# Keep only the destinations whose count (Freq) is strictly above 5000, then
# preview the first rows of what is left.
top_dests <- top_dests[which(top_dests$Freq > 5000), , drop = FALSE]
head(top_dests)
## Var1 Freq
## 7 ATL 7886
## 8 AUS 5022
## 29 DAL 9820
## 32 DEN 5920
## 33 DFW 6653
## 60 LAX 6064
# Restrict myflights to flights bound for the nine airports listed here.
# NOTE(review): these look like the destinations left in `top_dests` after
# the Freq > 5000 cut; confirm before deriving the list from it.
busiest_dests <- c(
  "ATL", "AUS", "DAL", "DEN", "DFW", "LAX", "MSY", "ORD", "PHX"
)
# %in% replaces the chain of `Dest == ... |` comparisons. dplyr:: is spelled
# out so stats::filter() cannot be picked up when dplyr is not attached.
most_dest <- dplyr::filter(myflights, Dest %in% busiest_dests)
# Density curves of mi_per_min for the flights in most_dest: one curve per
# carrier (colour), one row of panels per destination.
# Written with ggplot() instead of qplot(): qplot() is deprecated in
# ggplot2 3.4.0, and the other bar charts in this file already use ggplot().
ggplot(data = most_dest, aes(x = mi_per_min, color = UniqueCarrier)) +
  geom_density() +
  facet_grid(Dest ~ .)