knitr::opts_chunk$set(echo =T,warning = F,message=F,tidy=T)

The report was originally created on 2017-02-24 with RStudio 3.3.2, using dataset from OpenFlights.org.

Part1

Load Route Mapper Database Directly from the Website)

url <- "https://raw.githubusercontent.com/jpatokal/openflights/master/data/routes.dat"
flights <- read.table(url, header = F, sep = ",")
names(flights) <- c("Airline", "Airline ID", "Source ", "Source ID", "Destination ", 
    "Destination ID", "Codeshare", "Stops", "Equipment")
head(flights)
##   Airline Airline ID Source  Source ID Destination  Destination ID
## 1      2B        410     AER      2965          KZN           2990
## 2      2B        410     ASF      2966          KZN           2990
## 3      2B        410     ASF      2966          MRV           2962
## 4      2B        410     CEK      2968          KZN           2990
## 5      2B        410     CEK      2968          OVB           4078
## 6      2B        410     DME      4029          KZN           2990
##   Codeshare Stops Equipment
## 1               0       CR2
## 2               0       CR2
## 3               0       CR2
## 4               0       CR2
## 5               0       CR2
## 6               0       CR2
str(flights)
## 'data.frame':    67663 obs. of  9 variables:
##  $ Airline       : Factor w/ 568 levels "2B","2G","2I",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ Airline ID    : Factor w/ 548 levels "\\N","10","1006",..: 382 382 382 382 382 382 382 382 382 382 ...
##  $ Source        : Factor w/ 3409 levels "AAE","AAL","AAN",..: 50 152 152 472 472 699 699 699 699 771 ...
##  $ Source ID     : Factor w/ 3321 levels "\\N","1","10",..: 1116 1117 1117 1119 1119 1755 1755 1755 1755 2517 ...
##  $ Destination   : Factor w/ 3418 levels "AAE","AAL","AAN",..: 1557 1557 1901 1557 2154 1557 1979 2785 3001 1403 ...
##  $ Destination ID: Factor w/ 3327 levels "\\N","1","10",..: 1138 1138 1115 1138 1792 1138 2868 1 2521 1106 ...
##  $ Codeshare     : Factor w/ 2 levels "","Y": 1 1 1 1 1 1 1 1 1 1 ...
##  $ Stops         : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Equipment     : Factor w/ 3946 levels ""," 73W 733 73C",..: 2919 2919 2919 2919 2919 2919 2919 2919 2919 2919 ...
## Change Levels of Factor of Codeshare with missing vlaue
level <- levels(flights$Codeshare)
library(plyr)
mapvalues(level, from = c("", "Y"), to = c(0, 1))
## [1] "0" "1"

There are 67663 observations with 9 variables, the varibale names are Airline, Airline ID, Source , Source ID, Destination , Destination ID, Codeshare, Stops, Equipment.

Part2

Visualzie and Summarize the original flights Dataset. (Jump to Part1)

library(dplyr)
library(plyr)
summary(flights)
##     Airline        Airline ID       Source         Source ID    
##  FR     : 2484   4296   : 2484   ATL    :  915   3682   :  915  
##  AA     : 2354   24     : 2354   ORD    :  558   3830   :  558  
##  UA     : 2180   5209   : 2180   PEK    :  535   3364   :  535  
##  DL     : 1981   2009   : 1981   LHR    :  527   507    :  527  
##  US     : 1960   5265   : 1960   CDG    :  524   1382   :  524  
##  CZ     : 1454   1767   : 1454   FRA    :  497   340    :  497  
##  (Other):55250   (Other):55250   (Other):64107   (Other):64107  
##   Destination    Destination ID  Codeshare     Stops          
##  ATL    :  911   3682   :  911    :53066   Min.   :0.0000000  
##  ORD    :  550   3830   :  550   Y:14597   1st Qu.:0.0000000  
##  PEK    :  534   3364   :  534             Median :0.0000000  
##  LHR    :  524   507    :  524             Mean   :0.0001626  
##  CDG    :  517   1382   :  517             3rd Qu.:0.0000000  
##  LAX    :  498   3484   :  498             Max.   :1.0000000  
##  (Other):64129   (Other):64129                                
##    Equipment    
##  320    : 9180  
##  738    : 7124  
##  319    : 3420  
##  737    : 2211  
##  73H    : 2043  
##  CRJ    : 1238  
##  (Other):42447
(flights <- tbl_df(flights))
## # A tibble: 67,663 x 9
##    Airline `Airline ID` `Source ` `Source ID` `Destination `
##     <fctr>       <fctr>    <fctr>      <fctr>         <fctr>
##  1      2B          410       AER        2965            KZN
##  2      2B          410       ASF        2966            KZN
##  3      2B          410       ASF        2966            MRV
##  4      2B          410       CEK        2968            KZN
##  5      2B          410       CEK        2968            OVB
##  6      2B          410       DME        4029            KZN
##  7      2B          410       DME        4029            NBC
##  8      2B          410       DME        4029            TGK
##  9      2B          410       DME        4029            UUA
## 10      2B          410       EGO        6156            KGD
## # ... with 67,653 more rows, and 4 more variables: `Destination
## #   ID` <fctr>, Codeshare <fctr>, Stops <int>, Equipment <fctr>

Part3

Creat a subset that includes only American Airlines flights (Jump to Part2)

library(dplyr)
library(tidyr)

### Select American Airline Flights
aa <- flights %>% filter(Airline == "AA")
head(aa, 20)
## # A tibble: 20 x 9
##    Airline `Airline ID` `Source ` `Source ID` `Destination `
##     <fctr>       <fctr>    <fctr>      <fctr>         <fctr>
##  1      AA           24       ABE        4355            CLT
##  2      AA           24       ABE        4355            PHL
##  3      AA           24       ABI        3718            DFW
##  4      AA           24       ABQ        4019            DFW
##  5      AA           24       ABQ        4019            LAX
##  6      AA           24       ABQ        4019            ORD
##  7      AA           24       ABQ        4019            PHX
##  8      AA           24       ABZ         532            LHR
##  9      AA           24       ACC         248            LHR
## 10      AA           24       ACT        3700            DFW
## 11      AA           24       ADL        3341            BNE
## 12      AA           24       ADL        3341            SYD
## 13      AA           24       AEX        3852            DFW
## 14      AA           24       AGP        1230            LHR
## 15      AA           24       AGS        3658            CLT
## 16      AA           24       AGS        3658            DCA
## 17      AA           24       AGU        1785            DFW
## 18      AA           24       AKL        2006            BNE
## 19      AA           24       AKL        2006            PPT
## 20      AA           24       AKL        2006            SYD
## # ... with 4 more variables: `Destination ID` <fctr>, Codeshare <fctr>,
## #   Stops <int>, Equipment <fctr>

Part4

Show The Number of Fleet Types in American Airlines (Jump to Part3)

### Count AA airlines fleet type number
aa_fleet <- aa %>% group_by(Equipment) %>% tally(sort = T)
head(aa_fleet, 10)
## # A tibble: 10 x 2
##    Equipment     n
##       <fctr> <int>
##  1       738   209
##  2       737   146
##  3       CRJ   144
##  4       319   109
##  5       ER4    94
##  6       763    76
##  7       777    76
##  8       757    73
##  9   ER4 ERD    68
## 10       DH3    61

Quick Summary of AA Fleet:
There are in total 292 operating fleets, with the top 3 most popular fleets are 738,737 and CRJ

Part5

Creat a subset that only includes the top 5 most popular fleet type (Jump to Part4)

library(ggplot2)
library(ggvis)
## List top 5 fleet type head(summary(aa$Equipment),5)

## Select Top 5 Most Popular Fleet Type could use %in% in filter
aa_sub <- filter(aa, Equipment %in% c("ER4", 737, 738, "CRJ", 319))

### Plot
g <- ggplot(as.data.frame(aa_sub), aes(x = Equipment))
g <- g + geom_bar(aes(fill = ..count..)) + scale_fill_gradient("Count", low = "blue", 
    high = "red")
g <- g + scale_x_discrete(labels = c("ER4", 737, 738, "CRJ", 319)) + ggtitle("       Top 5 Most Popular Fleets in American Airlines")
g

## Further breakdwon of flights by Codeshare
(g1 <- g + facet_grid(. ~ aa_sub$Codeshare) + ggtitle("Breakdown of Top 5 Fleet by Codeshare Agreement(Y)"))

## Investigate Destiantion of these 5 most popular fleet type
## head(summary(aa_sub$`Destination `))

Part6

Visualize Fleet Type Group by Destination in AA Airline (Jump to Part5)

aa_dest <- filter(aa_sub, aa_sub$`Destination ` %in% c("MIA", "DFW", "DCA", 
    "LAX", "ORD", "CLT", "JFK", "SEA", "PHL"))
g2 <- ggplot(aa_dest, aes(x = aa_dest$Equipment))
g2 <- g2 + stat_count(geom = "bar", aes(fill = ..count..), position = "stack")
g2 <- g2 + facet_grid(. ~ aa_dest$`Destination `) + xlab("Fleet Type") + ggtitle("          AA Airline Fleet Type by Destination")
g2