knitr::opts_chunk$set(echo =T,warning = F,message=F,tidy=T)
The report was originally created on 2017-02-24 with RStudio 3.3.2, using dataset from OpenFlights.org.
Load Route Mapper Database Directly from the Website)
url <- "https://raw.githubusercontent.com/jpatokal/openflights/master/data/routes.dat"
flights <- read.table(url, header = F, sep = ",")
names(flights) <- c("Airline", "Airline ID", "Source ", "Source ID", "Destination ",
"Destination ID", "Codeshare", "Stops", "Equipment")
head(flights)
## Airline Airline ID Source Source ID Destination Destination ID
## 1 2B 410 AER 2965 KZN 2990
## 2 2B 410 ASF 2966 KZN 2990
## 3 2B 410 ASF 2966 MRV 2962
## 4 2B 410 CEK 2968 KZN 2990
## 5 2B 410 CEK 2968 OVB 4078
## 6 2B 410 DME 4029 KZN 2990
## Codeshare Stops Equipment
## 1 0 CR2
## 2 0 CR2
## 3 0 CR2
## 4 0 CR2
## 5 0 CR2
## 6 0 CR2
str(flights)
## 'data.frame': 67663 obs. of 9 variables:
## $ Airline : Factor w/ 568 levels "2B","2G","2I",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ Airline ID : Factor w/ 548 levels "\\N","10","1006",..: 382 382 382 382 382 382 382 382 382 382 ...
## $ Source : Factor w/ 3409 levels "AAE","AAL","AAN",..: 50 152 152 472 472 699 699 699 699 771 ...
## $ Source ID : Factor w/ 3321 levels "\\N","1","10",..: 1116 1117 1117 1119 1119 1755 1755 1755 1755 2517 ...
## $ Destination : Factor w/ 3418 levels "AAE","AAL","AAN",..: 1557 1557 1901 1557 2154 1557 1979 2785 3001 1403 ...
## $ Destination ID: Factor w/ 3327 levels "\\N","1","10",..: 1138 1138 1115 1138 1792 1138 2868 1 2521 1106 ...
## $ Codeshare : Factor w/ 2 levels "","Y": 1 1 1 1 1 1 1 1 1 1 ...
## $ Stops : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Equipment : Factor w/ 3946 levels ""," 73W 733 73C",..: 2919 2919 2919 2919 2919 2919 2919 2919 2919 2919 ...
## Change Levels of Factor of Codeshare with missing vlaue
level <- levels(flights$Codeshare)
library(plyr)
mapvalues(level, from = c("", "Y"), to = c(0, 1))
## [1] "0" "1"
There are 67663 observations with 9 variables, the varibale names are Airline, Airline ID, Source , Source ID, Destination , Destination ID, Codeshare, Stops, Equipment.
Visualzie and Summarize the original flights Dataset. (Jump to Part1)
library(dplyr)
library(plyr)
summary(flights)
## Airline Airline ID Source Source ID
## FR : 2484 4296 : 2484 ATL : 915 3682 : 915
## AA : 2354 24 : 2354 ORD : 558 3830 : 558
## UA : 2180 5209 : 2180 PEK : 535 3364 : 535
## DL : 1981 2009 : 1981 LHR : 527 507 : 527
## US : 1960 5265 : 1960 CDG : 524 1382 : 524
## CZ : 1454 1767 : 1454 FRA : 497 340 : 497
## (Other):55250 (Other):55250 (Other):64107 (Other):64107
## Destination Destination ID Codeshare Stops
## ATL : 911 3682 : 911 :53066 Min. :0.0000000
## ORD : 550 3830 : 550 Y:14597 1st Qu.:0.0000000
## PEK : 534 3364 : 534 Median :0.0000000
## LHR : 524 507 : 524 Mean :0.0001626
## CDG : 517 1382 : 517 3rd Qu.:0.0000000
## LAX : 498 3484 : 498 Max. :1.0000000
## (Other):64129 (Other):64129
## Equipment
## 320 : 9180
## 738 : 7124
## 319 : 3420
## 737 : 2211
## 73H : 2043
## CRJ : 1238
## (Other):42447
(flights <- tbl_df(flights))
## # A tibble: 67,663 x 9
## Airline `Airline ID` `Source ` `Source ID` `Destination `
## <fctr> <fctr> <fctr> <fctr> <fctr>
## 1 2B 410 AER 2965 KZN
## 2 2B 410 ASF 2966 KZN
## 3 2B 410 ASF 2966 MRV
## 4 2B 410 CEK 2968 KZN
## 5 2B 410 CEK 2968 OVB
## 6 2B 410 DME 4029 KZN
## 7 2B 410 DME 4029 NBC
## 8 2B 410 DME 4029 TGK
## 9 2B 410 DME 4029 UUA
## 10 2B 410 EGO 6156 KGD
## # ... with 67,653 more rows, and 4 more variables: `Destination
## # ID` <fctr>, Codeshare <fctr>, Stops <int>, Equipment <fctr>
Creat a subset that includes only American Airlines flights (Jump to Part2)
library(dplyr)
library(tidyr)
### Select American Airline Flights
aa <- flights %>% filter(Airline == "AA")
head(aa, 20)
## # A tibble: 20 x 9
## Airline `Airline ID` `Source ` `Source ID` `Destination `
## <fctr> <fctr> <fctr> <fctr> <fctr>
## 1 AA 24 ABE 4355 CLT
## 2 AA 24 ABE 4355 PHL
## 3 AA 24 ABI 3718 DFW
## 4 AA 24 ABQ 4019 DFW
## 5 AA 24 ABQ 4019 LAX
## 6 AA 24 ABQ 4019 ORD
## 7 AA 24 ABQ 4019 PHX
## 8 AA 24 ABZ 532 LHR
## 9 AA 24 ACC 248 LHR
## 10 AA 24 ACT 3700 DFW
## 11 AA 24 ADL 3341 BNE
## 12 AA 24 ADL 3341 SYD
## 13 AA 24 AEX 3852 DFW
## 14 AA 24 AGP 1230 LHR
## 15 AA 24 AGS 3658 CLT
## 16 AA 24 AGS 3658 DCA
## 17 AA 24 AGU 1785 DFW
## 18 AA 24 AKL 2006 BNE
## 19 AA 24 AKL 2006 PPT
## 20 AA 24 AKL 2006 SYD
## # ... with 4 more variables: `Destination ID` <fctr>, Codeshare <fctr>,
## # Stops <int>, Equipment <fctr>
Show The Number of Fleet Types in American Airlines (Jump to Part3)
### Count AA airlines fleet type number
aa_fleet <- aa %>% group_by(Equipment) %>% tally(sort = T)
head(aa_fleet, 10)
## # A tibble: 10 x 2
## Equipment n
## <fctr> <int>
## 1 738 209
## 2 737 146
## 3 CRJ 144
## 4 319 109
## 5 ER4 94
## 6 763 76
## 7 777 76
## 8 757 73
## 9 ER4 ERD 68
## 10 DH3 61
Quick Summary of AA Fleet:
There are in total 292 operating fleets, with the top 3 most popular fleets are 738,737 and CRJ
Creat a subset that only includes the top 5 most popular fleet type (Jump to Part4)
library(ggplot2)
library(ggvis)
## List top 5 fleet type head(summary(aa$Equipment),5)
## Select Top 5 Most Popular Fleet Type could use %in% in filter
aa_sub <- filter(aa, Equipment %in% c("ER4", 737, 738, "CRJ", 319))
### Plot
g <- ggplot(as.data.frame(aa_sub), aes(x = Equipment))
g <- g + geom_bar(aes(fill = ..count..)) + scale_fill_gradient("Count", low = "blue",
high = "red")
g <- g + scale_x_discrete(labels = c("ER4", 737, 738, "CRJ", 319)) + ggtitle(" Top 5 Most Popular Fleets in American Airlines")
g
## Further breakdwon of flights by Codeshare
(g1 <- g + facet_grid(. ~ aa_sub$Codeshare) + ggtitle("Breakdown of Top 5 Fleet by Codeshare Agreement(Y)"))
## Investigate Destiantion of these 5 most popular fleet type
## head(summary(aa_sub$`Destination `))
Visualize Fleet Type Group by Destination in AA Airline (Jump to Part5)
aa_dest <- filter(aa_sub, aa_sub$`Destination ` %in% c("MIA", "DFW", "DCA",
"LAX", "ORD", "CLT", "JFK", "SEA", "PHL"))
g2 <- ggplot(aa_dest, aes(x = aa_dest$Equipment))
g2 <- g2 + stat_count(geom = "bar", aes(fill = ..count..), position = "stack")
g2 <- g2 + facet_grid(. ~ aa_dest$`Destination `) + xlab("Fleet Type") + ggtitle(" AA Airline Fleet Type by Destination")
g2