NOTE:
SOURCE CODE is the source code aera, which you can skip if you stick more to my findings in BOLD.This report contains three basic topics:
The dataset with selected columns was downloaded from http://www.transtats.bts.gov/DL_SelectFields.asp?Table_ID=236&DB_Short_Name=On-Time, under the assignment of Sportify Data Scientist position application, filtered by the interval of December of 2015.
The following table demonstrates the first twenty entries of the dataset:
Flights <- read.csv("C:/Users/weibo/Desktop/Data Scientist at Spotify/807104048_T_ONTIME.csv", stringsAsFactors=FALSE);
library(DT);
datatable(Flights[1:20,], options = list(pageLength = 5), fillContainer=T)
The dataset has 479230 rows and 23 columns.
Let’s focus on the flight data, especially, the CRS_DEP_TIME, ARR_TIME, ACTUAL_ELAPSED_TIME, DISTANCE and FL_DATE columns:
Flights by Departure and Arrival Time
library(htmlwidgets);
library(highcharter);
library(plyr);
library(dplyr);
options(digits=4);
FlightCountDep=count(Flights, CRS_DEP_TIME);
MorAftNigDep=data.frame(Period=c("Morning","Afternoon","Night"),
FlightCount=c(nrow(subset(Flights, CRS_DEP_TIME>600 & CRS_DEP_TIME<=1200)),
nrow(subset(Flights, CRS_DEP_TIME>1200 & CRS_DEP_TIME<=1800)),
nrow(subset(Flights, (CRS_DEP_TIME>1800 & CRS_DEP_TIME<=2359) | (CRS_DEP_TIME>=0 & CRS_DEP_TIME<=600)))));
highchart() %>%
hc_title(text = "US Flight Count by Departure Time in Dec, 2015") %>%
hc_add_series_labels_values(FlightCountDep$CRS_DEP_TIME, FlightCountDep$n, name = "Flight Count By Departure Time",
colorByPoint = F, type = "column") %>%
hc_add_series_labels_values(MorAftNigDep$Period, MorAftNigDep$FlightCount/nrow(Flights), type = "pie",
name = "Depature Ratio", colorByPoint = TRUE, center = c('75%', '25%'),
size = 200, dataLabels = list(enabled = FALSE)) %>%
hc_yAxis(title = list(text = "Flight Count by Departure Time")) %>%
hc_xAxis(categories = FlightCountDep$CRS_DEP_TIME) %>%
hc_tooltip(valueDecimals=4)%>%
hc_legend(enabled = FALSE)
This figure shows that:
MorAftNigArr=data.frame(Period=c("Morning","Afternoon","Night"),
FlightCount=c(nrow(subset(Flights, ARR_TIME>600 & ARR_TIME<=1200)),
nrow(subset(Flights, ARR_TIME>1200 & ARR_TIME<=1800)),
nrow(subset(Flights, (ARR_TIME>1800 & ARR_TIME<=2359) | (ARR_TIME>=0 & ARR_TIME<=600)))));
FlightCountArr=count(Flights, ARR_TIME);
highchart() %>%
hc_title(text = "US Flight Count by Arrival Time in Dec, 2015") %>%
hc_add_series_labels_values(FlightCountArr$ARR_TIME, FlightCountArr$n, name = "Flight Count By Arrival Time",
colorByPoint = F, type = "column") %>%
hc_add_series_labels_values(MorAftNigArr$Period, MorAftNigArr$FlightCount/nrow(Flights), type = "pie",
name = "Arrival Ratio", colorByPoint = TRUE, center = c('75%', '25%'),
size = 200, dataLabels = list(enabled = FALSE)) %>%
hc_yAxis(title = list(text = "Flight Count by Arrival Time")) %>%
hc_xAxis(categories = FlightCountArr$ARR_TIME) %>%
hc_tooltip(valueDecimals=4)%>%
hc_legend(enabled = FALSE)
This figure shows that:
Flights By Date
FlightCountDate=count(Flights, FL_DATE);
FlightCountDate=FlightCountDate[order(as.Date(FlightCountDate$FL_DATE)),];
highchart() %>%
hc_title(text='Flights Count By Date in Dec, 2015')%>%
hc_xAxis(categories=FlightCountDate$FL_DATE) %>%
hc_add_series(data=FlightCountDate$n, type='column', colorByPoint=F, name="Flight Count By Date")
This figure shows that
Flights By Distance
FlightCountDis=count(Flights, DISTANCE);
DistCount=data.frame(Distance=c("Shorter than 500 km","Between 500 and 1000 km", "Between 1000 and 1500 km", "Between 1500 and 2000 km", "Between 2000 and 2500 km", "Between 2500 and 3000 km","Longer than 3000 km"),
FlightCount=c(nrow(subset(Flights, DISTANCE<=500)),
nrow(subset(Flights, DISTANCE>500 & DISTANCE<=1000)),
nrow(subset(Flights, DISTANCE>1000 & DISTANCE<=1500)),
nrow(subset(Flights, DISTANCE>1500 & DISTANCE<=2000)),
nrow(subset(Flights, DISTANCE>2000 & DISTANCE<=2500)),
nrow(subset(Flights, DISTANCE>2500 & DISTANCE<=3000)),
nrow(subset(Flights, DISTANCE>3000))));
highchart() %>%
hc_title(text='US Flight Distance Distribution in Dec, 2015')%>%
hc_xAxis(categories=FlightCountDis$DISTANCE) %>%
hc_add_series(data=FlightCountDis$n, type='column',colorByPoint = F, name="Flight Count By Distance") %>%
hc_add_series_labels_values(DistCount$Distance, DistCount$FlightCount/nrow(Flights), type = "pie",
name = "Distance Ratio", colorByPoint = TRUE, center = c('75%', '25%'),
size = 200, dataLabels = list(enabled = FALSE))%>%
hc_tooltip(valueDecimals=4)
This figure shows that :
Flights By Flying Time
FlightCountEla=count(Flights, ACTUAL_ELAPSED_TIME);
FlightCountEla=na.omit(FlightCountEla);
ElpTime=data.frame(Elaped=c("Less than 1 hour","Between 1 and 2 hours", "Between 2 and 3 hours","Between 3 and 4 hours","Between 4 and 5 hours", "More than 5 hours"),
FlightCount=c(nrow(subset(Flights, ACTUAL_ELAPSED_TIME<=60)),
nrow(subset(Flights, ACTUAL_ELAPSED_TIME>60 & ACTUAL_ELAPSED_TIME<=120)),
nrow(subset(Flights, ACTUAL_ELAPSED_TIME>120 & ACTUAL_ELAPSED_TIME<=180)),
nrow(subset(Flights, ACTUAL_ELAPSED_TIME>180 & ACTUAL_ELAPSED_TIME<=240)),
nrow(subset(Flights, ACTUAL_ELAPSED_TIME>240 & ACTUAL_ELAPSED_TIME<=300)),
nrow(subset(Flights, ACTUAL_ELAPSED_TIME>300))));
highchart() %>%
hc_title(text='US Flight Count by Time Elapsed in Dec, 2015')%>%
hc_xAxis(categories=FlightCountEla$ACTUAL_ELAPSED_TIME) %>%
hc_add_series(data=FlightCountEla$n, type='column', colorByPoint = F, name="Flight Count By Flying Time") %>%
hc_add_series_labels_values(ElpTime$Elaped, ElpTime$FlightCount/nrow(Flights), type = "pie",
name = "Elapsing Time Ratio", colorByPoint = TRUE, center = c('75%', '25%'),
size = 200, dataLabels = list(enabled = FALSE))%>%
hc_tooltip(valueDecimals=4)
This figure shows that:
Thare are 13 airlines providing services in Dec, 2015.
Operating Carriers
CarrierCount=count(Flights, UNIQUE_CARRIER);
carriers <- read.csv("C:/Users/weibo/Desktop/Data Scientist at Spotify/carriers.csv", header=FALSE, stringsAsFactors=FALSE);
CarrierCountFN=merge(CarrierCount, carriers, by.x='UNIQUE_CARRIER', by.y='V1');
FlightsFN=merge(Flights, carriers, by.x='UNIQUE_CARRIER', by.y='V1');
DisCarrier=ddply(FlightsFN, .(V2), summarise, sumDis=sum(DISTANCE));
highchart() %>%
hc_title(text='US Flight Distance By Carriers in Dec, 2015')%>%
hc_xAxis(categories=FlightsFN$V2) %>%
hc_add_series_boxplot(x=FlightsFN$DISTANCE, by=FlightsFN$V2, colorByPoint = T, name="Flight Distance By Carriers", outliers=F)%>%
hc_add_series_labels_values(DisCarrier$V2, DisCarrier$sumDis, type='pie', name = "Total Distance in km",
colorByPoint = TRUE, center = c('22%', '25%'), size = 150, dataLabels = list(enabled = FALSE))
This figure (outliers in the boxplot are actually skipped) reveals the diversity of each airline in terms of the flight distance. It can be seen that:
How busy are these carriers? Let’s find out the flights of each day:
FlightsDaily=count(FlightsFN, V2, date=FL_DATE);
FlightsDaily$date=as.Date(FlightsDaily$date);
highchart() %>%
hc_title(text='Daily US Flight Count of Each Carrier In Dec, 2015') %>%
hc_xAxis(categories=FlightsDaily$date) %>%
hc_add_series_df(FlightsDaily, y=n, type="line", group=V2) %>%
hc_legend(layout = "vertical", align = "right")%>%
hc_add_series_labels_values(CarrierCountFN$V2, CarrierCountFN$n, type = "pie", name = "Flight Count in Dec, 2015",
center = c('125%', '25%'), size = 200, dataLabels = list(enabled = FALSE))
We can conclude that:
Distinctive Flights
Let’s see how many distinctive flight routes from all airlines. We assume that the flights operated by the same carrier, with the identical flight number, from the same origin city to the same destination, are regarded as the same flight. (This may violate the situtation that two carriers share the FL_NUM. Practically, two flights of this kind are the same flight. But under our assumption, they are two flights.)
Some flights are operated daily, but others weekly. The flight frequency is as shown in the following figure.
DistinctFlight=count(Flights, UNIQUE_CARRIER, FL_NUM, ORIGIN_CITY_NAME, DEST_CITY_NAME);
FlightsByCarriers=count(DistinctFlight[1], UNIQUE_CARRIER);
DistinctFlightCount=count(DistinctFlight[5], n);
highchart() %>%
hc_title(text='Distinctive US Flight Count Distribution in Dec, 2015')%>%
hc_add_series_labels_values(DistinctFlightCount$n, DistinctFlightCount$nn, name = "Flight Count",
colorByPoint =F, type = "column") %>%
hc_add_series_labels_values(FlightsByCarriers$UNIQUE_CARRIER, FlightsByCarriers$n, type = "pie",
name = "Distinctive Flights Count", colorByPoint = TRUE, center = c('75%', '25%'),
size = 200, dataLabels = list(enabled = FALSE))
There are 44416 distinctive flights operated in Dec, 2015, of which:
Cancellation Rate
FlightsCancelled=ddply(FlightsFN, .(V2,FL_DATE), summarise,cancelledNo=sum(CANCELLED));
FlightsCancelled$FL_DATE=as.Date(FlightsCancelled$FL_DATE);
FlightsCancelledData=merge(FlightsCancelled, FlightsDaily, by.x=c('V2', 'FL_DATE'), by.y=c('V2','date'));
FlightsCancelledData=mutate(FlightsCancelledData, CancelRate=cancelledNo/n);
CancelContribution=ddply(FlightsCancelledData, .(V2), summarise,EachSum=sum(cancelledNo), Total=sum(n));
highchart() %>%
hc_xAxis(categories=FlightsCancelledData$FL_DATE) %>%
hc_add_series_df(data=FlightsCancelledData, y=CancelRate, type="line", group = V2) %>%
hc_add_series_labels_values(CancelContribution$V2, CancelContribution$EachSum, type = "pie",
name = "Cancelled Flights Contribution", center = c('125%', '25%'),
size = 200, dataLabels = list(enabled = FALSE)) %>%
hc_legend(layout = "vertical", align = "right") %>%
hc_title(text='Flight Cancellation of Each Carrier in Dec, 2015')
From this figure, we can see:
Inter-City Conveniency
To see which combination of two cities are provided with most flights, we find out all cities linked by distinctive flights.
CityFlight=count(DistinctFlight[3:4], ORIGIN_CITY_NAME, DEST_CITY_NAME);
There are 3803 city combinations served by all flights. These flights cover 303 cities as origins and 303 cities as destinations. Their airports can be located as below:
library(httr);
library(geojsonio);
map <- "https://raw.githubusercontent.com/johan/world.geo.json/master/countries/USA.geo.json" %>%
GET() %>%
content() %>%
jsonlite::fromJSON(simplifyVector = FALSE);
airports <- read.csv("C:/Users/weibo/Desktop/Data Scientist at Spotify/airports.csv", stringsAsFactors=FALSE);
OriCity=count(Flights, ORIGIN);
OriCity=merge(OriCity, airports[c(1,6,7)], by.x='ORIGIN', by.y='iata');
OriCity=unique(merge(OriCity,Flights[8:9], by='ORIGIN'));
airpjsonOri <- geojson_json(OriCity, lat = "lat", lon = "long");
highchart(type='map') %>%
hc_title(text = "Airports Scattered in US") %>%
hc_add_series(mapData = map, showInLegend = F, nullColor = "#A9CF54", borderColor="#A9CF54") %>%
hc_add_series(data=airpjsonOri, type="mappoint", dataLabels = list(enabled = F), name = "Airports", color = '#4F4F4F',
tooltip = list(pointFormat = "{point.properties.ORIGIN_CITY_NAME}:{point.properties.n} depature flights")) %>%
hc_mapNavigation(enabled = T)
These 307 airports scatter in the 303 cities of mainland, Alaska, Hawaii and oversea territories, including Guam, Virgin Islands and Puerto Rico.
So obviously, some cities has more than one airports. See this table:
airpC=count(FlightsFN, city=ORIGIN_CITY_NAME, airport=ORIGIN);
datatable(airpC, options = list(pageLength = 5))
From this table, we can find that:
Transfering Capability
We can explore the departure and arrival flights for each airport. And also, D/A imbalance rate can be defined as the result of departure flight count devided by arrival flight count.
OriCity=count(Flights, ORIGIN);
OriCity=unique(merge(OriCity,Flights[8:9], by='ORIGIN'));
DesCity=count(Flights, DEST);
DesCity=unique(merge(DesCity,Flights[13:14], by='DEST'));
EndCity=merge(OriCity,DesCity, by.x='ORIGIN', by.y='DEST');
highchart() %>%
hc_xAxis(categories=EndCity$ORIGIN)%>%
hc_yAxis_multiples(
list(title = list(text = "D/A Imbalance Rate"), align = "right", showFirstLabel = FALSE),
list(title = list(text = "Distinctive Flight Count"), align = "left",showFirstLabel = FALSE,opposite = TRUE)) %>%
hc_add_series(data=EndCity$n.x, colorByPoint = F, name = "Departure Flights", type='column', yAxis=1)%>%
hc_add_series(data=EndCity$n.y, colorByPoint = F, name = "Arrival Flights", type='column', yAxis=1) %>%
hc_add_series(data=EndCity$n.x/EndCity$n.y, name = "Imbalance Rate", type='line') %>%
hc_title(text='Distinctive Flights Count and Imbalance Rate for Each US Airport')%>%
hc_tooltip(valueDecimals=4)
As shown above:
Daily Flights of Each Airport
library(viridisLite);
AirpOriDate=count(Flights, FL_DATE, ORIGIN);
AirpDesDate=count(Flights, FL_DATE, DEST);
AirpDate=merge(AirpOriDate, AirpDesDate, by.x=c('ORIGIN', 'FL_DATE'), by.y=c('DEST', 'FL_DATE'));
AirpDate=mutate(AirpDate, throughput=n.x+n.y);
stpscol <- color_stops(10, viridis(20));
fntltp <- JS("function(){return this.point.ORIGIN + ' in ' + this.series.yAxis.categories[this.point.y] + ':<br>' + Highcharts.numberFormat(this.point.value, 0) + ' flights';}");
hchart(AirpDate, "heatmap", y = FL_DATE, x = ORIGIN, value = throughput, name='Airport Flight Throughput') %>%
hc_colorAxis(stops = stpscol, type = "logarithmic") %>%
hc_title(text = "Daily Throughput of Each Airport in Dec, 2015") %>%
hc_tooltip(formatter = fntltp) %>%
hc_legend(layout = "vertical", verticalAlign = "bottom", align = "right")
This figure gives the holistic idea about how many flights are operated by each airport everyday.
We can construct the adjacency matrix for each pair of two cities. It shoud be a 303 * 303 matrix. Hence in the matrix, 0 value indicates no flight from ORIGIN to DEST, and 1 value for at least one flight. Due to the flight route is directed, the matrix is not necessarily symmetric.
CityList=intersect(CityFlight$ORIGIN_CITY_NAME,CityFlight$DEST_CITY_NAME);
AdjMatrix=matrix(0, nrow=length(unique(CityFlight$ORIGIN_CITY_NAME)), ncol=length(unique(CityFlight$DEST_CITY_NAME)));
colnames(AdjMatrix)=c(CityList);
rownames(AdjMatrix)=c(CityList);
for (i in 1 : nrow(AdjMatrix)){
for (j in 1: ncol(AdjMatrix)){
if (is.element(CityList[j], CityFlight$DEST_CITY_NAME[CityFlight[,1]==CityList[i]])==T) {AdjMatrix[i, j]=1}
else {AdjMatrix[i, j]=0}}};
fntltp <- JS("function(){return 'DEST: ' + this.point.name + ' as ORIGIN: ' + this.point.value}");
hchart(AdjMatrix) %>%
hc_legend(enabled=F) %>%
hc_tooltip(formatter=fntltp) %>%
hc_title(text='Connectedness of Two US Cities by Flight')
This data presents the possibility of travelling between two US cities with no-transfer flights. As shown above, existing flights covers 0.0414 of all no-stop flight requirements. If one transfer is allowed, let’s see how many cities will be connected. To do this, we employ the reachability theory.
AdjMatrix1Trans=AdjMatrix%*%AdjMatrix;
AdjMatrix1Transfer=AdjMatrix+AdjMatrix1Trans;
BooleanM=function(x){
for (i in 1 : nrow(x)){
for (j in 1 : ncol(x)){
if (x[i,j]>0){x[i,j]=1}
else {x[i,j]=x[i,j]}}}
return(x)};
AdjMatrix1Transfer=BooleanM(AdjMatrix1Transfer);
hchart(AdjMatrix1Transfer) %>%
hc_legend(enabled=F) %>%
hc_tooltip(formatter=fntltp) %>%
hc_title(text='Connectedness of Two US Cities by One-Transfer Flight')
These flights cover 0.6 of all flights requirements between two arbitrary cities with at most one transfer. Let’s make two transfers.
AdjMatrix2Trans=AdjMatrix1Trans%*%AdjMatrix;
AdjMatrix2Transfer=AdjMatrix1Transfer+AdjMatrix2Trans;
AdjMatrix2Transfer=BooleanM(AdjMatrix2Transfer);
hchart(AdjMatrix2Transfer) %>%
hc_legend(enabled=F) %>%
hc_tooltip(formatter=fntltp) %>%
hc_title(text='Connectedness of Two US Cities by Two-Transfer Flight')
88211 fligts with at most two transfers are provided. 0.9608 of all requirements are met. There are still some cities out of flight reach. Three transfers may make all cities connected in dual directions.
AdjMatrix3Trans=AdjMatrix2Trans%*%AdjMatrix;
AdjMatrix3Transfer=AdjMatrix2Transfer+AdjMatrix3Trans;
AdjMatrix3Transfer=BooleanM(AdjMatrix3Transfer);
hchart(AdjMatrix3Transfer) %>%
hc_legend(enabled=F) %>%
hc_tooltip(formatter=fntltp) %>%
hc_title(text='Connectedness of Two Cities by Three-Transfer Flight')
0.9997 of all requirements are met. Four-transfer flight are for these black naughty combinations.
AdjMatrix4Trans=AdjMatrix3Trans%*%AdjMatrix;
AdjMatrix4Transfer=AdjMatrix3Transfer+AdjMatrix4Trans;
AdjMatrix4Transfer=BooleanM(AdjMatrix4Transfer);
The new matrix provides 91809 flights which covers all flights requirements between two cities. The figure has been omitted. Finally,
Fewest-Transfer Route
We can also find the fewest-transfer flight route for any two cities by applying Dijkstra Algorithm on the adjacency matrix.
library(edgebundleR);
library(igraph);
edgeVector=NULL;
for(i in 1 : nrow(CityFlight)){edgeVector=c(edgeVector, CityFlight[i,1:2])};
CityLink=make_graph(unlist(edgeVector), directed = T);
TransferCount=shortest.paths(CityLink, V(CityLink),V(CityLink), algorithm='dijkstra')
From TransferCount matrix, we can quickly find out the fewest transfer times between two cities. Furthermore, we can use get.shortest.paths to obtain one shortest path of two cities. For example,
get.shortest.paths(CityLink, "Hattiesburg/Laurel, MS", "Wrangell, AK")$vpath
## [[1]]
## + 5/303 vertices, named:
## [1] Hattiesburg/Laurel, MS Dallas/Fort Worth, TX Seattle, WA
## [4] Ketchikan, AK Wrangell, AK
presented by Bo Wei, and powered by R and htmlwidgets.