NOTE:
SOURCE CODE is the source code aera, which you can skip if you stick more to my findings in red font.This report contains three basic topics:
The dataset with selected columns was downloaded from http://www.transtats.bts.gov/DL_SelectFields.asp?Table_ID=236&DB_Short_Name=On-Time, under the assignment of Sportify Data Scientist position application, filtered by the interval of December of 2015.
The following table demonstrates the first twenty entries of the dataset:
Flights <- read.csv("C:/Users/weibo/Desktop/Data Scientist at Spotify/807104048_T_ONTIME.csv", stringsAsFactors=FALSE)
library(DT);
datatable(Flights[1:20,], options = list(pageLength = 5), fillContainer=T)
The dataset has 479230 rows and 23 columns.
Let’s focus on the flight data, especially, the CRS_DEP_TIME, ARR_TIME, ACTUAL_ELAPSED_TIME, DISTANCE and FL_DATE columns:
Flights by Departure and Arrival Time
library(htmlwidgets);
library(highcharter);
library(plyr);
library(dplyr);
options(digits=4);
FlightCountDep=count(Flights, CRS_DEP_TIME);
MorAftNigDep=data.frame(Period=c("Morning","Afternoon","Night"),
FlightCount=c(nrow(subset(Flights, CRS_DEP_TIME>600 & CRS_DEP_TIME<=1200)),
nrow(subset(Flights, CRS_DEP_TIME>1200 & CRS_DEP_TIME<=1800)),
nrow(subset(Flights, (CRS_DEP_TIME>1800 & CRS_DEP_TIME<=2359) | (CRS_DEP_TIME>=0 & CRS_DEP_TIME<=600)))));
highchart() %>%
hc_title(text = "US Flight Count by Departure Time in Dec, 2015") %>%
hc_add_series_labels_values(FlightCountDep$CRS_DEP_TIME, FlightCountDep$n, name = "Flight Count By Departure Time",
colorByPoint = F, type = "column") %>%
hc_add_series_labels_values(MorAftNigDep$Period, MorAftNigDep$FlightCount/nrow(Flights), type = "pie",
name = "Depature Ratio", colorByPoint = TRUE, center = c('75%', '25%'),
size = 200, dataLabels = list(enabled = FALSE)) %>%
hc_yAxis(title = list(text = "Flight Count by Departure Time")) %>%
hc_xAxis(categories = FlightCountDep$CRS_DEP_TIME) %>%
hc_tooltip(valueDecimals=4)%>%
hc_legend(enabled = FALSE)
This figure shows that:
MorAftNigArr=data.frame(Period=c("Morning","Afternoon","Night"),
FlightCount=c(nrow(subset(Flights, ARR_TIME>600 & ARR_TIME<=1200)),
nrow(subset(Flights, ARR_TIME>1200 & ARR_TIME<=1800)),
nrow(subset(Flights, (ARR_TIME>1800 & ARR_TIME<=2359) | (ARR_TIME>=0 & ARR_TIME<=600)))));
FlightCountArr=count(Flights, ARR_TIME);
highchart() %>%
hc_title(text = "US Flight Count by Arrival Time in Dec, 2015") %>%
hc_add_series_labels_values(FlightCountArr$ARR_TIME, FlightCountArr$n, name = "Flight Count By Arrival Time",
colorByPoint = F, type = "column") %>%
hc_add_series_labels_values(MorAftNigArr$Period, MorAftNigArr$FlightCount/nrow(Flights), type = "pie",
name = "Arrival Ratio", colorByPoint = TRUE, center = c('75%', '25%'),
size = 200, dataLabels = list(enabled = FALSE)) %>%
hc_yAxis(title = list(text = "Flight Count by Arrival Time")) %>%
hc_xAxis(categories = FlightCountArr$ARR_TIME) %>%
hc_tooltip(valueDecimals=4)%>%
hc_legend(enabled = FALSE)
This figure shows that:
Flights By Date
FlightCountDate=count(Flights, FL_DATE);
FlightCountDate=FlightCountDate[order(as.Date(FlightCountDate$FL_DATE)),];
highchart() %>%
hc_title(text='Flights Count By Date in Dec, 2015')%>%
hc_xAxis(categories=FlightCountDate$FL_DATE) %>%
hc_add_series(data=FlightCountDate$n, type='column', colorByPoint=F, name="Flight Count By Date")
This figure shows that
Flights By Distance
FlightCountDis=count(Flights, DISTANCE);
DistCount=data.frame(Distance=c("Shorter than 500 km","Between 500 and 1000 km", "Between 1000 and 1500 km", "Between 1500 and 2000 km", "Between 2000 and 2500 km", "Between 2500 and 3000 km","Longer than 3000 km"),
FlightCount=c(nrow(subset(Flights, DISTANCE<=500)),
nrow(subset(Flights, DISTANCE>500 & DISTANCE<=1000)),
nrow(subset(Flights, DISTANCE>1000 & DISTANCE<=1500)),
nrow(subset(Flights, DISTANCE>1500 & DISTANCE<=2000)),
nrow(subset(Flights, DISTANCE>2000 & DISTANCE<=2500)),
nrow(subset(Flights, DISTANCE>2500 & DISTANCE<=3000)),
nrow(subset(Flights, DISTANCE>3000))));
highchart() %>%
hc_title(text='US Flight Distance Distribution in Dec, 2015')%>%
hc_xAxis(categories=FlightCountDis$DISTANCE) %>%
hc_add_series(data=FlightCountDis$n, type='column',colorByPoint = F, name="Flight Count By Distance") %>%
hc_add_series_labels_values(DistCount$Distance, DistCount$FlightCount/nrow(Flights), type = "pie",
name = "Distance Ratio", colorByPoint = TRUE, center = c('75%', '25%'),
size = 200, dataLabels = list(enabled = FALSE))%>%
hc_tooltip(valueDecimals=4)
This figure shows that :
Flights By Flying Time
FlightCountEla=count(Flights, ACTUAL_ELAPSED_TIME);
FlightCountEla=na.omit(FlightCountEla);
ElpTime=data.frame(Elaped=c("Less than 1 hour","Between 1 and 2 hours", "Between 2 and 3 hours","Between 3 and 4 hours","Between 4 and 5 hours", "More than 5 hours"),
FlightCount=c(nrow(subset(Flights, ACTUAL_ELAPSED_TIME<=60)),
nrow(subset(Flights, ACTUAL_ELAPSED_TIME>60 & ACTUAL_ELAPSED_TIME<=120)),
nrow(subset(Flights, ACTUAL_ELAPSED_TIME>120 & ACTUAL_ELAPSED_TIME<=180)),
nrow(subset(Flights, ACTUAL_ELAPSED_TIME>180 & ACTUAL_ELAPSED_TIME<=240)),
nrow(subset(Flights, ACTUAL_ELAPSED_TIME>240 & ACTUAL_ELAPSED_TIME<=300)),
nrow(subset(Flights, ACTUAL_ELAPSED_TIME>300))));
highchart() %>%
hc_title(text='US Flight Count by Time Elapsed in Dec, 2015')%>%
hc_xAxis(categories=FlightCountEla$ACTUAL_ELAPSED_TIME) %>%
hc_add_series(data=FlightCountEla$n, type='column', colorByPoint = F, name="Flight Count By Flying Time") %>%
hc_add_series_labels_values(ElpTime$Elaped, ElpTime$FlightCount/nrow(Flights), type = "pie",
name = "Elapsing Time Ratio", colorByPoint = TRUE, center = c('75%', '25%'),
size = 200, dataLabels = list(enabled = FALSE))%>%
hc_tooltip(valueDecimals=4)
This figure shows that:
Thare are 13 airlines providing services in Dec, 2015.
Operating Carriers
CarrierCount=count(Flights, UNIQUE_CARRIER);
carriers <- read.csv("C:/Users/weibo/Desktop/Data Scientist at Spotify/carriers.csv", header=FALSE, stringsAsFactors=FALSE);
CarrierCountFN=merge(CarrierCount, carriers, by.x='UNIQUE_CARRIER', by.y='V1');
FlightsFN=merge(Flights, carriers, by.x='UNIQUE_CARRIER', by.y='V1');
DisCarrier=ddply(FlightsFN, .(V2), summarise, sumDis=sum(DISTANCE));
highchart() %>%
hc_title(text='US Flight Distance By Carriers in Dec, 2015')%>%
hc_xAxis(categories=FlightsFN$V2) %>%
hc_add_series_boxplot(x=FlightsFN$DISTANCE, by=FlightsFN$V2, colorByPoint = T, name="Flight Distance By Carriers", outliers=F)%>%
hc_add_series_labels_values(DisCarrier$V2, DisCarrier$sumDis, type='pie', name = "Total Distance in km",
colorByPoint = TRUE, center = c('22%', '25%'),
size = 200, dataLabels = list(enabled = FALSE))
This figure (outliers in the boxplot are actually skipped) reveals the diversity of each airline with respect to the flight distance. It can been seen that:
How busy are these carriers? Let’s find out the flights of each day:
FlightsDaily=count(FlightsFN, V2, date=FL_DATE);
FlightsDaily$date=as.Date(FlightsDaily$date);
highchart() %>%
hc_title(text='Daily US Flight Count of Each Carrier In Dec, 2015') %>%
hc_xAxis(categories=FlightsDaily$date) %>%
hc_add_series_df(FlightsDaily, y=n, type="line", group=V2) %>%
hc_legend(layout = "vertical", align = "right" )%>%
hc_add_series_labels_values(CarrierCountFN$V2, CarrierCountFN$n, type = "pie",
name = "Flight Count in Dec, 2015", colorByPoint = TRUE, center = c('110%', '25%'),
size = 200, dataLabels = list(enabled = FALSE))
We can conclude that:
Distinctive Flights
Let’s see how many distinctive flight routes from all airlines. We assume that the flights operated by the same carrier, with the identical flight number, from the same origin city to the same destination, are regarded as the same flight. (This may violate the situtation that two carriers share the FL_NUM. Practically, the flight of this kind is the same flight. But under our assumption, They are two flights.)
Some flights are operated daily, but others weekly. The flight frequency is as shown in the following figure.
DistinctFlight=count(Flights, UNIQUE_CARRIER, FL_NUM, ORIGIN_CITY_NAME, DEST_CITY_NAME);
FlightsByCarriers=count(DistinctFlight[1], UNIQUE_CARRIER);
DistinctFlightCount=count(DistinctFlight[5], n);
highchart() %>%
hc_title(text='Distinctive US Flight Count Distribution in Dec, 2015')%>%
hc_add_series_labels_values(DistinctFlightCount$n, DistinctFlightCount$nn, name = "Flight Count",
colorByPoint =F, type = "column") %>%
hc_add_series_labels_values(FlightsByCarriers$UNIQUE_CARRIER, FlightsByCarriers$n, type = "pie",
name = "Distinctive Flights Count", colorByPoint = TRUE, center = c('25%', '25%'),
size = 200, dataLabels = list(enabled = FALSE))
There are 44416 distinctive flights operated in Dec, 2015, of which:
Cancellation Rate
FlightsCancelled=ddply(FlightsFN, .(V2,FL_DATE), summarise,cancelledNo=sum(CANCELLED));
FlightsCancelled$FL_DATE=as.Date(FlightsCancelled$FL_DATE);
FlightsCancelledData=merge(FlightsCancelled, FlightsDaily, by.x=c('V2', 'FL_DATE'), by.y=c('V2','date'));
FlightsCancelledData=mutate(FlightsCancelledData, CancelRate=cancelledNo/n);
CancelContribution=ddply(FlightsCancelledData, .(V2), summarise,EachSum=sum(cancelledNo), Total=sum(n));
highchart() %>%
hc_xAxis(categories=FlightsCancelledData$FL_DATE) %>%
hc_add_series_df(data=FlightsCancelledData, y=CancelRate, type="line", group = V2) %>%
hc_add_series_labels_values(CancelContribution$V2, CancelContribution$EachSum, type = "pie",
name = "Cancelled Flights Contribution", colorByPoint = TRUE, center = c('112%', '25%'),
size = 200, dataLabels = list(enabled = FALSE)) %>%
hc_legend(layout = "vertical", align = "right") %>%
hc_title(text='Flight Cancellation of Each Carrier in Dec, 2015')
From this figure, we can see:
Inter-City Conveniency
To see which combinitation of two cities are provided with most flights, we find out all cities linked by distinctive flights.
CityFlight=count(DistinctFlight[3:4], ORIGIN_CITY_NAME, DEST_CITY_NAME);
There are 3803 city combinations served by all flights. These flights cover 303 cities as origins and 303 cities as destinations.
library(httr);
library(geojsonio);
map <- "https://raw.githubusercontent.com/johan/world.geo.json/master/countries/USA.geo.json" %>%
GET() %>%
content() %>%
jsonlite::fromJSON(simplifyVector = FALSE);
airports <- read.csv("C:/Users/weibo/Desktop/Data Scientist at Spotify/airports.csv", stringsAsFactors=FALSE);
OriCity=count(Flights, ORIGIN);
OriCity=merge(OriCity, airports[c(1,6,7)], by.x='ORIGIN', by.y='iata');
OriCity=unique(merge(OriCity,Flights[8:9], by='ORIGIN'));
airpjsonOri <- geojson_json(OriCity, lat = "lat", lon = "long");
highchart(type='map') %>%
hc_title(text = "Airports Scattered in US") %>%
hc_add_series(mapData = map, showInLegend = F, nullColor = "#A9CF54", borderColor="#A9CF54") %>%
hc_add_series(data=airpjsonOri, type = "mappoint", dataLabels = list(enabled = FALSE),
name = "Airports", color = '#4F4F4F',
tooltip = list(pointFormat = "{point.properties.ORIGIN_CITY_NAME}:{point.properties.n} depature flights")) %>%
hc_mapNavigation(enabled = TRUE)
These 307 airports scatter in the 303 cities of mainland, Alaska, Hawaii and oversea territories, including Guam, Virgin Islands and Puerto Rico.
So obviously, some cities has more than one airports. See this table:
airpC=count(FlightsFN, city=ORIGIN_CITY_NAME, airport=ORIGIN);
datatable(airpC)
From this table, we can find that:
Transfering Capacity
We can explore the departure and arrival flights for each airport. And also, D/A imbalance rate can be defined as the result of departure flight count devided by arrival flight count.
OriCity=count(Flights, ORIGIN);
OriCity=unique(merge(OriCity,Flights[8:9], by='ORIGIN'));
DesCity=count(Flights, DEST);
DesCity=unique(merge(DesCity,Flights[13:14], by='DEST'));
EndCity=merge(OriCity,DesCity, by.x='ORIGIN', by.y='DEST')
highchart() %>%
hc_xAxis(categories=EndCity$ORIGIN)%>%
hc_yAxis_multiples(
list(title = list(text = "D/A Imbalance Rate"), align = "right", showFirstLabel = FALSE),
list(title = list(text = "Distinctive Flight Count"), align = "left",showFirstLabel = FALSE,opposite = TRUE)) %>%
hc_add_series(data=EndCity$n.x, colorByPoint = F, name = "Departure Flights", type='column', yAxis=1)%>%
hc_add_series(data=EndCity$n.y, colorByPoint = F, name = "Arrival Flights", type='column', yAxis=1) %>%
hc_add_series(data=EndCity$n.x/EndCity$n.y, name = "Imbalance Rate", type='line') %>%
hc_title(text='Distinctive Flights Count and Imbalance Rate for Each US Airport')%>%
hc_tooltip(valueDecimals=4)
As shown above:
Busiest Day of Each Airport
library(viridisLite);
AirpOriDate=count(Flights, FL_DATE, ORIGIN);
AirpDesDate=count(Flights, FL_DATE, DEST);
AirpDate=merge(AirpOriDate, AirpDesDate, by.x=c('ORIGIN', 'FL_DATE'), by.y=c('DEST', 'FL_DATE'));
AirpDate=mutate(AirpDate, throughput=n.x+n.y);
stpscol <- color_stops(10, viridis(20));
fntltp <- JS("function(){
return this.point.ORIGIN + ' in ' + this.series.yAxis.categories[this.point.y] + ':<br>' +
Highcharts.numberFormat(this.point.value, 0) + ' flights';}");
hchart(AirpDate, "heatmap", y = FL_DATE, x = ORIGIN, value = throughput, name='Airport Flight Throughput') %>%
hc_colorAxis(stops = stpscol, type = "logarithmic") %>%
hc_title(text = "Daily Throughput of Each Airport in Dec, 2015") %>%
hc_tooltip(formatter = fntltp) %>%
hc_legend(layout = "vertical", verticalAlign = "bottom", align = "right")
This figure gives the holistic idea about how many flights are operated by each airport everyday.
We can construct the adjacency matrix for each pair of two cities. It shoud be a 303 * 303 matrix. Due to the flight route is directed, ence in the matrix, 0 value indicates that there is no flight from ORIGIN to DEST, and 1 value for at least one flight from ORIGIN to DEST. That is, the matrix is not necessarily symmetric.
CityList=intersect(CityFlight$ORIGIN_CITY_NAME,CityFlight$DEST_CITY_NAME);
AdjMatrix=matrix(0, nrow=length(unique(CityFlight$ORIGIN_CITY_NAME)), ncol=length(unique(CityFlight$DEST_CITY_NAME)));
colnames(AdjMatrix)=c(CityList);
rownames(AdjMatrix)=c(CityList);
pb=txtProgressBar(min=0,max=1,style=3)
##
|
| | 0%
for (i in 1 : nrow(AdjMatrix)){
for (j in 1: ncol(AdjMatrix)){
if (is.element(CityList[j], CityFlight$DEST_CITY_NAME[CityFlight[,1]==CityList[i]])==T) {AdjMatrix[i, j]=1}
else {AdjMatrix[i, j]=0}}
setTxtProgressBar(pb,value=i/nrow(AdjMatrix))};
##
|
| | 1%
|
|= | 1%
|
|= | 2%
|
|== | 2%
|
|== | 3%
|
|== | 4%
|
|=== | 4%
|
|=== | 5%
|
|==== | 6%
|
|==== | 7%
|
|===== | 7%
|
|===== | 8%
|
|====== | 9%
|
|====== | 10%
|
|======= | 10%
|
|======= | 11%
|
|======== | 12%
|
|======== | 13%
|
|========= | 13%
|
|========= | 14%
|
|========= | 15%
|
|========== | 15%
|
|========== | 16%
|
|=========== | 16%
|
|=========== | 17%
|
|============ | 18%
|
|============ | 19%
|
|============= | 19%
|
|============= | 20%
|
|============== | 21%
|
|============== | 22%
|
|=============== | 22%
|
|=============== | 23%
|
|=============== | 24%
|
|================ | 24%
|
|================ | 25%
|
|================= | 25%
|
|================= | 26%
|
|================= | 27%
|
|================== | 27%
|
|================== | 28%
|
|=================== | 29%
|
|=================== | 30%
|
|==================== | 30%
|
|==================== | 31%
|
|===================== | 32%
|
|===================== | 33%
|
|====================== | 33%
|
|====================== | 34%
|
|======================= | 35%
|
|======================= | 36%
|
|======================== | 36%
|
|======================== | 37%
|
|======================== | 38%
|
|========================= | 38%
|
|========================= | 39%
|
|========================== | 39%
|
|========================== | 40%
|
|========================== | 41%
|
|=========================== | 41%
|
|=========================== | 42%
|
|============================ | 43%
|
|============================ | 44%
|
|============================= | 44%
|
|============================= | 45%
|
|============================== | 46%
|
|============================== | 47%
|
|=============================== | 47%
|
|=============================== | 48%
|
|================================ | 49%
|
|================================ | 50%
|
|================================= | 50%
|
|================================= | 51%
|
|================================== | 52%
|
|================================== | 53%
|
|=================================== | 53%
|
|=================================== | 54%
|
|==================================== | 55%
|
|==================================== | 56%
|
|===================================== | 56%
|
|===================================== | 57%
|
|====================================== | 58%
|
|====================================== | 59%
|
|======================================= | 59%
|
|======================================= | 60%
|
|======================================= | 61%
|
|======================================== | 61%
|
|======================================== | 62%
|
|========================================= | 62%
|
|========================================= | 63%
|
|========================================= | 64%
|
|========================================== | 64%
|
|========================================== | 65%
|
|=========================================== | 66%
|
|=========================================== | 67%
|
|============================================ | 67%
|
|============================================ | 68%
|
|============================================= | 69%
|
|============================================= | 70%
|
|============================================== | 70%
|
|============================================== | 71%
|
|=============================================== | 72%
|
|=============================================== | 73%
|
|================================================ | 73%
|
|================================================ | 74%
|
|================================================ | 75%
|
|================================================= | 75%
|
|================================================= | 76%
|
|================================================== | 76%
|
|================================================== | 77%
|
|================================================== | 78%
|
|=================================================== | 78%
|
|=================================================== | 79%
|
|==================================================== | 80%
|
|==================================================== | 81%
|
|===================================================== | 81%
|
|===================================================== | 82%
|
|====================================================== | 83%
|
|====================================================== | 84%
|
|======================================================= | 84%
|
|======================================================= | 85%
|
|======================================================== | 85%
|
|======================================================== | 86%
|
|======================================================== | 87%
|
|========================================================= | 87%
|
|========================================================= | 88%
|
|========================================================== | 89%
|
|========================================================== | 90%
|
|=========================================================== | 90%
|
|=========================================================== | 91%
|
|============================================================ | 92%
|
|============================================================ | 93%
|
|============================================================= | 93%
|
|============================================================= | 94%
|
|============================================================== | 95%
|
|============================================================== | 96%
|
|=============================================================== | 96%
|
|=============================================================== | 97%
|
|=============================================================== | 98%
|
|================================================================ | 98%
|
|================================================================ | 99%
|
|=================================================================| 99%
|
|=================================================================| 100%
fntltp <- JS("function(){return 'DEST: ' + this.point.name + ' as ORIGIN: ' + this.point.value}");
hchart(AdjMatrix) %>%
hc_legend(enabled=F) %>%
hc_tooltip(formatter=fntltp) %>%
hc_title(text='Connectedness of Two US Cities by Flight')
This data presents the possibility of travelling between two US cities with no-transfer flights. As shown above, existing flights covers 0.0414 of all no-stop flight requirements. If one transfer is allowed, let’s see how many cities will be connected. To do this, we employ the reachability theroy.
AdjMatrix1Trans=AdjMatrix%*%AdjMatrix;
AdjMatrix1Transfer=AdjMatrix+AdjMatrix1Trans;
for (i in 1: nrow(AdjMatrix1Transfer)){
for (j in 1: ncol(AdjMatrix1Transfer)){
if (AdjMatrix1Transfer[i,j]>0){AdjMatrix1Transfer[i,j]=1}
else {AdjMatrix1Transfer[i,j]=AdjMatrix1Transfer[i,j]}}}
hchart(AdjMatrix1Transfer) %>%
hc_legend(enabled=F) %>%
hc_tooltip(formatter=fntltp) %>%
hc_title(text='Connectedness of Two US Cities by One-Transfer Flight')
These flights cover 0.6 of all flights requirements between two arbitrary cities with at most one transfer.
Let’s make two transfers.
AdjMatrix2Trans=AdjMatrix1Trans%*%AdjMatrix;
AdjMatrix2Transfer=AdjMatrix1Transfer+AdjMatrix2Trans;
for (i in 1: nrow(AdjMatrix2Transfer)){
for (j in 1: ncol(AdjMatrix2Transfer)){
if (AdjMatrix2Transfer[i,j]>0){AdjMatrix2Transfer[i,j]=1}
else {AdjMatrix2Transfer[i,j]=AdjMatrix2Transfer[i,j]}}}
hchart(AdjMatrix2Transfer) %>%
hc_legend(enabled=F) %>%
hc_tooltip(formatter=fntltp) %>%
hc_title(text='Connectedness of Two US Cities by Two-Transfer Flight')
8.821110^{4} fligts with at most two transfers are provided. 0.9608 of all requirements are met. There are still some cities out of flight reach. Three transfers may make all cities connected in dual directions.
AdjMatrix3Trans=AdjMatrix2Trans%*%AdjMatrix;
AdjMatrix3Transfer=AdjMatrix2Transfer+AdjMatrix3Trans;
for (i in 1: nrow(AdjMatrix3Transfer)){
for (j in 1: ncol(AdjMatrix3Transfer)){
if (AdjMatrix3Transfer[i,j]>0){AdjMatrix3Transfer[i,j]=1}
else {AdjMatrix3Transfer[i,j]=AdjMatrix3Transfer[i,j]}}}
hchart(AdjMatrix3Transfer) %>%
hc_legend(enabled=F) %>%
hc_tooltip(formatter=fntltp) %>%
hc_title(text='Connectedness of Two Cities by Three-Transfer Flight')