This script reads in the analytic data for the EHAM use case. The pre-processing of the data is described in ProessingCode.Rmd.
# settings
dataFolder <- "data"
dataFile <- "data_2013.csv"
setwd("D:/my documents/Msc/Year 3/ATRN704/CASE STUDY/R-files")
# read-in data
data <- read.csv(file.path(dataFolder, dataFile), header = TRUE)
# inspect data
str(data)
## 'data.frame': 439024 obs. of 15 variables:
## $ AP_C_FLTID : Factor w/ 9180 levels "3BRGT","4XAOO",..: 9177 1120 1106 6943 3907 3915 4195 7028 1382 1358 ...
## $ AP_C_REG : Factor w/ 4898 levels "","10076","144618",..: 4069 3915 3916 4085 4017 3812 3846 4088 3289 3271 ...
## $ ADEP_ICAO : Factor w/ 726 levels "","AGGL","BGSF",..: 164 263 293 266 615 627 703 164 339 326 ...
## $ ADES_ICAO : Factor w/ 683 levels "","BIKF","BIRK",..: 154 154 154 154 154 154 154 537 154 154 ...
## $ MVT_TIME_UTC : Factor w/ 436106 levels "2013/01/01 00:42:00",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ BLOCK_TIME_UTC : Factor w/ 433381 levels "2013/01/01 00:44:00",..: 1 2 3 4 5 7 8 6 9 10 ...
## $ SCHED_TIME_UTC : Factor w/ 88139 levels "2013/01/01 00:42:21",..: 1 2 5 3 7 8 9 6 11 11 ...
## $ AP_C_FLTRUL : Factor w/ 3 levels "","IFR","VFR": 1 1 2 1 2 2 2 2 2 2 ...
## $ AP_C_FLTTYP : Factor w/ 5 levels "G","M","N","S",..: 5 3 3 3 4 4 4 3 4 4 ...
## $ ARCTYP : Factor w/ 190 levels "","A124","A139",..: 3 39 39 39 154 11 42 39 12 12 ...
## $ AC_CLASS : Factor w/ 8 levels "H","HEL","LJ",..: 2 6 6 6 1 1 1 6 1 1 ...
## $ AP_C_RWY : Factor w/ 13 levels "04","06","09",..: 13 6 6 6 6 6 6 8 6 6 ...
## $ AP_C_STND : Factor w/ 237 levels "A31","A32","A33",..: 148 80 81 75 121 125 124 133 110 111 ...
## $ SRC_PHASE : Factor w/ 2 levels "ARR","DEP": 1 1 1 1 1 1 1 2 1 1 ...
## $ PREV_BLOCK_TIME_UTC: Factor w/ 195451 levels "","2013/01/01 03:54:32",..: 1 1 1 1 1 3 1 2 1 1 ...
# coerce times to timestamps
data$MVT_TIME_UTC <- as.POSIXlt(data$MVT_TIME_UTC, tz = "GMT")
data$BLOCK_TIME_UTC <- as.POSIXlt(data$BLOCK_TIME_UTC, tz = "GMT")
data$SCHED_TIME_UTC <- as.POSIXlt(data$SCHED_TIME_UTC, tz = "GMT")
# coerce PREV_BLOCK_TIME
isEmpty <- data$PREV_BLOCK_TIME_UTC == "" # logical vector
data$PREV_BLOCK_TIME_UTC[isEmpty] <- NA # set empty times to NA
data$PREV_BLOCK_TIME_UTC <- as.POSIXlt(data$PREV_BLOCK_TIME_UTC, tz = "GMT")
Block delay. We define the block-delay as the difference between the scheduled block time and the actual block time. This equally applies for arrivals and departures. E.g for arrivals: SCHED_TIME_UTC = SIBT for arrival and SOBT for departure (slot time) BLOCK_TIME_UTC = AIBT for arrival and AOBT for departure
AIBT - SIBT > 0 is a late arrival; AIBT - SIBT <= 0 an early arrival.
# block delay (in minutes)
data$BLOCK_DLY <- (data$BLOCK_TIME_UTC - data$SCHED_TIME_UTC)/60
The Airline Identifier is to be extracted from the flight number AP_C_FLTID. It is assumed that the 3st three charaters are descriptive for the Airline name.
# define airline ID
data$Airline_ID <- substr(data$AP_C_FLTID,1,3)
##
Scope the dataset to a certain time period (Summer season 2013)
# a few days of traffic
start <- as.POSIXlt("2013-04-01 00:00:00", tz = "GMT")
end <- as.POSIXlt("2013-10-31 23:59:59", tz = "GMT")
trimmed <- data[data$MVT_TIME_UTC >= start & data$MVT_TIME_UTC <= end,]
For the next chunk we re-use the pairs subset (run chunks from above to have pairs dataframe generated). The extract the EHAM - xxxx city pair
city <- c("EDDB","EDDM","LTBA","LIRF","LEBL","LSZH","LOWW","EGKK","EKCH","ENGM","EBBR","ESSA","LIMC","EGCC","LPPT")
sourcephase <- c("DEP","ARR")
# logical vector for subsetting
isOutboundTo <- trimmed$ADES_ICAO %in% city
isInboundFm <- trimmed$ADEP_ICAO %in% city
EHAM_outbound <- trimmed[isOutboundTo,]
EHAM_inbound <- trimmed[isInboundFm,]
table(EHAM_outbound$SRC_PHASE)
##
## ARR DEP
## 0 25770
table(EHAM_outbound$Airline_ID)
##
## AAB ABW ADN AHO AJU AOJ ATL AUA AUF AWC AZA BAW
## 5 4 4 5 1 4 1 853 1 1 642 667
## BCY BEE BEL BGM BID BLJ BMW BOO BPA CAI CAZ CCS
## 1 1 1 1 2 1 2 2 2 6 1 1
## CGS CGX CND CSD CTM DAF DAJ DAR DCA DHK DIS DLH
## 1 1 2 1 1 1 2 1 1 1 2 1311
## EDC EFD ENT EXS EXU EZY FDX FHY FLI FPO FTY FYG
## 1 3 3 3 1 3257 52 1 1 1 1 2
## GAC GDK GHC GMI GSJ GSX GXX HAY HBJ HBL HBV HRN
## 3 1 4 1 1 1 1 1 2 1 2 2
## HTM HYR IBS ICN ING JAF JAR JEI JNL KLM LMJ LNX
## 3 1 1 1 1 20 1 2 19 12537 1 2
## LXA LXJ LZB MAS MAY MBT MDT MGO MHV MJJ MMD MNB
## 1 2 1 1 1 2 1 1 1 1 1 44
## MND MON N12 N24 N25 N28 N32 N35 N40 N50 N51 N52
## 1 6 1 7 1 1 1 1 1 1 1 1
## N59 N71 N80 N88 N92 NAF NAX NCA NJE NLV NLY NOS
## 1 2 1 2 1 1 553 2 36 1 7 2
## OBS OEF OEH OHY OOA OOC OOE OOS OPJ P4B PAN PAV
## 1 3 1 9 7 1 1 1 1 1 1 2
## PGT PHA PHI PHM PHS PJS PNC PRD PVG QGA QTR RAM
## 4 1 2 3 1 1 1 1 2 3 1 1
## RBB REN RRR RZO SAS SAZ SLM SSR SSZ STQ SWR SXN
## 1 1 1 2 1462 1 3 1 1 1 854 1
## TAP TCC TCK TCS TCW TCX TFL THY TRA TUI TVS VCG
## 626 1 2 1 4 3 22 862 896 1 3 1
## VHM VJS VKG VLG VM9 VPB VPC VVV WHT XAB XLF XPE
## 3 1 2 792 1 1 2 1 3 1 2 2
Below statistics are determined for all airlines, as well as per individual airline
for (i in 1:15)
{
hist(as.numeric(EHAM_outbound$BLOCK_DLY[EHAM_outbound$ADES_ICAO == city[i] ]),
freq=NULL,
right=TRUE,
col="red",
plot=TRUE,
labels=TRUE,
main = paste("EHAM Departure Slot Variability to ",city[i],
"\n Mean",mean(as.numeric(EHAM_outbound$BLOCK_DLY[EHAM_outbound$ADES_ICAO == city[i] ])),
"\n SD" ,sd(as.numeric(EHAM_outbound$BLOCK_DLY[EHAM_outbound$ADES_ICAO == city[i] ]))
),
xlab="Block Delay\n[Pos. is late]",
breaks=150,
xlim=range(-50,50))
hist(as.numeric(EHAM_inbound$BLOCK_DLY[EHAM_inbound$ADEP_ICAO == city[i] ]),
freq=NULL,
right=TRUE,
col="blue",
plot=TRUE,
labels=TRUE,
main = paste("EHAM Arrival Slot Variability from ", city[i],
"\n Mean",mean(as.numeric(EHAM_inbound$BLOCK_DLY[EHAM_inbound$ADEP_ICAO == city[i] ])),
"\n SD",sd(as.numeric(EHAM_inbound$BLOCK_DLY[EHAM_inbound$ADEP_ICAO == city[i] ])) ),
xlab="Block Delay [Pos. is late]",
breaks=150,
xlim=range(-50,50))
# plot block delays
plot(EHAM_outbound$SCHED_TIME_UTC, EHAM_outbound$BLOCK_DLY,
type = "n", # blank plot for range / dimensions
main = paste("EHAM Departure Slot Variability to ", city[i]),
xlab = "Date", # a nice x-axis label
ylim = range(-75,75)
)
points(subset(EHAM_outbound,
ADES_ICAO == city[i],
select = c(BLOCK_TIME_UTC, BLOCK_DLY)
),
col = "red"
)
# plot block delays
plot(EHAM_inbound$SCHED_TIME_UTC, EHAM_inbound$BLOCK_DLY,
type = "n", # blank plot for range / dimensions
main = paste("EHAM Arrival Slot Variability from ", city[i]),
xlab = "Date", # a nice x-axis label
ylim = range(-75,75)
)
points(subset(EHAM_inbound,
ADEP_ICAO == city[i],
select = c(BLOCK_TIME_UTC, BLOCK_DLY)
),
col = "blue"
)
}
Sys.info()
## sysname release
## "Windows" "XP"
## version nodename
## "build 2600, Service Pack 3" "KLM007683"
## machine login
## "x86" "klm35521"
## user effective_user
## "klm35521" "klm35521"