This script reads in the analytic data for the EHAM use case. The pre-processing of the data is described in ProessingCode.Rmd.
# settings
dataFolder <- "data"
dataFile <- "data_2013.csv"
setwd("D:/my documents/Msc/Year 3/ATRN704/CASE STUDY/R-files")
# read-in data
data <- read.csv(file.path(dataFolder, dataFile), header = TRUE)
# inspect data
str(data)
## 'data.frame': 439024 obs. of 15 variables:
## $ AP_C_FLTID : Factor w/ 9180 levels "3BRGT","4XAOO",..: 9177 1120 1106 6943 3907 3915 4195 7028 1382 1358 ...
## $ AP_C_REG : Factor w/ 4898 levels "","10076","144618",..: 4069 3915 3916 4085 4017 3812 3846 4088 3289 3271 ...
## $ ADEP_ICAO : Factor w/ 726 levels "","AGGL","BGSF",..: 164 263 293 266 615 627 703 164 339 326 ...
## $ ADES_ICAO : Factor w/ 683 levels "","BIKF","BIRK",..: 154 154 154 154 154 154 154 537 154 154 ...
## $ MVT_TIME_UTC : Factor w/ 436106 levels "2013/01/01 00:42:00",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ BLOCK_TIME_UTC : Factor w/ 433381 levels "2013/01/01 00:44:00",..: 1 2 3 4 5 7 8 6 9 10 ...
## $ SCHED_TIME_UTC : Factor w/ 88139 levels "2013/01/01 00:42:21",..: 1 2 5 3 7 8 9 6 11 11 ...
## $ AP_C_FLTRUL : Factor w/ 3 levels "","IFR","VFR": 1 1 2 1 2 2 2 2 2 2 ...
## $ AP_C_FLTTYP : Factor w/ 5 levels "G","M","N","S",..: 5 3 3 3 4 4 4 3 4 4 ...
## $ ARCTYP : Factor w/ 190 levels "","A124","A139",..: 3 39 39 39 154 11 42 39 12 12 ...
## $ AC_CLASS : Factor w/ 8 levels "H","HEL","LJ",..: 2 6 6 6 1 1 1 6 1 1 ...
## $ AP_C_RWY : Factor w/ 13 levels "04","06","09",..: 13 6 6 6 6 6 6 8 6 6 ...
## $ AP_C_STND : Factor w/ 237 levels "A31","A32","A33",..: 148 80 81 75 121 125 124 133 110 111 ...
## $ SRC_PHASE : Factor w/ 2 levels "ARR","DEP": 1 1 1 1 1 1 1 2 1 1 ...
## $ PREV_BLOCK_TIME_UTC: Factor w/ 195451 levels "","2013/01/01 03:54:32",..: 1 1 1 1 1 3 1 2 1 1 ...
# coerce times to timestamps
data$MVT_TIME_UTC <- as.POSIXlt(data$MVT_TIME_UTC, tz = "GMT")
data$BLOCK_TIME_UTC <- as.POSIXlt(data$BLOCK_TIME_UTC, tz = "GMT")
data$SCHED_TIME_UTC <- as.POSIXlt(data$SCHED_TIME_UTC, tz = "GMT")
# coerce PREV_BLOCK_TIME
isEmpty <- data$PREV_BLOCK_TIME_UTC == "" # logical vector
data$PREV_BLOCK_TIME_UTC[isEmpty] <- NA # set empty times to NA
data$PREV_BLOCK_TIME_UTC <- as.POSIXlt(data$PREV_BLOCK_TIME_UTC, tz = "GMT")
Block delay. We define the block-delay as the difference between the scheduled block time and the actual block time. This equally applies for arrivals and departures. E.g for arrivals: SCHED_TIME_UTC = SIBT for arrival and SOBT for departure (slot time) BLOCK_TIME_UTC = AIBT for arrival and AOBT for departure
AIBT - SIBT > 0 is a late arrival; AIBT - SIBT <= 0 an early arrival.
# block delay (in minutes)
data$BLOCK_DLY <- (data$BLOCK_TIME_UTC - data$SCHED_TIME_UTC)/60
The Airline Identifier is to be extracted from the flight number AP_C_FLTID. It is assumed that the 3st three charaters are descriptive for the Airline name.
# define airline ID
data$Airline_ID <- substr(data$AP_C_FLTID,1,3)
##
Scope the dataset to a certain time period (Summer season 2013)
# a few days of traffic
start <- as.POSIXlt("2013-04-01 00:00:00", tz = "GMT")
end <- as.POSIXlt("2013-10-31 23:59:59", tz = "GMT")
trimmed <- data[data$MVT_TIME_UTC >= start & data$MVT_TIME_UTC <= end,]
For the next chunk we re-use the pairs subset (run chunks from above to have pairs dataframe generated). The extract the EHAM - xxxx city pair
city <- c("EGLL","EDDF","LFPG","LEMD")
sourcephase <- c("DEP","ARR")
# logical vector for subsetting
isOutboundTo <- trimmed$ADES_ICAO %in% city
isInboundFm <- trimmed$ADEP_ICAO %in% city
EHAM_outbound <- trimmed[isOutboundTo,]
EHAM_inbound <- trimmed[isInboundFm,]
table(EHAM_outbound$SRC_PHASE)
##
## ARR DEP
## 0 10916
table(EHAM_outbound$Airline_ID)
##
## 8PM 9AD ABR AEA AFR AIZ AMX ANE BAH BAW BCS CGP CKK CSN DCE
## 1 1 2 428 1242 1 1 1 1 1633 23 1 1 1 1
## DIM DIN DLH ELY ENT EXU FDX GES HRN IBE IBS JEI JNL KLM LNX
## 1 1 1475 1 1 1 120 3 1 1 2 2 2 5914 1
## LXJ N24 N47 N50 N52 N53 N76 N80 NJE PGT PHI PLM RAM RJD SLM
## 1 3 1 1 1 1 2 1 8 1 1 1 1 1 3
## SVW SWT TCL TFL TGM TRA URI VHM VIR VJS XAF
## 1 2 1 11 3 2 1 1 1 1 1
Below statistics are determined for all airlines, as well as per individual airline
for (i in 1:4)
{
hist(as.numeric(EHAM_outbound$BLOCK_DLY[EHAM_outbound$ADES_ICAO == city[i] ]),
freq=NULL,
right=TRUE,
col="red",
plot=TRUE,
labels=TRUE,
main = paste("EHAM Departure Slot Variability to ",city[i],
"\n Mean",mean(as.numeric(EHAM_outbound$BLOCK_DLY[EHAM_outbound$ADES_ICAO == city[i] ])),
"\n SD" ,sd(as.numeric(EHAM_outbound$BLOCK_DLY[EHAM_outbound$ADES_ICAO == city[i] ]))
),
xlab="Block Delay\n[Pos. is late]",
breaks=150,
xlim=range(-50,50))
hist(as.numeric(EHAM_inbound$BLOCK_DLY[EHAM_inbound$ADEP_ICAO == city[i] ]),
freq=NULL,
right=TRUE,
col="blue",
plot=TRUE,
labels=TRUE,
main = paste("EHAM Arrival Slot Variability from ", city[i],
"\n Mean",mean(as.numeric(EHAM_inbound$BLOCK_DLY[EHAM_inbound$ADEP_ICAO == city[i] ])),
"\n SD",sd(as.numeric(EHAM_inbound$BLOCK_DLY[EHAM_inbound$ADEP_ICAO == city[i] ])) ),
xlab="Block Delay [Pos. is late]",
breaks=150,
xlim=range(-50,50))
# plot block delays
plot(EHAM_outbound$SCHED_TIME_UTC, EHAM_outbound$BLOCK_DLY,
type = "n", # blank plot for range / dimensions
main = paste("EHAM Departure Slot Variability to ", city[i]),
xlab = "Date", # a nice x-axis label
ylim = range(-75,75)
)
points(subset(EHAM_outbound,
ADES_ICAO == city[i],
select = c(BLOCK_TIME_UTC, BLOCK_DLY)
),
col = "red"
)
# plot block delays
plot(EHAM_inbound$SCHED_TIME_UTC, EHAM_inbound$BLOCK_DLY,
type = "n", # blank plot for range / dimensions
main = paste("EHAM Arrival Slot Variability from ", city[i]),
xlab = "Date", # a nice x-axis label
ylim = range(-75,75)
)
points(subset(EHAM_inbound,
ADEP_ICAO == city[i],
select = c(BLOCK_TIME_UTC, BLOCK_DLY)
),
col = "blue"
)
}
Sys.info()
## sysname release
## "Windows" "XP"
## version nodename
## "build 2600, Service Pack 3" "KLM007683"
## machine login
## "x86" "klm35521"
## user effective_user
## "klm35521" "klm35521"