Study

Data Preparation

This script reads in the analytic data for the EHAM use case. The pre-processing of the data is described in ProessingCode.Rmd.

# settings
dataFolder <- "data"
dataFile <- "data_2013.csv"
setwd("D:/my documents/Msc/Year 3/ATRN704/CASE STUDY/R-files")

# read-in data
data <- read.csv(file.path(dataFolder, dataFile), header = TRUE)

Basic Data Cleaning

# inspect data
str(data)
## 'data.frame':    439024 obs. of  15 variables:
##  $ AP_C_FLTID         : Factor w/ 9180 levels "3BRGT","4XAOO",..: 9177 1120 1106 6943 3907 3915 4195 7028 1382 1358 ...
##  $ AP_C_REG           : Factor w/ 4898 levels "","10076","144618",..: 4069 3915 3916 4085 4017 3812 3846 4088 3289 3271 ...
##  $ ADEP_ICAO          : Factor w/ 726 levels "","AGGL","BGSF",..: 164 263 293 266 615 627 703 164 339 326 ...
##  $ ADES_ICAO          : Factor w/ 683 levels "","BIKF","BIRK",..: 154 154 154 154 154 154 154 537 154 154 ...
##  $ MVT_TIME_UTC       : Factor w/ 436106 levels "2013/01/01 00:42:00",..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ BLOCK_TIME_UTC     : Factor w/ 433381 levels "2013/01/01 00:44:00",..: 1 2 3 4 5 7 8 6 9 10 ...
##  $ SCHED_TIME_UTC     : Factor w/ 88139 levels "2013/01/01 00:42:21",..: 1 2 5 3 7 8 9 6 11 11 ...
##  $ AP_C_FLTRUL        : Factor w/ 3 levels "","IFR","VFR": 1 1 2 1 2 2 2 2 2 2 ...
##  $ AP_C_FLTTYP        : Factor w/ 5 levels "G","M","N","S",..: 5 3 3 3 4 4 4 3 4 4 ...
##  $ ARCTYP             : Factor w/ 190 levels "","A124","A139",..: 3 39 39 39 154 11 42 39 12 12 ...
##  $ AC_CLASS           : Factor w/ 8 levels "H","HEL","LJ",..: 2 6 6 6 1 1 1 6 1 1 ...
##  $ AP_C_RWY           : Factor w/ 13 levels "04","06","09",..: 13 6 6 6 6 6 6 8 6 6 ...
##  $ AP_C_STND          : Factor w/ 237 levels "A31","A32","A33",..: 148 80 81 75 121 125 124 133 110 111 ...
##  $ SRC_PHASE          : Factor w/ 2 levels "ARR","DEP": 1 1 1 1 1 1 1 2 1 1 ...
##  $ PREV_BLOCK_TIME_UTC: Factor w/ 195451 levels "","2013/01/01 03:54:32",..: 1 1 1 1 1 3 1 2 1 1 ...
# coerce times to timestamps
data$MVT_TIME_UTC <- as.POSIXlt(data$MVT_TIME_UTC, tz = "GMT")
data$BLOCK_TIME_UTC <- as.POSIXlt(data$BLOCK_TIME_UTC, tz = "GMT")
data$SCHED_TIME_UTC <- as.POSIXlt(data$SCHED_TIME_UTC, tz = "GMT")

# coerce PREV_BLOCK_TIME
isEmpty <- data$PREV_BLOCK_TIME_UTC == "" # logical vector
data$PREV_BLOCK_TIME_UTC[isEmpty] <- NA   # set empty times to NA
data$PREV_BLOCK_TIME_UTC <- as.POSIXlt(data$PREV_BLOCK_TIME_UTC, tz = "GMT")

Preparatory Variable Calculation

Block delay. We define the block-delay as the difference between the scheduled block time and the actual block time. This equally applies for arrivals and departures. E.g for arrivals: SCHED_TIME_UTC = SIBT for arrival and SOBT for departure (slot time) BLOCK_TIME_UTC = AIBT for arrival and AOBT for departure

AIBT - SIBT > 0 is a late arrival; AIBT - SIBT <= 0 an early arrival.

# block delay (in minutes)
data$BLOCK_DLY <- (data$BLOCK_TIME_UTC - data$SCHED_TIME_UTC)/60

Create an Airline Identifier

The Airline Identifier is to be extracted from the flight number AP_C_FLTID. It is assumed that the 3st three charaters are descriptive for the Airline name.

# define airline ID

data$Airline_ID <- substr(data$AP_C_FLTID,1,3)

## 

Trimmed dataset

Scope the dataset to a certain time period (Summer season 2013)

# a few days of traffic
start <- as.POSIXlt("2013-04-01 00:00:00", tz = "GMT")
end   <- as.POSIXlt("2013-10-31 23:59:59", tz = "GMT")

trimmed <- data[data$MVT_TIME_UTC >= start & data$MVT_TIME_UTC <= end,]

Selection of the Airport pair - Intercontinental Hubs

For the next chunk we re-use the pairs subset (run chunks from above to have pairs dataframe generated). The extract the EHAM - xxxx city pair

city   <- c("EGLL","EDDF","LFPG","LEMD")

sourcephase <- c("DEP","ARR")

# logical vector for subsetting
isOutboundTo <- trimmed$ADES_ICAO %in% city
isInboundFm  <- trimmed$ADEP_ICAO %in% city

EHAM_outbound <- trimmed[isOutboundTo,]
EHAM_inbound <- trimmed[isInboundFm,]

table(EHAM_outbound$SRC_PHASE)
## 
##   ARR   DEP 
##     0 10916
table(EHAM_outbound$Airline_ID)
## 
##  8PM  9AD  ABR  AEA  AFR  AIZ  AMX  ANE  BAH  BAW  BCS  CGP  CKK  CSN  DCE 
##    1    1    2  428 1242    1    1    1    1 1633   23    1    1    1    1 
##  DIM  DIN  DLH  ELY  ENT  EXU  FDX  GES  HRN  IBE  IBS  JEI  JNL  KLM  LNX 
##    1    1 1475    1    1    1  120    3    1    1    2    2    2 5914    1 
##  LXJ  N24  N47  N50  N52  N53  N76  N80  NJE  PGT  PHI  PLM  RAM  RJD  SLM 
##    1    3    1    1    1    1    2    1    8    1    1    1    1    1    3 
##  SVW  SWT  TCL  TFL  TGM  TRA  URI  VHM  VIR  VJS  XAF 
##    1    2    1   11    3    2    1    1    1    1    1

Visualize the statistics

Below statistics are determined for all airlines, as well as per individual airline

for (i in 1:4)
  {


  hist(as.numeric(EHAM_outbound$BLOCK_DLY[EHAM_outbound$ADES_ICAO == city[i] ]),
     freq=NULL,
     right=TRUE,
     col="red",
     plot=TRUE,
     labels=TRUE,
     main = paste("EHAM Departure Slot Variability to ",city[i],
                 "\n Mean",mean(as.numeric(EHAM_outbound$BLOCK_DLY[EHAM_outbound$ADES_ICAO == city[i] ])),
                 "\n SD"  ,sd(as.numeric(EHAM_outbound$BLOCK_DLY[EHAM_outbound$ADES_ICAO == city[i] ]))
                 ),
     xlab="Block Delay\n[Pos. is late]",
     breaks=150,
     xlim=range(-50,50))

  hist(as.numeric(EHAM_inbound$BLOCK_DLY[EHAM_inbound$ADEP_ICAO == city[i] ]),
     freq=NULL,
     right=TRUE,
     col="blue",
     plot=TRUE,
     labels=TRUE,
     main = paste("EHAM Arrival Slot Variability from ", city[i],
                  "\n Mean",mean(as.numeric(EHAM_inbound$BLOCK_DLY[EHAM_inbound$ADEP_ICAO == city[i] ])),
                  "\n SD",sd(as.numeric(EHAM_inbound$BLOCK_DLY[EHAM_inbound$ADEP_ICAO == city[i] ]))                  ),
     xlab="Block Delay [Pos. is late]",
     breaks=150,
     xlim=range(-50,50))

  # plot block delays
  plot(EHAM_outbound$SCHED_TIME_UTC, EHAM_outbound$BLOCK_DLY, 
     type = "n",                   # blank plot for range / dimensions
     main = paste("EHAM Departure Slot Variability to ", city[i]),
     xlab = "Date",    # a nice x-axis label
     ylim = range(-75,75)
     )

    points(subset(EHAM_outbound, 
              ADES_ICAO == city[i],
                            select = c(BLOCK_TIME_UTC, BLOCK_DLY)
              ),
       col = "red"
       )

  # plot block delays
  plot(EHAM_inbound$SCHED_TIME_UTC, EHAM_inbound$BLOCK_DLY, 
     type = "n",                   # blank plot for range / dimensions
     main = paste("EHAM Arrival Slot Variability from ", city[i]),
     xlab = "Date",    # a nice x-axis label
     ylim = range(-75,75)
     )

    points(subset(EHAM_inbound, 
              ADEP_ICAO == city[i],
                            select = c(BLOCK_TIME_UTC, BLOCK_DLY)
              ),
       col = "blue"
       )


}

plot of chunk runs_stats plot of chunk runs_stats plot of chunk runs_stats plot of chunk runs_stats plot of chunk runs_stats plot of chunk runs_stats plot of chunk runs_stats plot of chunk runs_stats plot of chunk runs_stats plot of chunk runs_stats plot of chunk runs_stats plot of chunk runs_stats plot of chunk runs_stats plot of chunk runs_stats plot of chunk runs_stats plot of chunk runs_stats

Appendices

System Information

Sys.info()
##                      sysname                      release 
##                    "Windows"                         "XP" 
##                      version                     nodename 
## "build 2600, Service Pack 3"                  "KLM007683" 
##                      machine                        login 
##                        "x86"                   "klm35521" 
##                         user               effective_user 
##                   "klm35521"                   "klm35521"