Study

Data Preparation

This script reads in the analytic data for the EHAM use case. The pre-processing of the data is described in ProessingCode.Rmd.

# settings
dataFolder <- "data"
dataFile <- "data_2013.csv"
setwd("D:/my documents/Msc/Year 3/ATRN704/CASE STUDY/R-files")

# read-in data
data <- read.csv(file.path(dataFolder, dataFile), header = TRUE)

Basic Data Cleaning

# inspect data
str(data)
## 'data.frame':    439024 obs. of  15 variables:
##  $ AP_C_FLTID         : Factor w/ 9180 levels "3BRGT","4XAOO",..: 9177 1120 1106 6943 3907 3915 4195 7028 1382 1358 ...
##  $ AP_C_REG           : Factor w/ 4898 levels "","10076","144618",..: 4069 3915 3916 4085 4017 3812 3846 4088 3289 3271 ...
##  $ ADEP_ICAO          : Factor w/ 726 levels "","AGGL","BGSF",..: 164 263 293 266 615 627 703 164 339 326 ...
##  $ ADES_ICAO          : Factor w/ 683 levels "","BIKF","BIRK",..: 154 154 154 154 154 154 154 537 154 154 ...
##  $ MVT_TIME_UTC       : Factor w/ 436106 levels "2013/01/01 00:42:00",..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ BLOCK_TIME_UTC     : Factor w/ 433381 levels "2013/01/01 00:44:00",..: 1 2 3 4 5 7 8 6 9 10 ...
##  $ SCHED_TIME_UTC     : Factor w/ 88139 levels "2013/01/01 00:42:21",..: 1 2 5 3 7 8 9 6 11 11 ...
##  $ AP_C_FLTRUL        : Factor w/ 3 levels "","IFR","VFR": 1 1 2 1 2 2 2 2 2 2 ...
##  $ AP_C_FLTTYP        : Factor w/ 5 levels "G","M","N","S",..: 5 3 3 3 4 4 4 3 4 4 ...
##  $ ARCTYP             : Factor w/ 190 levels "","A124","A139",..: 3 39 39 39 154 11 42 39 12 12 ...
##  $ AC_CLASS           : Factor w/ 8 levels "H","HEL","LJ",..: 2 6 6 6 1 1 1 6 1 1 ...
##  $ AP_C_RWY           : Factor w/ 13 levels "04","06","09",..: 13 6 6 6 6 6 6 8 6 6 ...
##  $ AP_C_STND          : Factor w/ 237 levels "A31","A32","A33",..: 148 80 81 75 121 125 124 133 110 111 ...
##  $ SRC_PHASE          : Factor w/ 2 levels "ARR","DEP": 1 1 1 1 1 1 1 2 1 1 ...
##  $ PREV_BLOCK_TIME_UTC: Factor w/ 195451 levels "","2013/01/01 03:54:32",..: 1 1 1 1 1 3 1 2 1 1 ...
# coerce times to timestamps
data$MVT_TIME_UTC <- as.POSIXlt(data$MVT_TIME_UTC, tz = "GMT")
data$BLOCK_TIME_UTC <- as.POSIXlt(data$BLOCK_TIME_UTC, tz = "GMT")
data$SCHED_TIME_UTC <- as.POSIXlt(data$SCHED_TIME_UTC, tz = "GMT")

# coerce PREV_BLOCK_TIME
isEmpty <- data$PREV_BLOCK_TIME_UTC == "" # logical vector
data$PREV_BLOCK_TIME_UTC[isEmpty] <- NA   # set empty times to NA
data$PREV_BLOCK_TIME_UTC <- as.POSIXlt(data$PREV_BLOCK_TIME_UTC, tz = "GMT")

Preparatory Variable Calculation

Block delay. We define the block-delay as the difference between the scheduled block time and the actual block time. This equally applies for arrivals and departures. E.g for arrivals: SCHED_TIME_UTC = SIBT for arrival and SOBT for departure (slot time) BLOCK_TIME_UTC = AIBT for arrival and AOBT for departure

AIBT - SIBT > 0 is a late arrival; AIBT - SIBT <= 0 an early arrival.

# block delay (in minutes)
data$BLOCK_DLY <- (data$BLOCK_TIME_UTC - data$SCHED_TIME_UTC)/60

Create an Airline Identifier

The Airline Identifier is to be extracted from the flight number AP_C_FLTID. It is assumed that the 3st three charaters are descriptive for the Airline name.

# define airline ID

data$Airline_ID <- substr(data$AP_C_FLTID,1,3)

## 

Trimmed dataset

Scope the dataset to a certain time period (Summer season 2013)

# a few days of traffic
start <- as.POSIXlt("2013-04-01 00:00:00", tz = "GMT")
end   <- as.POSIXlt("2013-10-31 23:59:59", tz = "GMT")

trimmed <- data[data$MVT_TIME_UTC >= start & data$MVT_TIME_UTC <= end,]

Selection of the Airport pair - Intercontinental Hubs

For the next chunk we re-use the pairs subset (run chunks from above to have pairs dataframe generated). The extract the EHAM - xxxx city pair

city   <- c("EDDB","EDDM","LTBA","LIRF","LEBL","LSZH","LOWW","EGKK","EKCH","ENGM","EBBR","ESSA","LIMC","EGCC","LPPT")

sourcephase <- c("DEP","ARR")

# logical vector for subsetting
isOutboundTo <- trimmed$ADES_ICAO %in% city
isInboundFm  <- trimmed$ADEP_ICAO %in% city

EHAM_outbound <- trimmed[isOutboundTo,]
EHAM_inbound <- trimmed[isInboundFm,]

table(EHAM_outbound$SRC_PHASE)
## 
##   ARR   DEP 
##     0 25770
table(EHAM_outbound$Airline_ID)
## 
##   AAB   ABW   ADN   AHO   AJU   AOJ   ATL   AUA   AUF   AWC   AZA   BAW 
##     5     4     4     5     1     4     1   853     1     1   642   667 
##   BCY   BEE   BEL   BGM   BID   BLJ   BMW   BOO   BPA   CAI   CAZ   CCS 
##     1     1     1     1     2     1     2     2     2     6     1     1 
##   CGS   CGX   CND   CSD   CTM   DAF   DAJ   DAR   DCA   DHK   DIS   DLH 
##     1     1     2     1     1     1     2     1     1     1     2  1311 
##   EDC   EFD   ENT   EXS   EXU   EZY   FDX   FHY   FLI   FPO   FTY   FYG 
##     1     3     3     3     1  3257    52     1     1     1     1     2 
##   GAC   GDK   GHC   GMI   GSJ   GSX   GXX   HAY   HBJ   HBL   HBV   HRN 
##     3     1     4     1     1     1     1     1     2     1     2     2 
##   HTM   HYR   IBS   ICN   ING   JAF   JAR   JEI   JNL   KLM   LMJ   LNX 
##     3     1     1     1     1    20     1     2    19 12537     1     2 
##   LXA   LXJ   LZB   MAS   MAY   MBT   MDT   MGO   MHV   MJJ   MMD   MNB 
##     1     2     1     1     1     2     1     1     1     1     1    44 
##   MND   MON   N12   N24   N25   N28   N32   N35   N40   N50   N51   N52 
##     1     6     1     7     1     1     1     1     1     1     1     1 
##   N59   N71   N80   N88   N92   NAF   NAX   NCA   NJE   NLV   NLY   NOS 
##     1     2     1     2     1     1   553     2    36     1     7     2 
##   OBS   OEF   OEH   OHY   OOA   OOC   OOE   OOS   OPJ   P4B   PAN   PAV 
##     1     3     1     9     7     1     1     1     1     1     1     2 
##   PGT   PHA   PHI   PHM   PHS   PJS   PNC   PRD   PVG   QGA   QTR   RAM 
##     4     1     2     3     1     1     1     1     2     3     1     1 
##   RBB   REN   RRR   RZO   SAS   SAZ   SLM   SSR   SSZ   STQ   SWR   SXN 
##     1     1     1     2  1462     1     3     1     1     1   854     1 
##   TAP   TCC   TCK   TCS   TCW   TCX   TFL   THY   TRA   TUI   TVS   VCG 
##   626     1     2     1     4     3    22   862   896     1     3     1 
##   VHM   VJS   VKG   VLG   VM9   VPB   VPC   VVV   WHT   XAB   XLF   XPE 
##     3     1     2   792     1     1     2     1     3     1     2     2

Visualize the statistics

Below statistics are determined for all airlines, as well as per individual airline

for (i in 1:15)
  {


  hist(as.numeric(EHAM_outbound$BLOCK_DLY[EHAM_outbound$ADES_ICAO == city[i] ]),
     freq=NULL,
     right=TRUE,
     col="red",
     plot=TRUE,
     labels=TRUE,
     main = paste("EHAM Departure Slot Variability to ",city[i],
                 "\n Mean",mean(as.numeric(EHAM_outbound$BLOCK_DLY[EHAM_outbound$ADES_ICAO == city[i] ])),
                 "\n SD"  ,sd(as.numeric(EHAM_outbound$BLOCK_DLY[EHAM_outbound$ADES_ICAO == city[i] ]))
                 ),
     xlab="Block Delay\n[Pos. is late]",
     breaks=150,
     xlim=range(-50,50))

  hist(as.numeric(EHAM_inbound$BLOCK_DLY[EHAM_inbound$ADEP_ICAO == city[i] ]),
     freq=NULL,
     right=TRUE,
     col="blue",
     plot=TRUE,
     labels=TRUE,
     main = paste("EHAM Arrival Slot Variability from ", city[i],
                  "\n Mean",mean(as.numeric(EHAM_inbound$BLOCK_DLY[EHAM_inbound$ADEP_ICAO == city[i] ])),
                  "\n SD",sd(as.numeric(EHAM_inbound$BLOCK_DLY[EHAM_inbound$ADEP_ICAO == city[i] ]))                  ),
     xlab="Block Delay [Pos. is late]",
     breaks=150,
     xlim=range(-50,50))

  # plot block delays
  plot(EHAM_outbound$SCHED_TIME_UTC, EHAM_outbound$BLOCK_DLY, 
     type = "n",                   # blank plot for range / dimensions
     main = paste("EHAM Departure Slot Variability to ", city[i]),
     xlab = "Date",    # a nice x-axis label
     ylim = range(-75,75)
     )

    points(subset(EHAM_outbound, 
              ADES_ICAO == city[i],
                            select = c(BLOCK_TIME_UTC, BLOCK_DLY)
              ),
       col = "red"
       )

  # plot block delays
  plot(EHAM_inbound$SCHED_TIME_UTC, EHAM_inbound$BLOCK_DLY, 
     type = "n",                   # blank plot for range / dimensions
     main = paste("EHAM Arrival Slot Variability from ", city[i]),
     xlab = "Date",    # a nice x-axis label
     ylim = range(-75,75)
     )

    points(subset(EHAM_inbound, 
              ADEP_ICAO == city[i],
                            select = c(BLOCK_TIME_UTC, BLOCK_DLY)
              ),
       col = "blue"
       )


}

plot of chunk runs_stats plot of chunk runs_stats plot of chunk runs_stats plot of chunk runs_stats plot of chunk runs_stats plot of chunk runs_stats plot of chunk runs_stats plot of chunk runs_stats plot of chunk runs_stats plot of chunk runs_stats plot of chunk runs_stats plot of chunk runs_stats plot of chunk runs_stats plot of chunk runs_stats plot of chunk runs_stats plot of chunk runs_stats plot of chunk runs_stats plot of chunk runs_stats plot of chunk runs_stats plot of chunk runs_stats plot of chunk runs_stats plot of chunk runs_stats plot of chunk runs_stats plot of chunk runs_stats plot of chunk runs_stats plot of chunk runs_stats plot of chunk runs_stats plot of chunk runs_stats plot of chunk runs_stats plot of chunk runs_stats plot of chunk runs_stats plot of chunk runs_stats plot of chunk runs_stats plot of chunk runs_stats plot of chunk runs_stats plot of chunk runs_stats plot of chunk runs_stats plot of chunk runs_stats plot of chunk runs_stats plot of chunk runs_stats plot of chunk runs_stats plot of chunk runs_stats plot of chunk runs_stats plot of chunk runs_stats plot of chunk runs_stats plot of chunk runs_stats plot of chunk runs_stats plot of chunk runs_stats plot of chunk runs_stats plot of chunk runs_stats plot of chunk runs_stats plot of chunk runs_stats plot of chunk runs_stats plot of chunk runs_stats plot of chunk runs_stats plot of chunk runs_stats plot of chunk runs_stats plot of chunk runs_stats plot of chunk runs_stats plot of chunk runs_stats

Appendices

System Information

Sys.info()
##                      sysname                      release 
##                    "Windows"                         "XP" 
##                      version                     nodename 
## "build 2600, Service Pack 3"                  "KLM007683" 
##                      machine                        login 
##                        "x86"                   "klm35521" 
##                         user               effective_user 
##                   "klm35521"                   "klm35521"