Analyzing flight data

The following code contains (fictional) data from 200 flights to two competing R pirate conventions taking place in Basel and Zurich. The flights all came from one of two airlines: Air Blackbeard (AB) and Scimitar Air (SA). The data are separated into four vectors:

# Airline code of each flight: "AB" (Air Blackbeard) or "SA" (Scimitar Air).
# The later exercises combine this vector position-by-position with
# destination, delay and flight.num, so element i of each vector is treated
# as describing the same flight.
airline <- c("AB", "AB", "AB", "AB", "AB", "AB", "AB", "AB", "AB", "SA", 
"AB", "AB", "AB", "AB", "SA", "AB", "SA", "SA", "AB", "SA", "AB", 
"SA", "AB", "SA", "AB", "AB", "SA", "AB", "AB", "AB", "AB", "AB", 
"AB", "AB", "SA", "SA", "SA", "SA", "SA", "AB", "SA", "AB", "AB", 
"SA", "SA", "AB", "SA", "SA", "AB", "SA", "AB", "SA", "AB", "SA", 
"SA", "SA", "SA", "SA", "AB", "AB", "AB", "SA", "SA", "SA", "AB", 
"SA", "AB", "SA", "AB", "AB", "SA", "SA", "SA", "SA", "AB", "AB", 
"SA", "AB", "AB", "SA", "AB", "SA", "AB", "SA", "AB", "SA", "SA", 
"AB", "AB", "SA", "AB", "SA", "AB", "SA", "AB", "SA", "SA", "SA", 
"AB", "SA", "AB", "AB", "AB", "AB", "AB", "AB", "SA", "SA", "SA", 
"SA", "SA", "AB", "SA", "AB", "AB", "AB", "AB", "SA", "SA", "SA", 
"SA", "AB", "AB", "SA", "SA", "SA", "AB", "AB", "SA", "SA", "SA", 
"AB", "AB", "AB", "SA", "SA", "AB", "SA", "AB", "AB", "AB", "AB", 
"AB", "SA", "SA", "SA", "AB", "SA", "SA", "SA", "AB", "SA", "SA", 
"AB", "AB", "AB", "SA", "AB", "AB", "SA", "SA", "AB", "SA", "SA", 
"AB", "SA", "AB", "AB", "SA", "SA", "SA", "SA", "AB", "AB", "SA", 
"SA", "AB", "SA", "AB", "SA", "AB", "AB", "SA", "SA", "SA", "SA", 
"SA", "AB", "SA", "SA", "AB", "AB", "AB", "AB", "SA", "SA", "AB", 
"SA", "SA", "SA")

# Destination code of each flight: "ZRH" (Zurich) or "BSL" (Basel).
# Used position-by-position alongside airline, delay and flight.num.
destination <- c("ZRH", "ZRH", "ZRH", "ZRH", "ZRH", "ZRH", "ZRH", "ZRH", "BSL", 
"BSL", "BSL", "ZRH", "ZRH", "ZRH", "BSL", "ZRH", "BSL", "BSL", 
"ZRH", "BSL", "ZRH", "BSL", "ZRH", "BSL", "ZRH", "BSL", "BSL", 
"ZRH", "ZRH", "ZRH", "ZRH", "ZRH", "ZRH", "ZRH", "BSL", "BSL", 
"BSL", "BSL", "ZRH", "ZRH", "BSL", "ZRH", "BSL", "BSL", "BSL", 
"ZRH", "BSL", "BSL", "ZRH", "ZRH", "BSL", "BSL", "BSL", "BSL", 
"BSL", "ZRH", "ZRH", "ZRH", "ZRH", "ZRH", "BSL", "ZRH", "BSL", 
"BSL", "ZRH", "ZRH", "ZRH", "BSL", "ZRH", "BSL", "BSL", "BSL", 
"BSL", "BSL", "ZRH", "BSL", "BSL", "ZRH", "ZRH", "ZRH", "ZRH", 
"BSL", "ZRH", "BSL", "ZRH", "ZRH", "BSL", "ZRH", "ZRH", "BSL", 
"BSL", "BSL", "ZRH", "ZRH", "ZRH", "BSL", "BSL", "BSL", "BSL", 
"BSL", "ZRH", "ZRH", "ZRH", "ZRH", "ZRH", "ZRH", "BSL", "BSL", 
"BSL", "ZRH", "BSL", "ZRH", "BSL", "ZRH", "ZRH", "BSL", "ZRH", 
"ZRH", "BSL", "ZRH", "ZRH", "ZRH", "BSL", "ZRH", "BSL", "BSL", 
"ZRH", "ZRH", "BSL", "ZRH", "BSL", "ZRH", "BSL", "ZRH", "BSL", 
"BSL", "ZRH", "BSL", "BSL", "ZRH", "ZRH", "ZRH", "ZRH", "ZRH", 
"ZRH", "BSL", "ZRH", "BSL", "BSL", "BSL", "ZRH", "BSL", "BSL", 
"ZRH", "BSL", "BSL", "BSL", "ZRH", "ZRH", "BSL", "BSL", "ZRH", 
"BSL", "BSL", "ZRH", "BSL", "BSL", "ZRH", "BSL", "BSL", "ZRH", 
"BSL", "BSL", "ZRH", "BSL", "BSL", "ZRH", "BSL", "ZRH", "BSL", 
"BSL", "ZRH", "BSL", "BSL", "BSL", "BSL", "BSL", "ZRH", "BSL", 
"ZRH", "ZRH", "ZRH", "ZRH", "ZRH", "BSL", "BSL", "ZRH", "BSL", 
"BSL", "BSL")

# Delay of each flight in minutes. The exercises treat values of zero or
# less as "not late"; negative values presumably mean an early arrival.
# Used position-by-position alongside airline, destination and flight.num.
delay <- c(12, 10, 8, 12, 9, 8, 10, 11, 1, 3, 1, 10, 11, 10, 3, 10, 1, 
4, 11, 4, 9, 2, 10, 2, 9, -1, 3, 10, 12, 9, 10, 9, 9, 9, 3, 4, 
2, 4, 13, 11, 3, 11, 1, 4, 4, 10, 3, 3, 9, 14, 0, 3, 0, 4, 3, 
13, 11, 13, 10, 10, 1, 13, 2, 2, 11, 14, 8, 2, 10, 0, 3, 3, 3, 
1, 12, 0, 4, 11, 10, 14, 11, 3, 9, 4, 10, 12, 2, 9, 11, 4, 1, 
2, 8, 14, 11, 2, 3, 5, -1, 2, 9, 8, 10, 11, 10, 10, 2, 4, 3, 
13, 3, 10, 2, 12, 13, 1, 9, 13, 3, 14, 12, 10, 1, 14, 5, 3, 12, 
9, 3, 13, 3, 10, 0, 9, 4, 4, 10, 5, -1, 10, 11, 9, 10, 12, 13, 
4, 10, 2, 3, 4, 11, 2, 5, 11, 0, 1, 5, 8, 10, 2, 5, 11, 3, 3, 
10, 4, 0, 8, 3, 3, 14, 3, -1, 9, 3, 3, 11, 3, 8, 3, 1, 10, 2, 
1, 4, 4, 3, 9, 3, 14, 10, 9, 10, 10, 2, 2, 10, 3, 3, 5)

# Flight identifier of each flight, written as "<airline code>.<number>".
# Used position-by-position alongside airline, destination and delay.
flight.num <- c("AB.996", "AB.573", "AB.997", "AB.189", "AB.293", "AB.787", 
"AB.193", "AB.276", "AB.577", "SA.677", "AB.712", "AB.452", "AB.473", 
"AB.226", "SA.876", "AB.385", "SA.392", "SA.391", "AB.186", "SA.591", 
"AB.949", "SA.347", "AB.620", "SA.775", "AB.912", "AB.678", "SA.420", 
"AB.762", "AB.259", "AB.658", "AB.691", "AB.757", "AB.492", "AB.714", 
"SA.767", "SA.555", "SA.425", "SA.832", "SA.986", "AB.892", "SA.640", 
"AB.596", "AB.470", "SA.449", "SA.920", "AB.733", "SA.498", "SA.246", 
"AB.141", "SA.679", "AB.133", "SA.626", "AB.987", "SA.265", "SA.111", 
"SA.583", "SA.242", "SA.828", "AB.766", "AB.868", "AB.267", "SA.195", 
"SA.329", "SA.638", "AB.754", "SA.784", "AB.332", "SA.649", "AB.147", 
"AB.653", "SA.258", "SA.515", "SA.424", "SA.843", "AB.899", "AB.134", 
"SA.149", "AB.476", "AB.306", "SA.514", "AB.810", "SA.523", "AB.428", 
"SA.479", "AB.513", "SA.249", "SA.931", "AB.662", "AB.529", "SA.838", 
"AB.668", "SA.477", "AB.695", "SA.709", "AB.971", "SA.416", "SA.444", 
"SA.485", "AB.855", "SA.414", "AB.669", "AB.558", "AB.334", "AB.131", 
"AB.440", "AB.884", "SA.216", "SA.801", "SA.304", "SA.795", "SA.534", 
"AB.499", "SA.961", "AB.833", "AB.983", "AB.213", "AB.343", "SA.756", 
"SA.364", "SA.793", "SA.374", "AB.447", "AB.540", "SA.317", "SA.910", 
"SA.890", "AB.602", "AB.610", "SA.517", "SA.731", "SA.581", "AB.104", 
"AB.703", "AB.120", "SA.924", "SA.785", "AB.220", "SA.275", "AB.207", 
"AB.977", "AB.867", "AB.239", "AB.623", "SA.802", "SA.940", "SA.403", 
"AB.158", "SA.352", "SA.646", "SA.399", "AB.652", "SA.215", "SA.121", 
"AB.478", "AB.851", "AB.585", "SA.726", "AB.990", "AB.541", "SA.365", 
"SA.156", "AB.686", "SA.401", "SA.282", "AB.198", "SA.445", "AB.606", 
"AB.861", "SA.545", "SA.929", "SA.303", "SA.650", "AB.934", "AB.927", 
"SA.720", "SA.622", "AB.436", "SA.123", "AB.139", "SA.956", "AB.281", 
"AB.772", "SA.509", "SA.586", "SA.906", "SA.311", "SA.171", "AB.692", 
"SA.750", "SA.616", "AB.808", "AB.660", "AB.361", "AB.339", "SA.397", 
"SA.301", "AB.707", "SA.286", "SA.711", "SA.881")
  1. Store the data as vectors.

Numerical Indexing

    1. a) What was the delay of the 153rd flight on the list? b) What were the delays of the 5th, 32nd, and 88th flights on the list?
# A single positive index selects the element at that position.
delay[153]
## [1] 5
# An index vector selects several positions at once, in the order given.
delay[c(5, 32, 88)]
## [1] 9 9 9
  1. What were the airlines of the 10th through the 20th flights on the list?
# 10:20 expands to the consecutive positions 10, 11, ..., 20.
airline[10:20]
##  [1] "SA" "AB" "AB" "AB" "AB" "SA" "AB" "SA" "SA" "AB" "SA"
  1. What was the destination of the last flight on the list? (Hint: don’t write the indexing number directly; instead, index the vector using the length() function with the appropriate argument.)
# length() gives the position of the last element, so no index is hard-coded.
destination[length(destination)]
## [1] "BSL"

Logical Indexing on one variable

  1. How many flights were from each airline? a) Use the table() function. b) Use logical indexing
# a) table() counts how often each distinct value occurs.
table(airline)
## airline
##  AB  SA 
## 100 100
# b) A comparison yields a logical vector; sum() counts its TRUE values.
sum(airline == "AB")
## [1] 100
sum(airline == "SA")
## [1] 100
  1. How many flights were to each destination? a) Use the table() function. b) Use logical indexing
# a) table() counts how often each distinct value occurs.
table(destination)
## destination
## BSL ZRH 
## 100 100
# b) A comparison yields a logical vector; sum() counts its TRUE values.
sum(destination == "BSL")
## [1] 100
sum(destination == "ZRH")
## [1] 100
  1. How many flights were there from airline Air Blackbeard? Use logical indexing
# TRUE counts as 1 in sum(), so this is the number of "AB" entries.
sum(airline == "AB")
## [1] 100
  1. How many flights were there to Basel? Use logical indexing.
# TRUE counts as 1 in sum(), so this is the number of "BSL" entries.
sum(destination == "BSL")
## [1] 100
  1. What percent of flights had no delay?
# mean() of a logical vector is the proportion of TRUE values (0.055 = 5.5%).
# "No delay" is taken here as a delay of zero or less.
mean(delay <= 0)
## [1] 0.055
    1. a) What percent of flights had a delay greater than 10 minutes? b) What percent of flights had a delay greater than 5 minutes but no more than 10 minutes?
# a) Proportion of flights delayed by more than 10 minutes.
mean(delay > 10)
## [1] 0.215
# b) & combines the two conditions element-wise: more than 5, at most 10.
mean(delay > 5 & delay <= 10)
## [1] 0.285

Logical indexing and two variables

    1. a) What percent of all flights were Air Blackbeard flights to Basel? b) What was the standard deviation of the delays of these flights?
# Flag the Air Blackbeard flights to Basel once, then reuse the flag.
ab.bsl <- airline == "AB" & destination == "BSL"
# a) Share of all flights that match both conditions
mean(ab.bsl)
## [1] 0.1
# b) Spread of the delays of those flights
sd(delay[ab.bsl])
## [1] 0.7863975
    1. a) What percent of all flights were Scimitar Air flights to Zurich? b) What was the largest delay of these flights?
# Flag the Scimitar Air flights to Zurich once, then reuse the flag.
sa.zrh <- airline == "SA" & destination == "ZRH"
# a) Share of all flights that match both conditions
mean(sa.zrh)
## [1] 0.1
# b) Longest delay among those flights
max(delay[sa.zrh])
## [1] 14
  1. What percent of Air Blackbeard’s flights had a delay greater than 10 minutes?
# Keep only Air Blackbeard's delays, so the proportion below is taken over
# that airline's flights rather than over all flights.
delay.ab <- delay[airline == "AB"]
mean(delay.ab > 10)
## [1] 0.23
  1. What was the median delay of flights to Basel that had a delay of EITHER less than -5 minutes or greater than 5 minutes?
# The parentheses keep the either/or delay condition together before it is
# combined with the destination condition.
median(delay[destination == "BSL" & (delay < -5 | delay > 5)])
## [1] NA
# median() returns NA here because the selection is empty:
# there were no flights that satisfied these criteria!
# To show this, let's take the sum of the logical vector.
# If it's 0, this means there were no eligible flights.

sum(destination == "BSL" & (delay < -5 | delay > 5))
## [1] 0
  1. What were the flight numbers of all Scimitar Air flights that were late?
# A flight counts as late when its delay is above zero; the mask built from
# airline and delay is used to pick the matching entries of flight.num.
flight.num[airline == "SA" & delay > 0]
##   [1] "SA.677" "SA.876" "SA.392" "SA.391" "SA.591" "SA.347" "SA.775"
##   [8] "SA.420" "SA.767" "SA.555" "SA.425" "SA.832" "SA.986" "SA.640"
##  [15] "SA.449" "SA.920" "SA.498" "SA.246" "SA.679" "SA.626" "SA.265"
##  [22] "SA.111" "SA.583" "SA.242" "SA.828" "SA.195" "SA.329" "SA.638"
##  [29] "SA.784" "SA.649" "SA.258" "SA.515" "SA.424" "SA.843" "SA.149"
##  [36] "SA.514" "SA.523" "SA.479" "SA.249" "SA.931" "SA.838" "SA.477"
##  [43] "SA.709" "SA.416" "SA.444" "SA.485" "SA.414" "SA.216" "SA.801"
##  [50] "SA.304" "SA.795" "SA.534" "SA.961" "SA.756" "SA.364" "SA.793"
##  [57] "SA.374" "SA.317" "SA.910" "SA.890" "SA.517" "SA.731" "SA.581"
##  [64] "SA.924" "SA.785" "SA.275" "SA.802" "SA.940" "SA.403" "SA.352"
##  [71] "SA.646" "SA.399" "SA.215" "SA.121" "SA.726" "SA.365" "SA.156"
##  [78] "SA.401" "SA.282" "SA.445" "SA.545" "SA.929" "SA.303" "SA.650"
##  [85] "SA.720" "SA.622" "SA.123" "SA.956" "SA.509" "SA.586" "SA.906"
##  [92] "SA.311" "SA.171" "SA.750" "SA.616" "SA.397" "SA.301" "SA.286"
##  [99] "SA.711" "SA.881"

Solving a paradox…

    1. a) What was the mean delay of Air Blackbeard flights? b) What percent of these flights were NOT late?
# a) Mean delay over the Air Blackbeard flights only.
mean(delay[airline == "AB"])
## [1] 8.02
# b) "Not late" means a delay of zero or less.
mean(delay[airline == "AB"] <= 0)
## [1] 0.11
    1. a) What was the mean delay of Scimitar Air flights? b) What percent of these flights were NOT late?
# a) Mean delay over the Scimitar Air flights only.
mean(delay[airline == "SA"])
## [1] 5.11
# b) "Not late" means a delay of zero or less.
mean(delay[airline == "SA"] <= 0)
## [1] 0
    1. a) What was the mean delay of all flights to Basel? b) What percent of flights to Basel WERE late?
# a) Mean delay over the flights to Basel only.
mean(delay[destination == "BSL"])
## [1] 2.53
# b) "Late" means a delay above zero.
mean(delay[destination == "BSL"] > 0)
## [1] 0.89
    1. a) What was the mean delay of all flights to Zurich? b) What percent of flights to Zurich WERE late?
# a) Mean delay over the flights to Zurich only.
mean(delay[destination == "ZRH"])
## [1] 10.6
# b) "Late" means a delay above zero.
mean(delay[destination == "ZRH"] > 0)
## [1] 1
  1. Based on what you’ve learned so far, if you had to book an airline, which one would you choose?
# I'd pick Scimitar Air. Its mean delay is smaller than Air Blackbeard's.
  1. What was the mean delay of Air Blackbeard flights TO Basel?
# Both conditions must hold for a flight to be included in the mean.
mean(delay[airline == "AB" & destination == "BSL"])
## [1] 0.25
  1. What was the mean delay of Air Blackbeard flights TO Zurich?
# Both conditions must hold for a flight to be included in the mean.
mean(delay[airline == "AB" & destination == "ZRH"])
## [1] 9.9625
  1. What was the mean delay of Scimitar Air flights TO Basel?
# Both conditions must hold for a flight to be included in the mean.
mean(delay[airline == "SA" & destination == "BSL"])
## [1] 3.1
  1. What was the mean delay of Scimitar Air flights TO Zurich?
# Both conditions must hold for a flight to be included in the mean.
mean(delay[airline == "SA" & destination == "ZRH"])
## [1] 13.15
  1. Based on what you’ve learned now, if you had to book an airline, which one would you choose?
# I'd pick Air Blackbeard! Its delays are shorter on average to both cities

What you’ve just seen is an example of Simpson’s Paradox. If you want to learn more, check out the Wikipedia page.

# Crazy!

More fun!

  1. Oops, it turns out that Air Blackbeard was manipulating its data. In fact, all of its flights to Basel had an additional delay of 1 minute and all of its flights to Zurich had an additional delay of 3 minutes. Correct the data.
# Build each route mask once and reuse it on both sides of the assignment.
ab.bsl <- airline == "AB" & destination == "BSL"
ab.zrh <- airline == "AB" & destination == "ZRH"
# Add the extra delay per destination: 1 minute for Basel, 3 for Zurich.
delay[ab.bsl] <- delay[ab.bsl] + 1
delay[ab.zrh] <- delay[ab.zrh] + 3

Run the following code to screw up some of the vector values:

# Deliberately corrupt four randomly chosen entries of each vector so that the
# cleaning exercises that follow have something to fix. No seed is set, so the
# affected positions differ from run to run.
# Positions come from seq_along() rather than 1:length(), and each vector is
# sampled over its own positions (airline is not indexed via length(delay)).
delay[sample(seq_along(delay), size = 4)] <-
  round(rnorm(n = 4, mean = -1000, sd = 5), 0)
airline[sample(seq_along(airline), size = 4)] <-
  c("jkfl;ads", "hiuf", "niav", "jfuhio")

Well great, some of the delay and airline values are now screwed up. Why did you do that?! You shouldn’t do everything you’re told. Now you have to fix it.

  1. Replace any delay values less than -30 min or greater than 30 minutes with NA values. Use logical indexing.
# A delay more than 30 minutes away from zero in either direction is treated
# as invalid; abs() expresses both bounds in a single logical mask.
delay[abs(delay) > 30] <- NA
  1. Replace any invalid airline values with NA. Use logical indexing.
# Anything other than the two known airline codes is invalid. The %in% test
# is negated directly with ! instead of being compared to FALSE. %in% never
# yields NA, so entries that are already NA are selected too and stay NA.
airline[!(airline %in% c("AB", "SA"))] <- NA