Introduction

This document is a first part of the exploratory analysis report regarding flight arrival and departure details in different USA airports.

Problem statement

Given the data about flight arrival and departure details in different airports, provide an understanding and insight about airline traffic, its evolution and problems.

Initial preparation - loading necessary packages and importing data

# Loading libraries

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(readr)
library(ggplot2)
library(tidyr)
library(data.table)
## -------------------------------------------------------------------------
## data.table + dplyr code now lives in dtplyr.
## Please library(dtplyr)!
## -------------------------------------------------------------------------
## 
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
## 
##     between, last
# Importing the data

# Let's make our life easier by setting the folder containing .csv files as our working directory as this simplifies 
# the procedure of reading in all the data into a single data frame.

setwd("C:/Users/User/Desktop/Microsoft_DataScience/AirportData/merge")


file_list <- list.files(pattern ="*.csv") # create list of all .csv files in folder

if(!exists("airp_data")){

airp_data  <- rbindlist(lapply(file_list, function(file) {
  dt = fread(file)
}))
}
## 
Read 91.8% of 610037 rows
Read 610037 rows and 30 (of 30) columns from 0.066 GB file in 00:00:03
## 
Read 79.1% of 606731 rows
Read 606731 rows and 30 (of 30) columns from 0.065 GB file in 00:00:03
## 
Read 83.5% of 623107 rows
Read 623107 rows and 30 (of 30) columns from 0.067 GB file in 00:00:03
## 
Read 85.4% of 609195 rows
Read 609195 rows and 30 (of 30) columns from 0.065 GB file in 00:00:03
## 
Read 83.6% of 586197 rows
Read 586197 rows and 30 (of 30) columns from 0.063 GB file in 00:00:03
## 
Read 68.4% of 628732 rows
Read 628732 rows and 30 (of 30) columns from 0.067 GB file in 00:00:03
## 
Read 55.4% of 631609 rows
Read 631609 rows and 30 (of 30) columns from 0.068 GB file in 00:00:03
gc() # Force garbage collection to free up RAM
##             used   (Mb) gc trigger    (Mb)   max used   (Mb)
## Ncells   7957046  425.0   12002346   641.0    8048768  429.9
## Vcells 648648047 4948.8 1328556216 10136.1 1293052099 9865.3
airp_data <- tbl_df(airp_data) # formating as tbl as I'm going to use 'dplyr' for data wrangling and analysis 

# also we don't need the first column which is only the ordinal number of the given observation
airp_data <- airp_data[-1]

# Check the data to see whether it looks appropriate 
head(airp_data, 10)
## # A tibble: 10 x 29
##     Year Month DayofMonth DayOfWeek DepTime CRSDepTime ArrTime CRSArrTime
##    <int> <int>      <int>     <int>   <int>      <int>   <int>      <int>
## 1   2003     1         29         3    1651       1655    1912       1913
## 2   2003     1         30         4    1654       1655    1910       1913
## 3   2003     1         31         5    1724       1655    1936       1913
## 4   2003     1          1         3    1033       1035    1625       1634
## 5   2003     1          2         4    1053       1035    1726       1634
## 6   2003     1          3         5    1031       1035    1640       1634
## 7   2003     1          4         6    1031       1035    1626       1634
## 8   2003     1          5         7    1035       1035    1636       1634
## 9   2003     1          6         1    1031       1035    1653       1634
## 10  2003     1          1         3    1713       1710    1851       1847
## # ... with 21 more variables: UniqueCarrier <chr>, FlightNum <int>,
## #   TailNum <chr>, ActualElapsedTime <int>, CRSElapsedTime <int>,
## #   AirTime <int>, ArrDelay <int>, DepDelay <int>, Origin <chr>,
## #   Dest <chr>, Distance <int>, TaxiIn <int>, TaxiOut <int>,
## #   Cancelled <int>, CancellationCode <chr>, Diverted <int>,
## #   CarrierDelay <int>, WeatherDelay <int>, NASDelay <int>,
## #   SecurityDelay <int>, LateAircraftDelay <int>
tail(airp_data, 10)
## # A tibble: 10 x 29
##     Year Month DayofMonth DayOfWeek DepTime CRSDepTime ArrTime CRSArrTime
##    <int> <int>      <int>     <int>   <int>      <int>   <int>      <int>
## 1   2007     9         17         1    1913       1914    2026       2024
## 2   2007     9         17         1    1556       1558    1612       1619
## 3   2007     9         17         1     759        800    1505       1459
## 4   2007     9         17         1    1300       1300    1443       1442
## 5   2007     9         17         1     906        835    1035       1000
## 6   2007     9         17         1     857        859    1042       1050
## 7   2007     9         17         1     606        615     801        818
## 8   2007     9         17         1     908        910     910        928
## 9   2007     9         17         1    1512       1520    1648       1705
## 10  2007     9         17         1     631        635    1346       1357
## # ... with 21 more variables: UniqueCarrier <chr>, FlightNum <int>,
## #   TailNum <chr>, ActualElapsedTime <int>, CRSElapsedTime <int>,
## #   AirTime <int>, ArrDelay <int>, DepDelay <int>, Origin <chr>,
## #   Dest <chr>, Distance <int>, TaxiIn <int>, TaxiOut <int>,
## #   Cancelled <int>, CancellationCode <chr>, Diverted <int>,
## #   CarrierDelay <int>, WeatherDelay <int>, NASDelay <int>,
## #   SecurityDelay <int>, LateAircraftDelay <int>
str(airp_data)
## Classes 'tbl_df', 'tbl' and 'data.frame':    35353543 obs. of  29 variables:
##  $ Year             : int  2003 2003 2003 2003 2003 2003 2003 2003 2003 2003 ...
##  $ Month            : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ DayofMonth       : int  29 30 31 1 2 3 4 5 6 1 ...
##  $ DayOfWeek        : int  3 4 5 3 4 5 6 7 1 3 ...
##  $ DepTime          : int  1651 1654 1724 1033 1053 1031 1031 1035 1031 1713 ...
##  $ CRSDepTime       : int  1655 1655 1655 1035 1035 1035 1035 1035 1035 1710 ...
##  $ ArrTime          : int  1912 1910 1936 1625 1726 1640 1626 1636 1653 1851 ...
##  $ CRSArrTime       : int  1913 1913 1913 1634 1634 1634 1634 1634 1634 1847 ...
##  $ UniqueCarrier    : chr  "UA" "UA" "UA" "UA" ...
##  $ FlightNum        : int  1017 1017 1017 1018 1018 1018 1018 1018 1018 1020 ...
##  $ TailNum          : chr  "N202UA" "N311UA" "N317UA" "N409UA" ...
##  $ ActualElapsedTime: int  141 136 132 232 273 249 235 241 262 98 ...
##  $ CRSElapsedTime   : int  138 138 138 239 239 239 239 239 239 97 ...
##  $ AirTime          : int  119 108 110 215 214 223 219 227 241 62 ...
##  $ ArrDelay         : int  -1 -3 23 -9 52 6 -8 2 19 4 ...
##  $ DepDelay         : int  -4 -1 29 -2 18 -4 -4 0 -4 3 ...
##  $ Origin           : chr  "ORD" "ORD" "ORD" "OAK" ...
##  $ Dest             : chr  "MSY" "MSY" "MSY" "ORD" ...
##  $ Distance         : int  837 837 837 1835 1835 1835 1835 1835 1835 413 ...
##  $ TaxiIn           : int  5 2 5 6 13 13 5 5 7 7 ...
##  $ TaxiOut          : int  17 26 17 11 46 13 11 9 14 29 ...
##  $ Cancelled        : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ CancellationCode : chr  NA NA NA NA ...
##  $ Diverted         : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ CarrierDelay     : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ WeatherDelay     : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ NASDelay         : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ SecurityDelay    : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ LateAircraftDelay: int  NA NA NA NA NA NA NA NA NA NA ...
airp_data$DayofMonth[552109]
## [1] 17
airp_data$DayofMonth[552110]
## [1] 21
summary(airp_data$Distance) # Values seem logical so we'll say that data seems fine
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     6.0   310.0   556.0   719.5   948.0  4962.0
gc()
##             used   (Mb) gc trigger    (Mb)   max used   (Mb)
## Ncells    514230   27.5    9601876   512.8    8263341  441.4
## Vcells 605845051 4622.3 1328556216 10136.1 1293052099 9865.3

Answers to the given questions

Questions 1 and 2

1. What is the average flight delay? How are flight delays distributed? 2. What is the median flight delay? How different is it from average flight delay? Whatconclusions can be drawn from that?

delays <- select(airp_data,ends_with("Delay")) # select columns wich contain time values in minutes for differnt types                                                   of delays
head(delays) 
## # A tibble: 6 x 7
##   ArrDelay DepDelay CarrierDelay WeatherDelay NASDelay SecurityDelay
##      <int>    <int>        <int>        <int>    <int>         <int>
## 1       -1       -4           NA           NA       NA            NA
## 2       -3       -1           NA           NA       NA            NA
## 3       23       29           NA           NA       NA            NA
## 4       -9       -2           NA           NA       NA            NA
## 5       52       18           NA           NA       NA            NA
## 6        6       -4           NA           NA       NA            NA
## # ... with 1 more variables: LateAircraftDelay <int>
# Let's right away how five number summary of delay times according to different types looks like
summary(delays, na.rm=TRUE)
##     ArrDelay          DepDelay        CarrierDelay      WeatherDelay    
##  Min.   :-1302.0   Min.   :-1410.0   Min.   :   0.0    Min.   :   0.0   
##  1st Qu.:   -9.0   1st Qu.:   -4.0   1st Qu.:   0.0    1st Qu.:   0.0   
##  Median :   -1.0   Median :    0.0   Median :   0.0    Median :   0.0   
##  Mean   :    7.3   Mean   :    8.7   Mean   :   3.1    Mean   :   0.7   
##  3rd Qu.:   11.0   3rd Qu.:    7.0   3rd Qu.:   0.0    3rd Qu.:   0.0   
##  Max.   : 2598.0   Max.   : 2601.0   Max.   :2580.0    Max.   :1510.0   
##  NA's   :718197    NA's   :645638    NA's   :2672742   NA's   :2672742  
##     NASDelay       SecurityDelay     LateAircraftDelay
##  Min.   : -60.0    Min.   :  0       Min.   :   0     
##  1st Qu.:   0.0    1st Qu.:  0       1st Qu.:   0     
##  Median :   0.0    Median :  0       Median :   0     
##  Mean   :   3.5    Mean   :  0       Mean   :   4     
##  3rd Qu.:   0.0    3rd Qu.:  0       3rd Qu.:   0     
##  Max.   :1392.0    Max.   :533       Max.   :1407     
##  NA's   :2672742   NA's   :2672742   NA's   :2672742
# Long dataset format would be more appropriate for further analysis and graphic representations exploiting ggplot2.
# It is also convinient to add a column which holds dealy types as factors for the purpose mentioned above.
delays_long<- gather(delays, delay_type, duration) %>% mutate(delay_type_factor = factor(delay_type))
head(delays_long)
## # A tibble: 6 x 3
##   delay_type duration delay_type_factor
##        <chr>    <int>            <fctr>
## 1   ArrDelay       -1          ArrDelay
## 2   ArrDelay       -3          ArrDelay
## 3   ArrDelay       23          ArrDelay
## 4   ArrDelay       -9          ArrDelay
## 5   ArrDelay       52          ArrDelay
## 6   ArrDelay        6          ArrDelay
rm(delays) # free the RAM
gc()
##              used   (Mb) gc trigger    (Mb)   max used    (Mb)
## Ncells     520363   27.8    6145200   328.2    8263341   441.4
## Vcells 1100801834 8398.5 1643352791 12537.8 1616711650 12334.6
# Let's see in a bit tidier manner what are the average and median values for
# delay times (in minutes) according to delay type. 

delays_long %>% filter(!is.na(duration)) %>%
                         group_by(delay_type) %>% 
                         summarise(average_delay = mean(duration), median_delay=median(duration))
## # A tibble: 7 x 3
##          delay_type average_delay median_delay
##               <chr>         <dbl>        <dbl>
## 1          ArrDelay    7.32170887           -1
## 2      CarrierDelay    3.11825163            0
## 3          DepDelay    8.74566823            0
## 4 LateAircraftDelay    4.00899917            0
## 5          NASDelay    3.49417901            0
## 6     SecurityDelay    0.02445485            0
## 7      WeatherDelay    0.68333383            0
gc()
##              used   (Mb) gc trigger    (Mb)   max used    (Mb)
## Ncells     521203   27.9    3146342   168.1    8263341   441.4
## Vcells 1100802431 8398.5 2434051258 18570.4 2403246382 18335.4

As you can see median delay time for all delay types is 0 except for ArrDelay which stands for arrival delay, in minutes. This due to the fact that this type of delay can have negative values which reflect the fact that airplane arrived earlier than expected. All other zero median values tell us that more than 50% of all these delays is equal to 0, i.e., there was no delay of the given type.

Furhter we are going to give visual representation of distributions of delay times according to delay type with two graphs namely box-plot and density plot.

# Graphic representation of delay time distributions by delay type
for_plot<- delays_long %>% filter(!is.na(duration))

rm(delays_long) # intensive memory menagement is needed due to the shear size of data sets
gc()
##              used   (Mb) gc trigger    (Mb)   max used    (Mb)
## Ncells     521063   27.9    2517073   134.5    8263341   441.4
## Vcells 1071347755 8173.8 2434051258 18570.4 2403246382 18335.4
# Throwing away outliers before plotting

filter(for_plot, duration > 0 & duration < 200) %>% ggplot(aes(x = delay_type_factor, y = duration,                                                                                fill= delay_type_factor)) + 
                                                            geom_boxplot()+
                                                            labs(title="Distribution of delay times according to delay                                                                         type - box-plots",
                                                                        x="Delay type", y="Duration")

filter(for_plot, duration > 0 & duration < 300) %>% ggplot(aes(x=duration, fill=delay_type_factor,                                                                                linetype=delay_type_factor)) + 
                                                           geom_density(alpha=.3) + 
                                                           labs(title="Distribution of delay times according to delay                                                                         type - density plots",
                                                                       x="Duration", y="Density") 

Let’s see what is the overall average and median delay time:

summarise(for_plot, average_delay=mean(duration), median_delay=median(duration)) 
## # A tibble: 1 x 2
##   average_delay median_delay
##           <dbl>        <dbl>
## 1      3.984501            0

As can be seen overall median delay time is 0 min, while overall average delay time is around 4 minutes. This means that overall more than 50% of all delays is equal to zero. The overall average delay of 4 minutes reflects the fact that although there are also some negative values of delays, which belog to ArrDelay type, there is a sufficcient number of outliers, i.e. relatively large values of delay, to give this average value.

Question 3.

Find a linear approximation of total yearly number of flights. Estimate the error of your approximation.

This question will be treated both in terms of linear modeling, as well as classical theory of measurement.

flights_year<-airp_data %>% group_by(Year)%>%summarise(num=n())

year=c(2003, 2004, 2005, 2006, 2007)

plot(year,flights_year$num/10000, xlab = 'Year', ylab = 'Yearly number of flights (in 10000s)', main = 'Linear approximation of #flights/year')

lin_mod <- lm(flights_year$num/10000~year)

abline(lin_mod, col= 'red')

summary(lin_mod)
## 
## Call:
## lm(formula = flights_year$num/10000 ~ year)
## 
## Residuals:
##        1        2        3        4        5 
## -19.3768  25.2762   6.9887 -12.2987  -0.5894 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)  
## (Intercept) -38230.069  12758.657  -2.996   0.0578 .
## year            19.420      6.363   3.052   0.0553 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 20.12 on 3 degrees of freedom
## Multiple R-squared:  0.7564, Adjusted R-squared:  0.6752 
## F-statistic: 9.314 on 1 and 3 DF,  p-value: 0.05535
# So residual standard error is 20.12, while multiple R-squared is 0.7564, which is not bad for this simple model.

gc()
##              used   (Mb) gc trigger    (Mb)   max used    (Mb)
## Ncells     591216   31.6    2013658   107.6    8263341   441.4
## Vcells 1148772921 8764.5 2434051258 18570.4 2432975104 18562.2
# If you think in terms of measurement theory standard error of measurement is defined 
# as (standard deviation of measured values)/sqrt(# of measurements taken)

average_num_flights<-mean(flights_year$num/10000) #average measured value

average_num_flights 
## [1] 707.0709
std_flights<-sd(flights_year$num/10000) #standard deviation of repeated measurements

std_flights 
## [1] 35.30639
SE<-std_flights/sqrt(nrow(flights_year)) # standard error of measurement

SE #value of SE of 5 measurements (in 10000s) - as expected is smaller but comperable to SE previously obtained from the linear model
## [1] 15.7895

Question 4.

Which carrier is the most/least late on average? Which carrier has the lowest/highest variance of delays?

str(airp_data)
## Classes 'tbl_df', 'tbl' and 'data.frame':    35353543 obs. of  29 variables:
##  $ Year             : int  2003 2003 2003 2003 2003 2003 2003 2003 2003 2003 ...
##  $ Month            : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ DayofMonth       : int  29 30 31 1 2 3 4 5 6 1 ...
##  $ DayOfWeek        : int  3 4 5 3 4 5 6 7 1 3 ...
##  $ DepTime          : int  1651 1654 1724 1033 1053 1031 1031 1035 1031 1713 ...
##  $ CRSDepTime       : int  1655 1655 1655 1035 1035 1035 1035 1035 1035 1710 ...
##  $ ArrTime          : int  1912 1910 1936 1625 1726 1640 1626 1636 1653 1851 ...
##  $ CRSArrTime       : int  1913 1913 1913 1634 1634 1634 1634 1634 1634 1847 ...
##  $ UniqueCarrier    : chr  "UA" "UA" "UA" "UA" ...
##  $ FlightNum        : int  1017 1017 1017 1018 1018 1018 1018 1018 1018 1020 ...
##  $ TailNum          : chr  "N202UA" "N311UA" "N317UA" "N409UA" ...
##  $ ActualElapsedTime: int  141 136 132 232 273 249 235 241 262 98 ...
##  $ CRSElapsedTime   : int  138 138 138 239 239 239 239 239 239 97 ...
##  $ AirTime          : int  119 108 110 215 214 223 219 227 241 62 ...
##  $ ArrDelay         : int  -1 -3 23 -9 52 6 -8 2 19 4 ...
##  $ DepDelay         : int  -4 -1 29 -2 18 -4 -4 0 -4 3 ...
##  $ Origin           : chr  "ORD" "ORD" "ORD" "OAK" ...
##  $ Dest             : chr  "MSY" "MSY" "MSY" "ORD" ...
##  $ Distance         : int  837 837 837 1835 1835 1835 1835 1835 1835 413 ...
##  $ TaxiIn           : int  5 2 5 6 13 13 5 5 7 7 ...
##  $ TaxiOut          : int  17 26 17 11 46 13 11 9 14 29 ...
##  $ Cancelled        : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ CancellationCode : chr  NA NA NA NA ...
##  $ Diverted         : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ CarrierDelay     : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ WeatherDelay     : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ NASDelay         : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ SecurityDelay    : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ LateAircraftDelay: int  NA NA NA NA NA NA NA NA NA NA ...
summary <- airp_data %>% 
  select(UniqueCarrier, ends_with("Delay")) %>%
  gather(delay_type, value, -UniqueCarrier) %>% 
  filter(!is.na(value))%>%
  group_by(UniqueCarrier) %>% 
  summarise(average_late=mean(value), variance_delays=var(value)) %>% 
  summarise(most_late=max(average_late), least_late=min(average_late), 
             low_var=min(variance_delays), high_var=max(variance_delays))

summary
## # A tibble: 1 x 4
##   most_late least_late  low_var high_var
##       <dbl>      <dbl>    <dbl>    <dbl>
## 1  5.722392  0.1017696 127.6163 766.9082
gc()
##              used   (Mb) gc trigger    (Mb)   max used    (Mb)
## Ncells     590606   31.6    2013658   107.6    8263341   441.4
## Vcells 1148764463 8764.4 2967937229 22643.6 2908932643 22193.4
airp_data %>% 
  select(UniqueCarrier, ends_with("Delay")) %>%
  gather(delay_type, value, -UniqueCarrier) %>% 
  filter(!is.na(value))%>%
  group_by(UniqueCarrier) %>% 
  summarise(average_late=mean(value), variance_delays=var(value)) %>% 
  filter(average_late==summary$most_late | average_late==summary$least_late |
         variance_delays==summary$low_var | variance_delays==summary$high_var)
## # A tibble: 4 x 3
##   UniqueCarrier average_late variance_delays
##           <chr>        <dbl>           <dbl>
## 1            AQ    0.3110138        127.6163
## 2            EV    5.7223917        637.1085
## 3            HA    0.1017696        217.4109
## 4            YV    5.6020309        766.9082
if(!exists("carriers")) {
  carriers<-read_csv("C:/Users/User/Desktop/Microsoft_DataScience/AirportData/carriers.csv")
}

head(carriers)
## # A tibble: 6 x 2
##    Code                   Description
##   <chr>                         <chr>
## 1   02Q                 Titan Airways
## 2   04Q            Tradewind Aviation
## 3   05Q           Comlux Aviation, AG
## 4   06Q Master Top Linhas Aereas Ltd.
## 5   07Q           Flair Airlines Ltd.
## 6   09Q                Swift Air, LLC
filter(carriers, Code=="AQ" | Code=="EV" | Code=="HA" | Code=="YV")
## # A tibble: 4 x 2
##    Code                 Description
##   <chr>                       <chr>
## 1    AQ         Aloha Airlines Inc.
## 2    EV Atlantic Southeast Airlines
## 3    HA      Hawaiian Airlines Inc.
## 4    YV          Mesa Airlines Inc.
gc()
##              used   (Mb) gc trigger    (Mb)   max used    (Mb)
## Ncells     602978   32.3    2013658   107.6    8263341   441.4
## Vcells 1148797189 8764.7 2967937229 22643.6 2945932477 22475.7

So the carrier with the least average delay time is Hawaiian Airlines Inc.,the carrier which is most late on average is Atlantic Southeast Airlines, the carrier wiith the lowest variance on delay time is Aloha Airlines Inc. and the carrier with the highest variance on delay time is Mesa Airlines Inc.