This document is a first part of the exploratory analysis report regarding flight arrival and departure details in different USA airports.
Given the data about flight arrival and departure details in different airports, provide an understanding and insight about airline traffic, its evolution and problems.
# Loading libraries
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(readr)
library(ggplot2)
library(tidyr)
library(data.table)
## -------------------------------------------------------------------------
## data.table + dplyr code now lives in dtplyr.
## Please library(dtplyr)!
## -------------------------------------------------------------------------
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
##
## between, last
# Importing the data
# Let's make our life easier by setting the folder containing .csv files as our working directory as this simplifies
# the procedure of reading in all the data into a single data frame.
setwd("C:/Users/User/Desktop/Microsoft_DataScience/AirportData/merge")
file_list <- list.files(pattern ="*.csv") # create list of all .csv files in folder
if(!exists("airp_data")){
airp_data <- rbindlist(lapply(file_list, function(file) {
dt = fread(file)
}))
}
##
Read 91.8% of 610037 rows
Read 610037 rows and 30 (of 30) columns from 0.066 GB file in 00:00:03
##
Read 79.1% of 606731 rows
Read 606731 rows and 30 (of 30) columns from 0.065 GB file in 00:00:03
##
Read 83.5% of 623107 rows
Read 623107 rows and 30 (of 30) columns from 0.067 GB file in 00:00:03
##
Read 85.4% of 609195 rows
Read 609195 rows and 30 (of 30) columns from 0.065 GB file in 00:00:03
##
Read 83.6% of 586197 rows
Read 586197 rows and 30 (of 30) columns from 0.063 GB file in 00:00:03
##
Read 68.4% of 628732 rows
Read 628732 rows and 30 (of 30) columns from 0.067 GB file in 00:00:03
##
Read 55.4% of 631609 rows
Read 631609 rows and 30 (of 30) columns from 0.068 GB file in 00:00:03
gc() # Force garbage collection to free up RAM
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 7957046 425.0 12002346 641.0 8048768 429.9
## Vcells 648648047 4948.8 1328556216 10136.1 1293052099 9865.3
airp_data <- tbl_df(airp_data) # formating as tbl as I'm going to use 'dplyr' for data wrangling and analysis
# also we don't need the first column which is only the ordinal number of the given observation
airp_data <- airp_data[-1]
# Check the data to see whether it looks appropriate
head(airp_data, 10)
## # A tibble: 10 x 29
## Year Month DayofMonth DayOfWeek DepTime CRSDepTime ArrTime CRSArrTime
## <int> <int> <int> <int> <int> <int> <int> <int>
## 1 2003 1 29 3 1651 1655 1912 1913
## 2 2003 1 30 4 1654 1655 1910 1913
## 3 2003 1 31 5 1724 1655 1936 1913
## 4 2003 1 1 3 1033 1035 1625 1634
## 5 2003 1 2 4 1053 1035 1726 1634
## 6 2003 1 3 5 1031 1035 1640 1634
## 7 2003 1 4 6 1031 1035 1626 1634
## 8 2003 1 5 7 1035 1035 1636 1634
## 9 2003 1 6 1 1031 1035 1653 1634
## 10 2003 1 1 3 1713 1710 1851 1847
## # ... with 21 more variables: UniqueCarrier <chr>, FlightNum <int>,
## # TailNum <chr>, ActualElapsedTime <int>, CRSElapsedTime <int>,
## # AirTime <int>, ArrDelay <int>, DepDelay <int>, Origin <chr>,
## # Dest <chr>, Distance <int>, TaxiIn <int>, TaxiOut <int>,
## # Cancelled <int>, CancellationCode <chr>, Diverted <int>,
## # CarrierDelay <int>, WeatherDelay <int>, NASDelay <int>,
## # SecurityDelay <int>, LateAircraftDelay <int>
tail(airp_data, 10)
## # A tibble: 10 x 29
## Year Month DayofMonth DayOfWeek DepTime CRSDepTime ArrTime CRSArrTime
## <int> <int> <int> <int> <int> <int> <int> <int>
## 1 2007 9 17 1 1913 1914 2026 2024
## 2 2007 9 17 1 1556 1558 1612 1619
## 3 2007 9 17 1 759 800 1505 1459
## 4 2007 9 17 1 1300 1300 1443 1442
## 5 2007 9 17 1 906 835 1035 1000
## 6 2007 9 17 1 857 859 1042 1050
## 7 2007 9 17 1 606 615 801 818
## 8 2007 9 17 1 908 910 910 928
## 9 2007 9 17 1 1512 1520 1648 1705
## 10 2007 9 17 1 631 635 1346 1357
## # ... with 21 more variables: UniqueCarrier <chr>, FlightNum <int>,
## # TailNum <chr>, ActualElapsedTime <int>, CRSElapsedTime <int>,
## # AirTime <int>, ArrDelay <int>, DepDelay <int>, Origin <chr>,
## # Dest <chr>, Distance <int>, TaxiIn <int>, TaxiOut <int>,
## # Cancelled <int>, CancellationCode <chr>, Diverted <int>,
## # CarrierDelay <int>, WeatherDelay <int>, NASDelay <int>,
## # SecurityDelay <int>, LateAircraftDelay <int>
str(airp_data)
## Classes 'tbl_df', 'tbl' and 'data.frame': 35353543 obs. of 29 variables:
## $ Year : int 2003 2003 2003 2003 2003 2003 2003 2003 2003 2003 ...
## $ Month : int 1 1 1 1 1 1 1 1 1 1 ...
## $ DayofMonth : int 29 30 31 1 2 3 4 5 6 1 ...
## $ DayOfWeek : int 3 4 5 3 4 5 6 7 1 3 ...
## $ DepTime : int 1651 1654 1724 1033 1053 1031 1031 1035 1031 1713 ...
## $ CRSDepTime : int 1655 1655 1655 1035 1035 1035 1035 1035 1035 1710 ...
## $ ArrTime : int 1912 1910 1936 1625 1726 1640 1626 1636 1653 1851 ...
## $ CRSArrTime : int 1913 1913 1913 1634 1634 1634 1634 1634 1634 1847 ...
## $ UniqueCarrier : chr "UA" "UA" "UA" "UA" ...
## $ FlightNum : int 1017 1017 1017 1018 1018 1018 1018 1018 1018 1020 ...
## $ TailNum : chr "N202UA" "N311UA" "N317UA" "N409UA" ...
## $ ActualElapsedTime: int 141 136 132 232 273 249 235 241 262 98 ...
## $ CRSElapsedTime : int 138 138 138 239 239 239 239 239 239 97 ...
## $ AirTime : int 119 108 110 215 214 223 219 227 241 62 ...
## $ ArrDelay : int -1 -3 23 -9 52 6 -8 2 19 4 ...
## $ DepDelay : int -4 -1 29 -2 18 -4 -4 0 -4 3 ...
## $ Origin : chr "ORD" "ORD" "ORD" "OAK" ...
## $ Dest : chr "MSY" "MSY" "MSY" "ORD" ...
## $ Distance : int 837 837 837 1835 1835 1835 1835 1835 1835 413 ...
## $ TaxiIn : int 5 2 5 6 13 13 5 5 7 7 ...
## $ TaxiOut : int 17 26 17 11 46 13 11 9 14 29 ...
## $ Cancelled : int 0 0 0 0 0 0 0 0 0 0 ...
## $ CancellationCode : chr NA NA NA NA ...
## $ Diverted : int 0 0 0 0 0 0 0 0 0 0 ...
## $ CarrierDelay : int NA NA NA NA NA NA NA NA NA NA ...
## $ WeatherDelay : int NA NA NA NA NA NA NA NA NA NA ...
## $ NASDelay : int NA NA NA NA NA NA NA NA NA NA ...
## $ SecurityDelay : int NA NA NA NA NA NA NA NA NA NA ...
## $ LateAircraftDelay: int NA NA NA NA NA NA NA NA NA NA ...
airp_data$DayofMonth[552109]
## [1] 17
airp_data$DayofMonth[552110]
## [1] 21
summary(airp_data$Distance) # Values seem logical so we'll say that data seems fine
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 6.0 310.0 556.0 719.5 948.0 4962.0
gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 514230 27.5 9601876 512.8 8263341 441.4
## Vcells 605845051 4622.3 1328556216 10136.1 1293052099 9865.3
1. What is the average flight delay? How are flight delays distributed? 2. What is the median flight delay? How different is it from average flight delay? Whatconclusions can be drawn from that?
delays <- select(airp_data,ends_with("Delay")) # select columns wich contain time values in minutes for differnt types of delays
head(delays)
## # A tibble: 6 x 7
## ArrDelay DepDelay CarrierDelay WeatherDelay NASDelay SecurityDelay
## <int> <int> <int> <int> <int> <int>
## 1 -1 -4 NA NA NA NA
## 2 -3 -1 NA NA NA NA
## 3 23 29 NA NA NA NA
## 4 -9 -2 NA NA NA NA
## 5 52 18 NA NA NA NA
## 6 6 -4 NA NA NA NA
## # ... with 1 more variables: LateAircraftDelay <int>
# Let's right away how five number summary of delay times according to different types looks like
summary(delays, na.rm=TRUE)
## ArrDelay DepDelay CarrierDelay WeatherDelay
## Min. :-1302.0 Min. :-1410.0 Min. : 0.0 Min. : 0.0
## 1st Qu.: -9.0 1st Qu.: -4.0 1st Qu.: 0.0 1st Qu.: 0.0
## Median : -1.0 Median : 0.0 Median : 0.0 Median : 0.0
## Mean : 7.3 Mean : 8.7 Mean : 3.1 Mean : 0.7
## 3rd Qu.: 11.0 3rd Qu.: 7.0 3rd Qu.: 0.0 3rd Qu.: 0.0
## Max. : 2598.0 Max. : 2601.0 Max. :2580.0 Max. :1510.0
## NA's :718197 NA's :645638 NA's :2672742 NA's :2672742
## NASDelay SecurityDelay LateAircraftDelay
## Min. : -60.0 Min. : 0 Min. : 0
## 1st Qu.: 0.0 1st Qu.: 0 1st Qu.: 0
## Median : 0.0 Median : 0 Median : 0
## Mean : 3.5 Mean : 0 Mean : 4
## 3rd Qu.: 0.0 3rd Qu.: 0 3rd Qu.: 0
## Max. :1392.0 Max. :533 Max. :1407
## NA's :2672742 NA's :2672742 NA's :2672742
# Long dataset format would be more appropriate for further analysis and graphic representations exploiting ggplot2.
# It is also convinient to add a column which holds dealy types as factors for the purpose mentioned above.
delays_long<- gather(delays, delay_type, duration) %>% mutate(delay_type_factor = factor(delay_type))
head(delays_long)
## # A tibble: 6 x 3
## delay_type duration delay_type_factor
## <chr> <int> <fctr>
## 1 ArrDelay -1 ArrDelay
## 2 ArrDelay -3 ArrDelay
## 3 ArrDelay 23 ArrDelay
## 4 ArrDelay -9 ArrDelay
## 5 ArrDelay 52 ArrDelay
## 6 ArrDelay 6 ArrDelay
rm(delays) # free the RAM
gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 520363 27.8 6145200 328.2 8263341 441.4
## Vcells 1100801834 8398.5 1643352791 12537.8 1616711650 12334.6
# Let's see in a bit tidier manner what are the average and median values for
# delay times (in minutes) according to delay type.
delays_long %>% filter(!is.na(duration)) %>%
group_by(delay_type) %>%
summarise(average_delay = mean(duration), median_delay=median(duration))
## # A tibble: 7 x 3
## delay_type average_delay median_delay
## <chr> <dbl> <dbl>
## 1 ArrDelay 7.32170887 -1
## 2 CarrierDelay 3.11825163 0
## 3 DepDelay 8.74566823 0
## 4 LateAircraftDelay 4.00899917 0
## 5 NASDelay 3.49417901 0
## 6 SecurityDelay 0.02445485 0
## 7 WeatherDelay 0.68333383 0
gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 521203 27.9 3146342 168.1 8263341 441.4
## Vcells 1100802431 8398.5 2434051258 18570.4 2403246382 18335.4
As you can see median delay time for all delay types is 0 except for ArrDelay which stands for arrival delay, in minutes. This due to the fact that this type of delay can have negative values which reflect the fact that airplane arrived earlier than expected. All other zero median values tell us that more than 50% of all these delays is equal to 0, i.e., there was no delay of the given type.
Furhter we are going to give visual representation of distributions of delay times according to delay type with two graphs namely box-plot and density plot.
# Graphic representation of delay time distributions by delay type
for_plot<- delays_long %>% filter(!is.na(duration))
rm(delays_long) # intensive memory menagement is needed due to the shear size of data sets
gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 521063 27.9 2517073 134.5 8263341 441.4
## Vcells 1071347755 8173.8 2434051258 18570.4 2403246382 18335.4
# Throwing away outliers before plotting
filter(for_plot, duration > 0 & duration < 200) %>% ggplot(aes(x = delay_type_factor, y = duration, fill= delay_type_factor)) +
geom_boxplot()+
labs(title="Distribution of delay times according to delay type - box-plots",
x="Delay type", y="Duration")
filter(for_plot, duration > 0 & duration < 300) %>% ggplot(aes(x=duration, fill=delay_type_factor, linetype=delay_type_factor)) +
geom_density(alpha=.3) +
labs(title="Distribution of delay times according to delay type - density plots",
x="Duration", y="Density")
Let’s see what is the overall average and median delay time:
summarise(for_plot, average_delay=mean(duration), median_delay=median(duration))
## # A tibble: 1 x 2
## average_delay median_delay
## <dbl> <dbl>
## 1 3.984501 0
As can be seen overall median delay time is 0 min, while overall average delay time is around 4 minutes. This means that overall more than 50% of all delays is equal to zero. The overall average delay of 4 minutes reflects the fact that although there are also some negative values of delays, which belog to ArrDelay type, there is a sufficcient number of outliers, i.e. relatively large values of delay, to give this average value.
Find a linear approximation of total yearly number of flights. Estimate the error of your approximation.
This question will be treated both in terms of linear modeling, as well as classical theory of measurement.
flights_year<-airp_data %>% group_by(Year)%>%summarise(num=n())
year=c(2003, 2004, 2005, 2006, 2007)
plot(year,flights_year$num/10000, xlab = 'Year', ylab = 'Yearly number of flights (in 10000s)', main = 'Linear approximation of #flights/year')
lin_mod <- lm(flights_year$num/10000~year)
abline(lin_mod, col= 'red')
summary(lin_mod)
##
## Call:
## lm(formula = flights_year$num/10000 ~ year)
##
## Residuals:
## 1 2 3 4 5
## -19.3768 25.2762 6.9887 -12.2987 -0.5894
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -38230.069 12758.657 -2.996 0.0578 .
## year 19.420 6.363 3.052 0.0553 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 20.12 on 3 degrees of freedom
## Multiple R-squared: 0.7564, Adjusted R-squared: 0.6752
## F-statistic: 9.314 on 1 and 3 DF, p-value: 0.05535
# So residual standard error is 20.12, while multiple R-squared is 0.7564, which is not bad for this simple model.
gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 591216 31.6 2013658 107.6 8263341 441.4
## Vcells 1148772921 8764.5 2434051258 18570.4 2432975104 18562.2
# If you think in terms of measurement theory standard error of measurement is defined
# as (standard deviation of measured values)/sqrt(# of measurements taken)
average_num_flights<-mean(flights_year$num/10000) #average measured value
average_num_flights
## [1] 707.0709
std_flights<-sd(flights_year$num/10000) #standard deviation of repeated measurements
std_flights
## [1] 35.30639
SE<-std_flights/sqrt(nrow(flights_year)) # standard error of measurement
SE #value of SE of 5 measurements (in 10000s) - as expected is smaller but comperable to SE previously obtained from the linear model
## [1] 15.7895
Which carrier is the most/least late on average? Which carrier has the lowest/highest variance of delays?
str(airp_data)
## Classes 'tbl_df', 'tbl' and 'data.frame': 35353543 obs. of 29 variables:
## $ Year : int 2003 2003 2003 2003 2003 2003 2003 2003 2003 2003 ...
## $ Month : int 1 1 1 1 1 1 1 1 1 1 ...
## $ DayofMonth : int 29 30 31 1 2 3 4 5 6 1 ...
## $ DayOfWeek : int 3 4 5 3 4 5 6 7 1 3 ...
## $ DepTime : int 1651 1654 1724 1033 1053 1031 1031 1035 1031 1713 ...
## $ CRSDepTime : int 1655 1655 1655 1035 1035 1035 1035 1035 1035 1710 ...
## $ ArrTime : int 1912 1910 1936 1625 1726 1640 1626 1636 1653 1851 ...
## $ CRSArrTime : int 1913 1913 1913 1634 1634 1634 1634 1634 1634 1847 ...
## $ UniqueCarrier : chr "UA" "UA" "UA" "UA" ...
## $ FlightNum : int 1017 1017 1017 1018 1018 1018 1018 1018 1018 1020 ...
## $ TailNum : chr "N202UA" "N311UA" "N317UA" "N409UA" ...
## $ ActualElapsedTime: int 141 136 132 232 273 249 235 241 262 98 ...
## $ CRSElapsedTime : int 138 138 138 239 239 239 239 239 239 97 ...
## $ AirTime : int 119 108 110 215 214 223 219 227 241 62 ...
## $ ArrDelay : int -1 -3 23 -9 52 6 -8 2 19 4 ...
## $ DepDelay : int -4 -1 29 -2 18 -4 -4 0 -4 3 ...
## $ Origin : chr "ORD" "ORD" "ORD" "OAK" ...
## $ Dest : chr "MSY" "MSY" "MSY" "ORD" ...
## $ Distance : int 837 837 837 1835 1835 1835 1835 1835 1835 413 ...
## $ TaxiIn : int 5 2 5 6 13 13 5 5 7 7 ...
## $ TaxiOut : int 17 26 17 11 46 13 11 9 14 29 ...
## $ Cancelled : int 0 0 0 0 0 0 0 0 0 0 ...
## $ CancellationCode : chr NA NA NA NA ...
## $ Diverted : int 0 0 0 0 0 0 0 0 0 0 ...
## $ CarrierDelay : int NA NA NA NA NA NA NA NA NA NA ...
## $ WeatherDelay : int NA NA NA NA NA NA NA NA NA NA ...
## $ NASDelay : int NA NA NA NA NA NA NA NA NA NA ...
## $ SecurityDelay : int NA NA NA NA NA NA NA NA NA NA ...
## $ LateAircraftDelay: int NA NA NA NA NA NA NA NA NA NA ...
summary <- airp_data %>%
select(UniqueCarrier, ends_with("Delay")) %>%
gather(delay_type, value, -UniqueCarrier) %>%
filter(!is.na(value))%>%
group_by(UniqueCarrier) %>%
summarise(average_late=mean(value), variance_delays=var(value)) %>%
summarise(most_late=max(average_late), least_late=min(average_late),
low_var=min(variance_delays), high_var=max(variance_delays))
summary
## # A tibble: 1 x 4
## most_late least_late low_var high_var
## <dbl> <dbl> <dbl> <dbl>
## 1 5.722392 0.1017696 127.6163 766.9082
gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 590606 31.6 2013658 107.6 8263341 441.4
## Vcells 1148764463 8764.4 2967937229 22643.6 2908932643 22193.4
airp_data %>%
select(UniqueCarrier, ends_with("Delay")) %>%
gather(delay_type, value, -UniqueCarrier) %>%
filter(!is.na(value))%>%
group_by(UniqueCarrier) %>%
summarise(average_late=mean(value), variance_delays=var(value)) %>%
filter(average_late==summary$most_late | average_late==summary$least_late |
variance_delays==summary$low_var | variance_delays==summary$high_var)
## # A tibble: 4 x 3
## UniqueCarrier average_late variance_delays
## <chr> <dbl> <dbl>
## 1 AQ 0.3110138 127.6163
## 2 EV 5.7223917 637.1085
## 3 HA 0.1017696 217.4109
## 4 YV 5.6020309 766.9082
if(!exists("carriers")) {
carriers<-read_csv("C:/Users/User/Desktop/Microsoft_DataScience/AirportData/carriers.csv")
}
head(carriers)
## # A tibble: 6 x 2
## Code Description
## <chr> <chr>
## 1 02Q Titan Airways
## 2 04Q Tradewind Aviation
## 3 05Q Comlux Aviation, AG
## 4 06Q Master Top Linhas Aereas Ltd.
## 5 07Q Flair Airlines Ltd.
## 6 09Q Swift Air, LLC
filter(carriers, Code=="AQ" | Code=="EV" | Code=="HA" | Code=="YV")
## # A tibble: 4 x 2
## Code Description
## <chr> <chr>
## 1 AQ Aloha Airlines Inc.
## 2 EV Atlantic Southeast Airlines
## 3 HA Hawaiian Airlines Inc.
## 4 YV Mesa Airlines Inc.
gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 602978 32.3 2013658 107.6 8263341 441.4
## Vcells 1148797189 8764.7 2967937229 22643.6 2945932477 22475.7
So the carrier with the least average delay time is Hawaiian Airlines Inc.,the carrier which is most late on average is Atlantic Southeast Airlines, the carrier wiith the lowest variance on delay time is Aloha Airlines Inc. and the carrier with the highest variance on delay time is Mesa Airlines Inc.