library(tidymodels)
library(tidyverse)
library(skimr)
library(nycflights13)
library(timeDate)
library(lubridate)
library(maps)
library(patchwork)
library(corrplot)
library(GGally)
library(rpart.plot)
library(vip)

theme_set(theme_bw())

Here we will look at the NYC flight data from the nycflights13 package. The problem to investigate is to come up with a model that can predict when a flight’s arrival time will be delayed. This first document will mainly be exploring the data. The second document will perform machine learning.

1 Data

1.1 Flights Data

skim(flights)

Data summary
Name	flights
Number of rows	336776
Number of columns	19
_______________________
Column type frequency:
character	4
numeric	14
POSIXct	1
________________________
Group variables	None

Variable type: character

skim_variable	n_missing	complete_rate	min	max	n_unique
carrier	0	1.00	2	2	16
tailnum	2512	0.99	5	6	4043
origin	0	1.00	3	3	3
dest	0	1.00	3	3	105

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
year	0	1.00	2013.00	0.00	2013	2013	2013	2013	2013	▁▁▇▁▁
month	0	1.00	6.55	3.41	1	4	7	10	12	▇▆▆▆▇
day	0	1.00	15.71	8.77	1	8	16	23	31	▇▇▇▇▆
dep_time	8255	0.98	1349.11	488.28	1	907	1401	1744	2400	▁▇▆▇▃
sched_dep_time	0	1.00	1344.25	467.34	106	906	1359	1729	2359	▁▇▇▇▃
dep_delay	8255	0.98	12.64	40.21	-43	-5	-2	11	1301	▇▁▁▁▁
arr_time	8713	0.97	1502.05	533.26	1	1104	1535	1940	2400	▁▃▇▇▇
sched_arr_time	0	1.00	1536.38	497.46	1	1124	1556	1945	2359	▁▃▇▇▇
arr_delay	9430	0.97	6.90	44.63	-86	-17	-5	14	1272	▇▁▁▁▁
flight	0	1.00	1971.92	1632.47	1	553	1496	3465	8500	▇▃▃▁▁
air_time	9430	0.97	150.69	93.69	20	82	129	192	695	▇▂▂▁▁
distance	0	1.00	1039.91	733.23	17	502	872	1389	4983	▇▃▂▁▁
hour	0	1.00	13.18	4.66	1	9	13	17	23	▁▇▇▇▅
minute	0	1.00	26.23	19.30	0	8	29	44	59	▇▃▆▃▅

Variable type: POSIXct

skim_variable	n_missing	complete_rate	min	max	median	n_unique
time_hour	0	1	2013-01-01 05:00:00	2013-12-31 23:00:00	2013-07-03 10:00:00	6936

Let’s look at the NA value

flights %>%
  filter ( is.na(arr_delay)   )

## # A tibble: 9,430 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1     1525           1530        -5     1934           1805
##  2  2013     1     1     1528           1459        29     2002           1647
##  3  2013     1     1     1740           1745        -5     2158           2020
##  4  2013     1     1     1807           1738        29     2251           2103
##  5  2013     1     1     1939           1840        59       29           2151
##  6  2013     1     1     1952           1930        22     2358           2207
##  7  2013     1     1     2016           1930        46       NA           2220
##  8  2013     1     1       NA           1630        NA       NA           1815
##  9  2013     1     1       NA           1935        NA       NA           2240
## 10  2013     1     1       NA           1500        NA       NA           1825
## # ... with 9,420 more rows, and 11 more variables: arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>

It appears there are many flights with missing data on the flight arrival times. We will omit these entires. But there are methods that can be used to impute missing values.

1.2 Airports Data

Let’s look at the airports

skim(airports)

Data summary
Name	airports
Number of rows	1458
Number of columns	8
_______________________
Column type frequency:
character	4
numeric	4
________________________
Group variables	None

Variable type: character

skim_variable	n_missing	complete_rate	min	max	n_unique
faa	0	1	3	3	1458
name	0	1	4	51	1440
dst	0	1	1	1	3
tzone	3	1	14	19	9

Variable type: numeric

skim_variable	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
lat	1	41.65	10.45	19.72	34.26	40.09	45.07	72.27	▂▇▆▂▁
lon	1	-103.39	29.84	-176.65	-119.19	-94.66	-82.52	174.11	▅▇▁▁▁
alt	1	1001.42	1523.63	-54.00	70.25	473.00	1062.50	9078.00	▇▁▁▁▁
tz	1	-6.52	1.62	-10.00	-8.00	-6.00	-5.00	8.00	▆▇▁▁▁

airports %>%
  filter ( is.na(tzone)   )

## # A tibble: 3 x 8
##   faa   name                                  lat    lon   alt    tz dst   tzone
##   <chr> <chr>                               <dbl>  <dbl> <dbl> <dbl> <chr> <chr>
## 1 EEN   Dillant Hopkins Airport              72.3   42.9   149    -5 A     <NA> 
## 2 LRO   Mount Pleasant Regional-Faison Fie~  32.5  -79.5    12    -5 A     <NA> 
## 3 YAK   Yakutat                              59.3 -139.     33    -9 A     <NA>

It is mostly complete, we are just missing the timezones for three airports

We mostly have airports in the US, with some in Asia

world <- map_data("world")
ggplot() +
  geom_map(data = world, map = world,aes(x=long,y=lat,map_id = region),color = "black", fill = "lightgray")+
  geom_point(data=airports,mapping=aes(x=lon,y=lat,col=alt))+
  scale_colour_viridis_c(option="C")+
  theme_dark()+labs(title="Location of Airports")

## Warning: Ignoring unknown aesthetics: x, y

We mostly have airports in the US, with some in Asia, though we notice that there is a supposed destination in the middle of the Barents Sea, North of Russia!

Filtered flights with missing longitudes and latitudes

flights %>%
  filter ( !is.na(arr_delay) )%>%
    left_join(airports,by=c("dest"="faa"))%>%
  filter(is.na(lon) & is.na(lat) )%>%
  count(dest)

## # A tibble: 4 x 2
##   dest      n
##   <chr> <int>
## 1 BQN     888
## 2 PSE     358
## 3 SJU    5773
## 4 STT     518

We can manually add in these missing values

flights %>%
  filter ( !is.na(arr_delay) )%>%
    left_join(airports,by=c("dest"="faa")) %>%
  mutate( tzone = factor(tzone)) %>%
  group_by(tzone)%>%
  count()

## # A tibble: 8 x 2
## # Groups:   tzone [8]
##   tzone                    n
##   <fct>                <int>
## 1 America/Anchorage        8
## 2 America/Chicago      72133
## 3 America/Denver       10165
## 4 America/Los_Angeles  45867
## 5 America/New_York    186329
## 6 America/Phoenix       4606
## 7 Pacific/Honolulu       701
## 8 <NA>                  7537

We see most flights are to airports in New York, about 701 end up in the Pacific.

1.3 Weather Data

skim(weather)

Data summary
Name	weather
Number of rows	26115
Number of columns	15
_______________________
Column type frequency:
character	1
numeric	13
POSIXct	1
________________________
Group variables	None

Variable type: character

skim_variable	n_missing	complete_rate	min	max	empty	n_unique	whitespace
origin	0	1	3	3	0	3	0

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
year	0	1.00	2013.00	0.00	2013.00	2013.00	2013.00	2013.00	2013.00	▁▁▇▁▁
month	0	1.00	6.50	3.44	1.00	4.00	7.00	9.00	12.00	▇▆▆▆▇
day	0	1.00	15.68	8.76	1.00	8.00	16.00	23.00	31.00	▇▇▇▇▆
hour	0	1.00	11.49	6.91	0.00	6.00	11.00	17.00	23.00	▇▇▆▇▇
temp	1	1.00	55.26	17.79	10.94	39.92	55.40	69.98	100.04	▂▇▇▇▁
dewp	1	1.00	41.44	19.39	-9.94	26.06	42.08	57.92	78.08	▁▆▇▇▆
humid	1	1.00	62.53	19.40	12.74	47.05	61.79	78.79	100.00	▁▆▇▇▆
wind_dir	460	0.98	199.76	107.31	0.00	120.00	220.00	290.00	360.00	▆▂▆▇▇
wind_speed	4	1.00	10.52	8.54	0.00	6.90	10.36	13.81	1048.36	▇▁▁▁▁
wind_gust	20778	0.20	25.49	5.95	16.11	20.71	24.17	28.77	66.75	▇▅▁▁▁
precip	0	1.00	0.00	0.03	0.00	0.00	0.00	0.00	1.21	▇▁▁▁▁
pressure	2729	0.90	1017.90	7.42	983.80	1012.90	1017.60	1023.00	1042.10	▁▁▇▆▁
visib	0	1.00	9.26	2.06	0.00	10.00	10.00	10.00	10.00	▁▁▁▁▇

Variable type: POSIXct

skim_variable	n_missing	complete_rate	min	max	median	n_unique
time_hour	0	1	2013-01-01 01:00:00	2013-12-30 18:00:00	2013-07-01 14:00:00	8714

We appear to be missing lots of data wind_gust, so we will remove that.

weather %>%
  filter(!is.na(wind_dir))%>%
  group_by(origin,month)%>%
  summarise(avg_wind_dir = mean(wind_dir),avg_wind_speed=mean(wind_speed),.groups="keep") %>%
  left_join(airports,by=c("origin"="faa")) %>%
  ggplot(aes(x=month,y=avg_wind_dir,col=origin))+
  geom_point()+
  geom_line()+
  labs(title="Average Wind Direction by Month")

weather %>%
  filter(!is.na(wind_dir))%>%
  group_by(origin,month)%>%
  summarise(avg_wind_dir = mean(wind_dir),avg_wind_speed=mean(wind_speed),.groups="keep") %>%
  left_join(airports,by=c("origin"="faa")) %>%
  ggplot(aes(x=month,y=avg_wind_speed,col=origin))+
  geom_point()+
  geom_line()+
    labs(title="Average Wind Speed by Month")

Average wind angle by month

weather %>%
  filter(!is.na(wind_dir))%>%
  mutate(month=factor(month))%>%
  #group_by(origin)%>%
  #summarise(avg_wind_dir = mean(wind_dir),avg_wind_speed=mean(wind_speed)) %>%
  left_join(airports,by=c("origin"="faa")) %>%
  ggplot(aes(x=month,y=wind_dir,fill=origin))+
  geom_boxplot()+
  facet_wrap(.~origin,nrow=3)+
  labs(title="Wind Direction By Month",x="Month",y="Degree (°)")

Average wind speed

weather %>%
  filter(!is.na(wind_dir))%>%
  mutate(month=factor(month))%>%
  #group_by(origin)%>%
  #summarise(avg_wind_dir = mean(wind_dir),avg_wind_speed=mean(wind_speed)) %>%
  left_join(airports,by=c("origin"="faa")) %>%
  ggplot(aes(x=month,y=wind_speed,fill=origin))+
  geom_boxplot()+
  facet_wrap(.~origin,nrow=3)+
  labs(title="Wind Speed by Month")

We notice a clear outlier! There was a wind speed recorded in excess of 1000mph! The highest wind speed ever recorded was 253 mph during severe hurricane conditions. During the time period around which the observation was taken and between the different weather stations, there are no recorded observations that match with the events that this outlier is seemingly trying to suggest.

weather %>%
 #filter(origin=="EWR",month==2,day==12)
filter(month==2,day==12,hour==3)

## # A tibble: 3 x 15
##   origin  year month   day  hour  temp  dewp humid wind_dir wind_speed wind_gust
##   <chr>  <int> <int> <int> <int> <dbl> <dbl> <dbl>    <dbl>      <dbl>     <dbl>
## 1 EWR     2013     2    12     3  39.0  27.0  61.6      260     1048.       NA  
## 2 JFK     2013     2    12     3  41    26.1  55.0      290       16.1      26.5
## 3 LGA     2013     2    12     3  42.1  26.1  52.7      290       16.1      28.8
## # ... with 4 more variables: precip <dbl>, pressure <dbl>, visib <dbl>,
## #   time_hour <dttm>

  #arrange(desc(wind_speed))

weather %>%
 filter(month==2,day==12,hour %in% c(2:4))

## # A tibble: 9 x 15
##   origin  year month   day  hour  temp  dewp humid wind_dir wind_speed wind_gust
##   <chr>  <int> <int> <int> <int> <dbl> <dbl> <dbl>    <dbl>      <dbl>     <dbl>
## 1 EWR     2013     2    12     2  39.9  28.0  62.2      270       20.7      25.3
## 2 EWR     2013     2    12     3  39.0  27.0  61.6      260     1048.       NA  
## 3 EWR     2013     2    12     4  39.0  27.0  64.3      280       12.7      NA  
## 4 JFK     2013     2    12     2  41    28.9  61.9      280       17.3      24.2
## 5 JFK     2013     2    12     3  41    26.1  55.0      290       16.1      26.5
## 6 JFK     2013     2    12     4  39.9  25.0  54.8      280       12.7      21.9
## 7 LGA     2013     2    12     2  43.0  26.1  50.9      290       23.0      31.1
## 8 LGA     2013     2    12     3  42.1  26.1  52.7      290       16.1      28.8
## 9 LGA     2013     2    12     4  41    25.0  52.6      270       13.8      NA  
## # ... with 4 more variables: precip <dbl>, pressure <dbl>, visib <dbl>,
## #   time_hour <dttm>

Replace value here

weather %>%
  filter( wind_speed < 1000   )%>%
  mutate(month=factor(month))%>%
  #group_by(origin)%>%
  #summarise(avg_wind_dir = mean(wind_dir),avg_wind_speed=mean(wind_speed)) %>%
  left_join(airports,by=c("origin"="faa")) %>%
  ggplot(aes(x=month,y=wind_speed,fill=origin))+
  geom_boxplot()+
  facet_wrap(.~origin,nrow=3)+
  labs(title="Corrected Wind Speed by Month ")

weather %>%
  filter( wind_speed < 1000   )%>%
  mutate(hour=factor(hour))%>%
  #group_by(origin)%>%
  #summarise(avg_wind_dir = mean(wind_dir),avg_wind_speed=mean(wind_speed)) %>%
  left_join(airports,by=c("origin"="faa")) %>%
  ggplot(aes(x=hour,y=wind_speed,fill=origin))+
  geom_boxplot()+
  facet_wrap(.~origin,nrow=3)+
  labs(title="Wind Speed by Hour of Day")

weather %>%
  filter(origin =="LGA", wind_speed < 1000   )%>%
  mutate(hour=factor(hour),
         month=factor(month))%>%
  #group_by(origin)%>%
  #summarise(avg_wind_dir = mean(wind_dir),avg_wind_speed=mean(wind_speed)) %>%
  left_join(airports,by=c("origin"="faa")) %>%
  ggplot(aes(x=hour,y=wind_speed,fill=origin))+
  geom_boxplot()+
  facet_wrap(month~.,nrow=3)+
  labs(title="Wind Speed by Hour of Month")

1.4 Planes Data

skim(planes)

Data summary
Name	planes
Number of rows	3322
Number of columns	9
_______________________
Column type frequency:
character	5
numeric	4
________________________
Group variables	None

Variable type: character

skim_variable	complete_rate	min	max	n_unique
tailnum	1	5	6	3322
type	1	10	24	3
manufacturer	1	4	29	35
model	1	2	18	127
engine	1	7	13	6

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
year	70	0.98	2000.48	7.19	1956	1997.0	2001	2005	2013	▁▁▂▇▇
engines	0	1.00	2.00	0.12	1	2.0	2	2	4	▁▇▁▁▁
seats	0	1.00	154.32	73.65	2	140.0	149	182	450	▂▇▃▁▁
speed	3299	0.01	236.78	149.76	90	107.5	162	432	432	▇▃▁▁▆

We are missing the year of manufacuring for our flights. We could include a variable that denotes where a plane has an unknown year of manufacturing.

planes %>%
  count(type)

## # A tibble: 3 x 2
##   type                         n
##   <chr>                    <int>
## 1 Fixed wing multi engine   3292
## 2 Fixed wing single engine    25
## 3 Rotorcraft                   5

planes %>%
  count(model)%>%
  arrange(desc(n))

## # A tibble: 127 x 2
##    model           n
##    <chr>       <int>
##  1 737-7H4       361
##  2 A320-232      256
##  3 CL-600-2B19   171
##  4 CL-600-2D24   123
##  5 737-824       122
##  6 MD-88         117
##  7 EMB-145LR     114
##  8 737-3H4       105
##  9 EMB-145XR     104
## 10 757-232        94
## # ... with 117 more rows

planes %>%
  count(engine)

## # A tibble: 6 x 2
##   engine            n
##   <chr>         <int>
## 1 4 Cycle           2
## 2 Reciprocating    28
## 3 Turbo-fan      2750
## 4 Turbo-jet       535
## 5 Turbo-prop        2
## 6 Turbo-shaft       5

planes %>%
  mutate(across(where(is.character), as.factor)) %>%
  skim()

Data summary
Name	Piped data
Number of rows	3322
Number of columns	9
_______________________
Column type frequency:
factor	5
numeric	4
________________________
Group variables	None

Variable type: factor

skim_variable	complete_rate	ordered	n_unique	top_counts
tailnum	1	FALSE	3322	N10: 1, N10: 1, N10: 1, N10: 1
type	1	FALSE	3	Fix: 3292, Fix: 25, Rot: 5
manufacturer	1	FALSE	35	BOE: 1630, AIR: 400, BOM: 368, AIR: 336
model	1	FALSE	127	737: 361, A32: 256, CL-: 171, CL-: 123
engine	1	FALSE	6	Tur: 2750, Tur: 535, Rec: 28, Tur: 5

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
year	70	0.98	2000.48	7.19	1956	1997.0	2001	2005	2013	▁▁▂▇▇
engines	0	1.00	2.00	0.12	1	2.0	2	2	4	▁▇▁▁▁
seats	0	1.00	154.32	73.65	2	140.0	149	182	450	▂▇▃▁▁
speed	3299	0.01	236.78	149.76	90	107.5	162	432	432	▇▃▁▁▆

planes %>%
  mutate(across(where(is.character), as.factor)) %>%
  ggplot(aes(x=year,y=seats,col=engine))+
  geom_point(size=3,alpha=.2)+
  labs(title="Manufacturing Year and Seat Count of Planes by Engine Type")

## Warning: Removed 70 rows containing missing values (geom_point).

planes %>%
  count(engines)

## # A tibble: 4 x 2
##   engines     n
##     <int> <int>
## 1       1    27
## 2       2  3288
## 3       3     3
## 4       4     4

2 Creating the Full data set

We first filter out flights that do not have any values for arr_delay

flights_delay = flights %>%
  filter( !is.na(arr_delay)) %>%
  mutate( is_late = factor(ifelse( arr_delay >30,1,0)))%>%
  mutate_if(is.character,as.factor)%>%
  select(-dep_time,-sched_dep_time,-arr_time,-sched_arr_time,-air_time,-arr_delay)

skim(flights_delay)

Data summary
Name	flights_delay
Number of rows	327346
Number of columns	14
_______________________
Column type frequency:
factor	5
numeric	8
POSIXct	1
________________________
Group variables	None

Variable type: factor

skim_variable	complete_rate	ordered	n_unique	top_counts
carrier	1	FALSE	16	UA: 57782, B6: 54049, EV: 51108, DL: 47658
tailnum	1	FALSE	4037	N72: 544, N72: 485, N72: 475, N71: 462
origin	1	FALSE	3	EWR: 117127, JFK: 109079, LGA: 101140
dest	1	FALSE	104	ATL: 16837, ORD: 16566, LAX: 16026, BOS: 15022
is_late	1	FALSE	2	0: 275847, 1: 51499

Variable type: numeric

skim_variable	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
year	1	2013.00	0.00	2013	2013	2013	2013	2013	▁▁▇▁▁
month	1	6.56	3.41	1	4	7	10	12	▇▆▆▆▇
day	1	15.74	8.78	1	8	16	23	31	▇▇▇▇▆
dep_delay	1	12.56	40.07	-43	-5	-2	11	1301	▇▁▁▁▁
flight	1	1943.10	1621.52	1	544	1467	3412	8500	▇▃▃▁▁
distance	1	1048.37	735.91	80	509	888	1389	4983	▇▃▂▁▁
hour	1	13.14	4.66	5	9	13	17	23	▇▆▆▇▃
minute	1	26.23	19.30	0	8	29	44	59	▇▃▆▃▅

Variable type: POSIXct

skim_variable	n_missing	complete_rate	min	max	median	n_unique
time_hour	0	1	2013-01-01 05:00:00	2013-12-31 23:00:00	2013-07-04 09:00:00	6922

flights_delay %>%
  count(is_late,name="Count") %>%
  mutate(prop_n=Count/sum(Count))

## # A tibble: 2 x 3
##   is_late  Count prop_n
##   <fct>    <int>  <dbl>
## 1 0       275847  0.843
## 2 1        51499  0.157

We see that we have an unequal class balance with about 84% of flights not arriving late, and 16% of flights arriving later than 30 minutes.

Let’s join with weather.

my_weather = weather %>%
  select(-wind_gust,-year,-month,-day,-hour) %>%
  mutate(wind_speed =  replace(wind_speed, wind_speed>1000, 16.11092))# Replace the outlier with the readings from the other weather stations

skim(my_weather)

Data summary
Name	my_weather
Number of rows	26115
Number of columns	10
_______________________
Column type frequency:
character	1
numeric	8
POSIXct	1
________________________
Group variables	None

Variable type: character

skim_variable	n_missing	complete_rate	min	max	empty	n_unique	whitespace
origin	0	1	3	3	0	3	0

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
temp	1	1.00	55.26	17.79	10.94	39.92	55.40	69.98	100.04	▂▇▇▇▁
dewp	1	1.00	41.44	19.39	-9.94	26.06	42.08	57.92	78.08	▁▆▇▇▆
humid	1	1.00	62.53	19.40	12.74	47.05	61.79	78.79	100.00	▁▆▇▇▆
wind_dir	460	0.98	199.76	107.31	0.00	120.00	220.00	290.00	360.00	▆▂▆▇▇
wind_speed	4	1.00	10.48	5.63	0.00	6.90	10.36	13.81	42.58	▇▇▂▁▁
precip	0	1.00	0.00	0.03	0.00	0.00	0.00	0.00	1.21	▇▁▁▁▁
pressure	2729	0.90	1017.90	7.42	983.80	1012.90	1017.60	1023.00	1042.10	▁▁▇▆▁
visib	0	1.00	9.26	2.06	0.00	10.00	10.00	10.00	10.00	▁▁▁▁▇

Variable type: POSIXct

skim_variable	n_missing	complete_rate	min	max	median	n_unique
time_hour	0	1	2013-01-01 01:00:00	2013-12-30 18:00:00	2013-07-01 14:00:00	8714

airports

## # A tibble: 1,458 x 8
##    faa   name                       lat    lon   alt    tz dst   tzone          
##    <chr> <chr>                    <dbl>  <dbl> <dbl> <dbl> <chr> <chr>          
##  1 04G   Lansdowne Airport         41.1  -80.6  1044    -5 A     America/New_Yo~
##  2 06A   Moton Field Municipal A~  32.5  -85.7   264    -6 A     America/Chicago
##  3 06C   Schaumburg Regional       42.0  -88.1   801    -6 A     America/Chicago
##  4 06N   Randall Airport           41.4  -74.4   523    -5 A     America/New_Yo~
##  5 09J   Jekyll Island Airport     31.1  -81.4    11    -5 A     America/New_Yo~
##  6 0A9   Elizabethton Municipal ~  36.4  -82.2  1593    -5 A     America/New_Yo~
##  7 0G6   Williams County Airport   41.5  -84.5   730    -5 A     America/New_Yo~
##  8 0G7   Finger Lakes Regional A~  42.9  -76.8   492    -5 A     America/New_Yo~
##  9 0P2   Shoestring Aviation Air~  39.8  -76.6  1000    -5 U     America/New_Yo~
## 10 0S9   Jefferson County Intl     48.1 -123.    108    -8 A     America/Los_An~
## # ... with 1,448 more rows

flights_delay %>%
  count(dest,sort = TRUE)

## # A tibble: 104 x 2
##    dest      n
##    <fct> <int>
##  1 ATL   16837
##  2 ORD   16566
##  3 LAX   16026
##  4 BOS   15022
##  5 MCO   13967
##  6 CLT   13674
##  7 SFO   13173
##  8 FLL   11897
##  9 MIA   11593
## 10 DCA    9111
## # ... with 94 more rows

flights %>%
  filter ( !is.na(arr_delay) )%>%
    left_join(airports,by=c("dest"="faa"))%>%
  filter(is.na(lon) & is.na(lat) )%>%
  count(dest)

## # A tibble: 4 x 2
##   dest      n
##   <chr> <int>
## 1 BQN     888
## 2 PSE     358
## 3 SJU    5773
## 4 STT     518

world <- map_data("world")
ggplot() +
  geom_map(data = world, map = world,aes(x=long,y=lat,map_id = region),color = "black", fill = "lightgray")+
  geom_point(data=airports,mapping=aes(x=lon,y=lat,col=tzone))+
  labs(title="Timezones of Airports")

## Warning: Ignoring unknown aesthetics: x, y

  #scale_colour_viridis_c(option="C")+
  #theme_dark()

We switch the longitudes and latitude values for the EEN airport and add the timezone

airports$lat[airports$faa=="EEN"] = 42.89833    
airports$lon[airports$faa=="EEN"] = -72.27083   

airports$tzone[airports$faa=="EEN"] = "America/New_York"
airports$tzone[airports$faa=="LRO"] = "America/New_York"    
airports$tzone[airports$faa=="YAK"] ="America/Anchorage"

airports %>%
  filter(faa %in% c("EEN","LRO","YAK"))

## # A tibble: 3 x 8
##   faa   name                          lat    lon   alt    tz dst   tzone        
##   <chr> <chr>                       <dbl>  <dbl> <dbl> <dbl> <chr> <chr>        
## 1 EEN   Dillant Hopkins Airport      42.9  -72.3   149    -5 A     America/New_~
## 2 LRO   Mount Pleasant Regional-Fa~  32.5  -79.5    12    -5 A     America/New_~
## 3 YAK   Yakutat                      59.3 -139.     33    -9 A     America/Anch~

Add a long lat and alt for SJU, BWN, STT

new_airports = tibble(faa = c("BQN","PSE","SJU","STT"), 
                      name=NA, 
                      lat=c(18.494833,18.008333,18.439167,18.337222), 
                      lon=c( -67.129500,-66.563056,-66.001944,-64.973333),
                      alt =c(72,9,3,7),
                      tz = NA,
                      tzone="PeurtoRico")



full_airports = airports %>%
  bind_rows(new_airports)

3 Joining all data

Join flights with weather

flights_joined = flights_delay %>%
  inner_join(my_weather, by=c("origin","time_hour"))

skim(flights_joined)

Data summary
Name	flights_joined
Number of rows	325819
Number of columns	22
_______________________
Column type frequency:
character	1
factor	4
numeric	16
POSIXct	1
________________________
Group variables	None

Variable type: character

skim_variable	n_missing	complete_rate	min	max	empty	n_unique	whitespace
origin	0	1	3	3	0	3	0

Variable type: factor

skim_variable	complete_rate	ordered	n_unique	top_counts
carrier	1	FALSE	16	UA: 57489, B6: 53715, EV: 50868, DL: 47465
tailnum	1	FALSE	4037	N72: 543, N72: 485, N72: 474, N71: 462
dest	1	FALSE	104	ATL: 16771, ORD: 16507, LAX: 15942, BOS: 14948
is_late	1	FALSE	2	0: 274573, 1: 51246

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
year	0	1.00	2013.00	0.00	2013.00	2013.00	2013.00	2013.00	2013.00	▁▁▇▁▁
month	0	1.00	6.55	3.41	1.00	4.00	7.00	9.00	12.00	▇▆▆▆▇
day	0	1.00	15.70	8.75	1.00	8.00	16.00	23.00	31.00	▇▇▇▇▆
dep_delay	0	1.00	12.55	40.08	-43.00	-5.00	-2.00	11.00	1301.00	▇▁▁▁▁
flight	0	1.00	1943.54	1621.73	1.00	544.00	1471.00	3416.00	8500.00	▇▃▃▁▁
distance	0	1.00	1048.18	735.86	80.00	509.00	888.00	1389.00	4983.00	▇▃▂▁▁
hour	0	1.00	13.13	4.66	5.00	9.00	13.00	17.00	23.00	▇▆▆▇▃
minute	0	1.00	26.23	19.30	0.00	8.00	29.00	44.00	59.00	▇▃▆▃▅
temp	17	1.00	57.01	17.90	10.94	42.08	57.20	71.96	100.04	▁▇▇▇▂
dewp	17	1.00	41.50	19.33	-9.94	26.06	42.80	57.92	78.08	▁▆▇▇▆
humid	17	1.00	59.21	19.57	12.74	43.74	57.22	74.67	100.00	▂▇▇▆▅
wind_dir	8047	0.98	201.94	104.75	0.00	130.00	220.00	290.00	360.00	▆▂▆▇▇
wind_speed	78	1.00	11.06	5.53	0.00	6.90	10.36	14.96	42.58	▆▇▂▁▁
precip	0	1.00	0.00	0.03	0.00	0.00	0.00	0.00	1.21	▇▁▁▁▁
pressure	34615	0.89	1017.90	7.42	983.80	1012.90	1017.60	1022.90	1042.10	▁▁▇▆▁
visib	0	1.00	9.29	1.98	0.00	10.00	10.00	10.00	10.00	▁▁▁▁▇

Variable type: POSIXct

skim_variable	n_missing	complete_rate	min	max	median	n_unique
time_hour	0	1	2013-01-01 05:00:00	2013-12-30 18:00:00	2013-07-03 15:00:00	6885

We join the plane data

my_planes= planes %>%
  rename(mnfr_year = year)%>%
  select(-speed)

flights_joined =  flights_joined%>%
  inner_join(my_planes,by="tailnum")

skim(flights_joined)

Data summary
Name	flights_joined
Number of rows	277690
Number of columns	29
_______________________
Column type frequency:
character	6
factor	3
numeric	19
POSIXct	1
________________________
Group variables	None

Variable type: character

skim_variable	complete_rate	min	max	n_unique
tailnum	1	5	6	3316
origin	1	3	3	3
type	1	10	24	3
manufacturer	1	4	29	35
model	1	2	18	127
engine	1	7	13	6

Variable type: factor

skim_variable	complete_rate	ordered	n_unique	top_counts
carrier	1	FALSE	16	UA: 56489, B6: 52897, EV: 50868, DL: 47356
dest	1	FALSE	104	LAX: 15341, ATL: 14330, BOS: 13524, MCO: 13071
is_late	1	FALSE	2	0: 233629, 1: 44061

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
year	0	1.00	2013.00	0.00	2013.00	2013.00	2013.00	2013.00	2013.00	▁▁▇▁▁
month	0	1.00	6.56	3.40	1.00	4.00	7.00	10.00	12.00	▇▆▆▆▇
day	0	1.00	15.69	8.75	1.00	8.00	16.00	23.00	31.00	▇▇▇▇▆
dep_delay	0	1.00	13.09	40.44	-43.00	-5.00	-1.00	11.00	1301.00	▇▁▁▁▁
flight	0	1.00	1857.28	1620.68	1.00	507.00	1345.00	3052.00	8500.00	▇▂▂▁▁
distance	0	1.00	1073.90	763.60	80.00	529.00	937.00	1416.00	4983.00	▇▃▂▁▁
hour	0	1.00	13.15	4.70	5.00	9.00	13.00	17.00	23.00	▇▆▆▇▃
minute	0	1.00	26.03	19.33	0.00	6.00	29.00	43.00	59.00	▇▃▆▃▅
temp	16	1.00	57.04	17.89	10.94	42.08	57.20	71.96	100.04	▁▇▇▇▂
dewp	16	1.00	41.63	19.33	-9.94	26.06	42.98	57.92	78.08	▁▆▇▇▆
humid	16	1.00	59.44	19.64	12.74	43.92	57.50	74.98	100.00	▂▇▇▆▅
wind_dir	7031	0.97	201.73	104.86	0.00	130.00	220.00	290.00	360.00	▆▃▆▇▇
wind_speed	73	1.00	10.99	5.54	0.00	6.90	10.36	13.81	42.58	▆▇▂▁▁
precip	0	1.00	0.00	0.03	0.00	0.00	0.00	0.00	1.21	▇▁▁▁▁
pressure	29646	0.89	1017.92	7.42	983.80	1012.90	1017.70	1022.90	1042.10	▁▁▇▆▁
visib	0	1.00	9.28	1.99	0.00	10.00	10.00	10.00	10.00	▁▁▁▁▇
mnfr_year	5137	0.98	2001.40	6.40	1956.00	1999.00	2002.00	2006.00	2013.00	▁▁▁▆▇
engines	0	1.00	1.99	0.09	1.00	2.00	2.00	2.00	4.00	▁▇▁▁▁
seats	0	1.00	137.53	71.73	2.00	55.00	149.00	189.00	450.00	▆▇▆▁▁

Variable type: POSIXct

skim_variable	n_missing	complete_rate	min	max	median	n_unique
time_hour	0	1	2013-01-01 05:00:00	2013-12-30 18:00:00	2013-07-04 08:00:00	6884

And finally, we join the destination data

full_airports = full_airports %>%
  select(-tz,-dst)

flights_joined = flights_joined %>%
  inner_join(full_airports,by=c("dest"="faa"))


skim(flights_joined)

Data summary
Name	flights_joined
Number of rows	277690
Number of columns	34
_______________________
Column type frequency:
character	9
factor	2
numeric	22
POSIXct	1
________________________
Group variables	None

Variable type: character

skim_variable	n_missing	complete_rate	min	max	n_unique
tailnum	0	1.00	5	6	3316
origin	0	1.00	3	3	3
dest	0	1.00	3	3	104
type	0	1.00	10	24	3
manufacturer	0	1.00	4	29	35
model	0	1.00	2	18	127
engine	0	1.00	7	13	6
name	6096	0.98	6	36	100
tzone	0	1.00	10	19	8

Variable type: factor

skim_variable	n_missing	complete_rate	ordered	n_unique	top_counts
carrier	0	1	FALSE	16	UA: 56489, B6: 52897, EV: 50868, DL: 47356
is_late	0	1	FALSE	2	0: 233629, 1: 44061

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
year	0	1.00	2013.00	0.00	2013.00	2013.00	2013.00	2013.00	2013.00	▁▁▇▁▁
month	0	1.00	6.56	3.40	1.00	4.00	7.00	10.00	12.00	▇▆▆▆▇
day	0	1.00	15.69	8.75	1.00	8.00	16.00	23.00	31.00	▇▇▇▇▆
dep_delay	0	1.00	13.09	40.44	-43.00	-5.00	-1.00	11.00	1301.00	▇▁▁▁▁
flight	0	1.00	1857.28	1620.68	1.00	507.00	1345.00	3052.00	8500.00	▇▂▂▁▁
distance	0	1.00	1073.90	763.60	80.00	529.00	937.00	1416.00	4983.00	▇▃▂▁▁
hour	0	1.00	13.15	4.70	5.00	9.00	13.00	17.00	23.00	▇▆▆▇▃
minute	0	1.00	26.03	19.33	0.00	6.00	29.00	43.00	59.00	▇▃▆▃▅
temp	16	1.00	57.04	17.89	10.94	42.08	57.20	71.96	100.04	▁▇▇▇▂
dewp	16	1.00	41.63	19.33	-9.94	26.06	42.98	57.92	78.08	▁▆▇▇▆
humid	16	1.00	59.44	19.64	12.74	43.92	57.50	74.98	100.00	▂▇▇▆▅
wind_dir	7031	0.97	201.73	104.86	0.00	130.00	220.00	290.00	360.00	▆▃▆▇▇
wind_speed	73	1.00	10.99	5.54	0.00	6.90	10.36	13.81	42.58	▆▇▂▁▁
precip	0	1.00	0.00	0.03	0.00	0.00	0.00	0.00	1.21	▇▁▁▁▁
pressure	29646	0.89	1017.92	7.42	983.80	1012.90	1017.70	1022.90	1042.10	▁▁▇▆▁
visib	0	1.00	9.28	1.99	0.00	10.00	10.00	10.00	10.00	▁▁▁▁▇
mnfr_year	5137	0.98	2001.40	6.40	1956.00	1999.00	2002.00	2006.00	2013.00	▁▁▁▆▇
engines	0	1.00	1.99	0.09	1.00	2.00	2.00	2.00	4.00	▁▇▁▁▁
seats	0	1.00	137.53	71.73	2.00	55.00	149.00	189.00	450.00	▆▇▆▁▁
lat	0	1.00	35.63	6.25	18.01	30.49	36.08	41.41	61.17	▂▆▇▁▁
lon	0	1.00	-89.59	15.89	-157.92	-95.34	-83.35	-80.15	-64.97	▁▁▂▅▇
alt	0	1.00	578.67	986.83	3.00	26.00	285.00	748.00	6602.00	▇▁▁▁▁

Variable type: POSIXct

skim_variable	n_missing	complete_rate	min	max	median	n_unique
time_hour	0	1	2013-01-01 05:00:00	2013-12-30 18:00:00	2013-07-04 08:00:00	6884

flights_joined = flights_joined %>%
  mutate_if(is.character,as.factor)%>%
  mutate(engines = factor(engines))%>%
  rename(engine_type = engine)

skim(flights_joined)

Data summary
Name	flights_joined
Number of rows	277690
Number of columns	34
_______________________
Column type frequency:
factor	12
numeric	21
POSIXct	1
________________________
Group variables	None

Variable type: factor

skim_variable	n_missing	complete_rate	ordered	n_unique	top_counts
carrier	0	1.00	FALSE	16	UA: 56489, B6: 52897, EV: 50868, DL: 47356
tailnum	0	1.00	FALSE	3316	N71: 462, N25: 420, N35: 402, N29: 400
origin	0	1.00	FALSE	3	EWR: 111464, JFK: 92834, LGA: 73392
dest	0	1.00	FALSE	104	LAX: 15341, ATL: 14330, BOS: 13524, MCO: 13071
is_late	0	1.00	FALSE	2	0: 233629, 1: 44061
type	0	1.00	FALSE	3	Fix: 275679, Fix: 1611, Rot: 400
manufacturer	0	1.00	FALSE	35	BOE: 81896, EMB: 63210, AIR: 46624, AIR: 40477
model	0	1.00	FALSE	127	A32: 45159, EMB: 26353, ERJ: 23359, 737: 13703
engines	0	1.00	FALSE	4	2: 275618, 1: 1932, 4: 133, 3: 7
engine_type	0	1.00	FALSE	6	Tur: 234949, Tur: 40552, Rec: 1696, Tur: 400
name	6096	0.98	FALSE	100	Los: 15341, Har: 14330, Gen: 13524, Orl: 13071
tzone	0	1.00	FALSE	8	Ame: 157897, Ame: 55498, Ame: 43108, Ame: 9822

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
year	0	1.00	2013.00	0.00	2013.00	2013.00	2013.00	2013.00	2013.00	▁▁▇▁▁
month	0	1.00	6.56	3.40	1.00	4.00	7.00	10.00	12.00	▇▆▆▆▇
day	0	1.00	15.69	8.75	1.00	8.00	16.00	23.00	31.00	▇▇▇▇▆
dep_delay	0	1.00	13.09	40.44	-43.00	-5.00	-1.00	11.00	1301.00	▇▁▁▁▁
flight	0	1.00	1857.28	1620.68	1.00	507.00	1345.00	3052.00	8500.00	▇▂▂▁▁
distance	0	1.00	1073.90	763.60	80.00	529.00	937.00	1416.00	4983.00	▇▃▂▁▁
hour	0	1.00	13.15	4.70	5.00	9.00	13.00	17.00	23.00	▇▆▆▇▃
minute	0	1.00	26.03	19.33	0.00	6.00	29.00	43.00	59.00	▇▃▆▃▅
temp	16	1.00	57.04	17.89	10.94	42.08	57.20	71.96	100.04	▁▇▇▇▂
dewp	16	1.00	41.63	19.33	-9.94	26.06	42.98	57.92	78.08	▁▆▇▇▆
humid	16	1.00	59.44	19.64	12.74	43.92	57.50	74.98	100.00	▂▇▇▆▅
wind_dir	7031	0.97	201.73	104.86	0.00	130.00	220.00	290.00	360.00	▆▃▆▇▇
wind_speed	73	1.00	10.99	5.54	0.00	6.90	10.36	13.81	42.58	▆▇▂▁▁
precip	0	1.00	0.00	0.03	0.00	0.00	0.00	0.00	1.21	▇▁▁▁▁
pressure	29646	0.89	1017.92	7.42	983.80	1012.90	1017.70	1022.90	1042.10	▁▁▇▆▁
visib	0	1.00	9.28	1.99	0.00	10.00	10.00	10.00	10.00	▁▁▁▁▇
mnfr_year	5137	0.98	2001.40	6.40	1956.00	1999.00	2002.00	2006.00	2013.00	▁▁▁▆▇
seats	0	1.00	137.53	71.73	2.00	55.00	149.00	189.00	450.00	▆▇▆▁▁
lat	0	1.00	35.63	6.25	18.01	30.49	36.08	41.41	61.17	▂▆▇▁▁
lon	0	1.00	-89.59	15.89	-157.92	-95.34	-83.35	-80.15	-64.97	▁▁▂▅▇
alt	0	1.00	578.67	986.83	3.00	26.00	285.00	748.00	6602.00	▇▁▁▁▁

Variable type: POSIXct

skim_variable	n_missing	complete_rate	min	max	median	n_unique
time_hour	0	1	2013-01-01 05:00:00	2013-12-30 18:00:00	2013-07-04 08:00:00	6884

Next step. Perform EDA and find what variable contribute to a flight arriving late.

4 EDA

flights_joined %>%
  group_by(is_late) %>%
  count()%>%
  ungroup()%>%
  mutate( prop = n/sum(n)) #%>% pull()

## # A tibble: 2 x 3
##   is_late      n  prop
##   <fct>    <int> <dbl>
## 1 0       233629 0.841
## 2 1        44061 0.159

We see that about 15.9% of all flights are classes as being late.

We will drop the manufacturer and name

flights_joined %>%
  count(manufacturer,sort = TRUE)

## # A tibble: 35 x 2
##    manufacturer                      n
##    <fct>                         <int>
##  1 BOEING                        81896
##  2 EMBRAER                       63210
##  3 AIRBUS                        46624
##  4 AIRBUS INDUSTRIE              40477
##  5 BOMBARDIER INC                27319
##  6 MCDONNELL DOUGLAS AIRCRAFT CO  8787
##  7 MCDONNELL DOUGLAS              3831
##  8 CANADAIR                       1489
##  9 MCDONNELL DOUGLAS CORPORATION  1242
## 10 CESSNA                          619
## # ... with 25 more rows

Is there any association between carriers and a plane being late?

flights_joined %>%  
  mutate(carrier =  fct_reorder(.f=carrier,
                                .x=is_late,
                                .fun=function(.x) mean(.x=="1"),
                                .desc=FALSE)) %>%
  ggplot(aes(x=carrier,fill=is_late))+
  geom_bar(position="fill",alpha=0.8)+
  geom_hline(yintercept = 0.1586697,lty=2,size=1.5,col="black")+
  annotate(geom="label",x=4, y=0.25,label="Average Proportion",size=6)+
  labs(title="Proportion of Flights by Carrier Classed by Lateness",y="Proportion of Flights",fill="Is Late?")

We see that Hawaiian Airlines does the best by having the fewest proportion of flights being late, and Frontier Airlines does the worst by having the highest proportions of it’s flight as late.

Let’s see if flights are more likely to be late depending on the origin

flights_joined %>%
  #filter(is_late==1)%>%
  group_by(origin,is_late)%>%
  count()%>%
  ungroup(is_late)%>%
  mutate(prop=n/sum(n))%>%
  filter(is_late==1)%>%
  ggplot(aes(x=origin,y=prop,fill=origin))+
  geom_col(show.legend = FALSE)+
  geom_text(aes(label=  paste0(signif(prop*100,3),"%")),vjust=-0.25)+
  labs(title="Percentage of Flights that are Late by Origin",x="Origin",y=NULL)+
  scale_y_continuous(labels = label_percent())

We see that flights departing from EWR tend to have a higher chance of arriving late.

flights_joined %>%
  group_by(origin)%>%
  count(is_late) %>%
  ungroup(is_late)%>%
  mutate(prop=n/sum(n))%>%
  ungroup()%>%
  filter(is_late==1)%>%
  select(origin,n,prop)

## # A tibble: 3 x 3
##   origin     n  prop
##   <fct>  <int> <dbl>
## 1 EWR    19328 0.173
## 2 JFK    14054 0.151
## 3 LGA    10679 0.146

Let’s look that the type of aircraft

flights_joined %>%
  group_by(type)%>%
  count(is_late) %>%
  ungroup(is_late)%>%
  mutate(prop=n/sum(n))%>%
  ungroup()%>%
  filter(is_late==1)%>%
  select(type,n,prop)

## # A tibble: 3 x 3
##   type                         n  prop
##   <fct>                    <int> <dbl>
## 1 Fixed wing multi engine  43753 0.159
## 2 Fixed wing single engine   246 0.153
## 3 Rotorcraft                  62 0.155

It appears that engine type does not affect whether the flight will arrive late or not.

flights_joined %>%
  #filter(is_late==1)%>%
  group_by(engines,is_late)%>%
  count()%>%
  ungroup(is_late)%>%
  mutate(prop=n/sum(n))%>%
  filter(is_late==1)%>%
  ggplot(aes(x=engines,y=prop,fill=engines))+
  geom_col(show.legend = FALSE)+
  geom_text(aes(label=  paste0(signif(prop*100,3),"%")),vjust=-0.25)+
  labs(title="Percentage of Flights that are Late by Number of Engines",x="Engine",y=NULL,fill=NULL)+
  scale_y_continuous(labels = label_percent())

flights_joined %>%
  count(engine_type) %>%
  ggplot(aes(y=   fct_reorder(engine_type,n),x=n,fill=engine_type ))+
  geom_col(show.legend = T)+
  geom_text(aes(label=n),hjust=-.1)+
  labs(title="Number of Engine Type",y="Engine Type",x="Count")+
  xlim(0,250000)

flights_joined %>%
  group_by(engine_type)%>%
  count(is_late) %>%
  ungroup(is_late)%>%
  mutate(prop=n/sum(n))%>%
  ungroup()%>%
  filter(is_late==1)%>%
  select(engine_type,n,prop)%>%
  ggplot(aes(x=engine_type,y=prop,fill=engine_type))+
  geom_col(show.legend = FALSE)+
  geom_text(aes(label=  paste0(signif(prop*100,3),"%")),vjust=-0.25)+
  labs(title="Percentage of Flights that are Late by Engine Type",x="Engine Type",y=NULL,fill=NULL)+
  scale_y_continuous(labels = label_percent())

flights_joined %>%
  mutate(counted =     fct_lump_prop(engine_type,prop=0.01))%>%
  count(counted)%>%
  ggplot(aes(y= fct_reorder(counted,n),x=n,fill=counted))+
  geom_col(show.legend = FALSE)+
  geom_text(aes(label=n),hjust=-.1)+
  xlim(0,250000)+
  labs(title="Count of Engine Type",subtitle="Types that appeared less than 1% of all types were filled to other",x="Count",y="Engine Type")

flights_joined %>%
  mutate(counted =  fct_lump_prop(engine_type,prop=0.01))%>%
  group_by(counted)%>%
  count(is_late) %>%
  ungroup(is_late)%>%
  mutate(prop=n/sum(n))%>%
  ungroup()%>%
  filter(is_late==1)%>%
  select(counted,n,prop)%>%
  ggplot(aes(x=counted,y=prop,fill=counted))+
  geom_col(show.legend = FALSE)+
  geom_text(aes(label=  paste0(signif(prop*100,3),"%")),vjust=-0.25)+
  labs(title="Percentage of Flights that are Late by Engine Type",x="Engine Type",y=NULL,fill=NULL)+
  scale_y_continuous(labels = label_percent())

flights_joined %>%
  #mutate(counted =     fct_lump_prop(engine_type,prop=0.01))%>%
  mutate(counted = tzone)%>%
  count(counted)%>%
  ggplot(aes(y= fct_reorder(counted,n),x=n,fill=counted))+
  geom_col(show.legend = FALSE)+
  geom_text(aes(label=n),hjust=-.1)+
  xlim(0,175000)+
  labs(title="Count of flights to Timezones",x="Count",y="Timezone")

It appears that it is important to not factor lump the variables for tzone, since there is an affect on them.

flights_joined %>%
  mutate(counted =  fct_lump_prop(tzone,prop=0.01))%>%
  group_by(counted)%>%
  count(is_late) %>%
  ungroup(is_late)%>%
  mutate(prop=n/sum(n))%>%
  ungroup()%>%
  filter(is_late==1)%>%
  select(counted,n,prop)%>%
  ggplot(aes(y=counted,x=prop,fill=counted))+
  geom_col(show.legend = FALSE)+
  geom_text(aes(label=  paste0(signif(prop*100,3),"%")),hjust=-0.1)+
  labs(title="Percentage of Flights that are Late by Engine Type",y="Engine Type",x=NULL,fill=NULL)+
  scale_x_continuous(labels = label_percent(),limits=c(0,0.19))

flights_joined %>%
  #mutate(counted =  fct_lump_prop(tzone,prop=0.01))%>%
  mutate(counted = tzone) %>%
  group_by(counted)%>%
  count(is_late) %>%
  ungroup(is_late)%>%
  mutate(prop=n/sum(n))%>%
  ungroup()%>%
  filter(is_late==1)%>%
  select(counted,n,prop)%>%
  ggplot(aes(y=counted,x=prop,fill=counted))+
  geom_col(show.legend = FALSE)+
  geom_text(aes(label=  paste0(signif(prop*100,3),"%")),hjust=-0.1)+
  labs(title="Percentage of Flights that are Late by Timezone Destination",y="Timezone Destination",x=NULL,fill=NULL)+
  scale_x_continuous(labels = label_percent(),limits=c(0,0.19))

ggplot(flights_joined,aes(x=lat,fill=is_late))+
  geom_density(alpha=0.4)+ 
  #geom_histogram()+
 labs(title="Density of Flights Being On Time or Late, by Latitude",x="Latitiude",y="Density",fill="Is Late")

ggplot(flights_joined,aes(x=lon,fill=is_late))+
  geom_density(alpha=0.4)+
   labs(title="Density of Flights Being On Time or Late, by Longitude",x="Longitude",y="Density",fill="Is Late")

ggplot(flights_joined,aes(x=alt,fill=is_late))+
  geom_density(alpha=0.4)+
   labs(title="Density of Flights Being On Time or Late, by Altitude",x="Altitude",y="Density",fill="Is Late")

There doesn’t seem to be a relationship between the location and if a flight is late or not.

Next look at the time, hour, minute, hour, day, month. Heatmap of lates over the year? Find any that there abnormally high?

p1 <- flights_joined %>% 
  count(minute) %>%
  ggplot(aes(x=minute,y=n))+
  geom_point()+
  geom_line()+
  labs(title="Number of Flights for Minute of Hour",x="Minute of Hour",y="Count")+
  scale_y_continuous(labels = scales::comma)


p2 <- flights_joined %>% 
  group_by(minute)%>%
  count(is_late) %>%
  mutate(prop = n/sum(n)) %>%
  filter(is_late==1)%>%
  ggplot(aes(x=minute,y=prop))+
  geom_point(col="steelblue4")+
  geom_line(col="steelblue4")+
  labs(title="Percentage of Flights that are Late by Minute of Hour",x="Minute of Hour",y="Percentage")+
  #geom_smooth(col="red", se=FALSE)+
      scale_y_continuous(labels=label_percent(),limits=c(0,0.25))

p1 / p2

p1 <-flights_joined %>% 
  count(hour) %>%
  ggplot(aes(x=hour,y=n))+
  geom_point()+
  geom_line()+
  labs(title="Number of Flights By Hour of Day",x="Hour",y="Count")+
  scale_y_continuous(labels = scales::comma)


p2 <- flights_joined %>% 
  group_by(hour)%>%
  count(is_late) %>%
  mutate(prop = n/sum(n)) %>%
  filter(is_late==1)%>%
  ggplot(aes(x=hour,y=prop))+
  geom_point(col="aquamarine3")+
  geom_line(col="aquamarine3")+
  labs(title="Percenetage of Flights that are Late by Hour of Day",x="Hour of Day",y="Percentage")+
    scale_y_continuous(labels=label_percent(),limits=c(0,0.3))

p1 / p2

It appears that the later the flight in a day, the more likely it is more the plane will be late.

p1 <-flights_joined %>% 
  mutate(mymonth = month(date(time_hour),label=TRUE) )%>%
  group_by(mymonth)%>%
  count() %>%
  ggplot(aes(x=mymonth,y=n,group=1))+
  geom_point()+
  geom_line()+
  labs(title="Number of Flights By Month",x="Month",y="Count")+
  scale_y_continuous(labels = scales::comma)

p2 <- flights_joined %>% 
  #mutate(mymonth = month(date(time_hour)) )%>%
  mutate(mymonth = month(date(time_hour),label=TRUE) )%>%
  group_by(mymonth)%>%
  count(is_late) %>%
  mutate(prop = n/sum(n)) %>%
  filter(is_late==1)%>%
  ungroup()%>%
  ggplot(aes(x=mymonth,y=prop,group=1))+
  geom_point(col="orangered1")+
  geom_line(col="orangered1")+
  labs(title="Percentage of Flights that are Late by Month",x="Month",y="Percentage")+
  scale_y_continuous(labels=label_percent())

p1 / p2

February generally sees fewer flights, but this may also be due to the fact it is the shortest month of the year.

We see that some months have different proportions of flights being late. Most notably, June and July have 25% of their flights arriving late, while during the months of: September, October, and November all have less than 10% of their flights being late.

flights_joined %>%
  group_by(month,day)%>%
  count()%>%
  ggplot(aes(x=day,y=n))+
  geom_point()+
  geom_line()+
  facet_wrap(.~month)+
  labs(title="Total Number of Flights by Day and Month",x="Day",y="Count")

We can observe the number of flights across the year. We see that on weekdays, there are around 800 flights a day, while on Sunday, there are about 600 flights a day. We also observe a sharp decrease in the number of flights during the 8th-9th February, further research shows that this was the day where New York was struck by a large blizzard, thereby severely restricting the number of flights.

We also see a regular dip every seven days, this is when the flights are on a Sunday.

flights_joined %>%
  mutate(mymonth = month(date(time_hour),label=TRUE,abbr = FALSE)) %>%
  group_by(mymonth,day)%>%
  count(is_late) %>%
  mutate(prop = n/sum(n))%>%
  filter(is_late==1)%>%
  ungroup()%>%
  ggplot( aes(x=day,y=prop))+
  geom_point()+
  geom_line()+
  facet_wrap(.~mymonth)+
  theme(strip.background = element_blank(),
        panel.border = element_blank())+
  theme_bw()+
  labs(title="Proportion of Late Flights By Day of Month")

We see a noticeable day where the proprotion of flights being late was quiet late. This occurred on the 8th March and searching the web shows us that snow fell on New York on that day.

p1 <- flights_joined %>% 
  ggplot(aes(time_hour)) + 
  geom_freqpoly(binwidth = 86400)+
  #scale_x_datetime(date_breaks="1 month",date_labels = "%b%y",minor_breaks=NULL)
  scale_x_datetime(date_breaks = "month", labels = label_date_short(),minor_breaks=NULL)+
    labs(title="Number of Flights by  Day of Year",x="Date",y="Flights Delayed")

p2 <- flights_joined %>% 
  mutate (  mydate = date(time_hour) )%>%
  group_by(mydate)%>%
  count(is_late) %>%
  mutate(prop = n/sum(n))%>%
  filter(is_late==1)%>%
  ggplot(aes(x=mydate,y=prop)) + 
  geom_line()+
  #geom_smooth(col="red3",method="loess",formula =  y~x)+
  scale_x_date(date_breaks = "month", labels = label_date_short(),minor_breaks=NULL)+
  scale_y_continuous(labels = label_percent())+
  labs(title="Percent of Flights Delayed by Day of Year",x="Date",y="Flights Delayed")
  
p1/p2

We will now look at weather

temp dewp humid wind_dir wind_speed precip pressure visib

p1 <- flights_joined %>%
  ggplot(aes(x=temp,fill=is_late))+
  geom_density(alpha=0.5)

p2 <- flights_joined %>%
  ggplot(aes(x=dewp,fill=is_late))+
  geom_density(alpha=0.5)

p3 <- flights_joined %>%
  ggplot(aes(x=humid,fill=is_late))+
  geom_density(alpha=0.5)


p4 <- flights_joined %>%
  mutate(rain = case_when(precip > 0  & precip < 0.25   ~ "Light_Precip",
                          precip >= 0.25 ~ "Heavy_Precip",
                          TRUE  ~ "No_Precip"
                          )) %>%
  ggplot(aes(x=rain,fill=is_late))+
  geom_bar(position = position_fill(),show.legend = FALSE)


p1 + p2 + p3 + p4 +  plot_layout(ncol=2,guides = 'collect')+ 
  plot_annotation(title = "Weather Effects on Flight Punctuality")

## Warning: Removed 16 rows containing non-finite values (stat_density).

## Warning: Removed 16 rows containing non-finite values (stat_density).

## Warning: Removed 16 rows containing non-finite values (stat_density).

We see that when the temperatures are high, there are more instances of planes being late.

A higher dew point means the more water is in the atmosphere, which means it’s harder for the human body to evaporate heat. A higher humidity also leads to there being more

p1<-flights_joined %>%
  ggplot(aes(y=wind_speed,x=is_late,fill=is_late))+
 # geom_density(alpha=0.5)
  geom_violin(draw_quantiles= c(0.25,0.5,0.75))

p2 <-flights_joined %>%
  ggplot(aes(x=wind_dir,fill=is_late))+
  geom_density(alpha=0.5)

p1 + p2 + plot_layout(guides="collect")+plot_annotation(title = "Wind Speed and Wind Direction")

## Warning: Removed 73 rows containing non-finite values (stat_ydensity).

## Warning: Removed 7031 rows containing non-finite values (stat_density).

flights_joined %>%
  #filter(visib <10) %>%
  ggplot(aes(x=visib,fill=is_late))+
  geom_density(alpha=0.5)+
  labs(title="Visibility Effects on Lateness")

We could have two different weather patterns? Good visibility and poor visibility on other days.

 cor(flights_joined[c("temp","dewp","humid","precip","visib")],use = "complete.obs") %>%
  corrplot(addCoef.col = "black", number.digits = 2,title = " Weather Correlation Plot", mar = c(0,0,4,0))

#flights_joined[c("temp","dewp","humid","precip","visib","is_late")] %>%
#ggpairs(mapping = aes(color = is_late),columns=c("temp","dewp","humid","precip","visib"))

We will remove dewp since it is highly correlated with temp.

 cor(flights_joined[c("temp","dewp","humid","precip","visib","month","hour","lat","lon","alt")],use = "complete.obs") %>%
  corrplot(addCoef.col = "black", number.digits = 2,title = " Weather, Time, and Location Correlation Plot", mar = c(0,0,2,0))

We don’t see any linear correlation between these variables such as, weather effects and time or location. We will still drop longitude, latitude, and altitude.

We look at engine type

flights_joined %>%
  count(engines) %>%
  mutate(prop = n/sum(n)*100 )

## # A tibble: 4 x 3
##   engines      n     prop
##   <fct>    <int>    <dbl>
## 1 1         1932  0.696  
## 2 2       275618 99.3    
## 3 3            7  0.00252
## 4 4          133  0.0479

flights_joined %>%
  mutate(  engines =   fct_lump_prop(engines,prop=0.01) )%>%
  group_by(engines) %>%
   count(is_late) %>%
  ungroup(is_late)%>%
  mutate(prop=n/sum(n))%>%
  ungroup()%>%
  filter(is_late==1)

## # A tibble: 2 x 4
##   engines is_late     n  prop
##   <fct>   <fct>   <int> <dbl>
## 1 2       1       43743 0.159
## 2 Other   1         318 0.153

We see that engine size does not make a difference if a plane is late or not

flights_joined %>%
  ggplot(aes(x=seats,fill=is_late))+
  geom_density(alpha=0.5)

We see that for flights with over 100 seats or fewer than 25, these flights tend to more likely be on time than flights with around 25 to 100 seats.

flights_joined %>%
   mutate(   mnfr_year =  as.factor(mnfr_year %/% 5 * 5) ) %>%
   mutate( mnfr_year  = fct_lump_prop(as.factor(mnfr_year), prop=0.01)) %>%
  count(mnfr_year)%>%
  ggplot(aes(x=mnfr_year,y=n))+
  geom_hline(yintercept = 2777,lty=2,col="red")+ # 1% line
  geom_col()+
  labs(title="Count of Engines Manufactured by Year")

flights_joined %>%
  #mutate( mnfr_year = as.factor(mnfr_year)) %>%
  # mutate( mnfr_year  = fct_lump_prop(as.factor(mnfr_year), prop=0.01)) %>%
    mutate(   mnfr_year =  as.factor((mnfr_year %/% 5) * 5) ) %>%
  mutate( mnfr_year  = fct_lump_prop(as.factor(mnfr_year), prop=0.01)) %>%
  group_by(mnfr_year)%>%
  count(is_late) %>%
  ungroup(is_late)%>%
  mutate(prop=n/sum(n))%>%
  ungroup()%>%
  filter(is_late==1)%>%

ggplot(aes(x=mnfr_year,y=prop))+
  geom_col()+
  geom_text(aes(label=  paste0(signif(prop*100,3),"%"), vjust=-.1  ))+
  scale_y_continuous(labels = label_percent() )+
  geom_hline(yintercept = 0.159,col="red",lty=2)+
  labs(title="Percent of Engine Manufacturing Being Late by Year")

flights_joined %>%
  ggplot(aes(x=dep_delay,fill=is_late))+
  geom_density(alpha=0.5)

summary(flights_joined$type)

##  Fixed wing multi engine Fixed wing single engine               Rotorcraft 
##                   275679                     1611                      400

5 Final model set

We will finally create the dataset for modelling, as well as any changes to be made.

carrier will be kept
tailnum will be used as an id
origin will be kept
dest will be kept as id
engine type will be have it’s factors lumped
manufacturer will be kept as id
model will be used as id
engines will be removed
engine_type will have it’s factors lumped
name will be removed
tzone will have it’s factors lumped
year, month, day, hour,minute will be dropped.
distance will be dropped.
temp , humid, precip and visib will be included.
mnfr_year will be binned into 5 year bins, and then factor lumped.
type will be dropped.

skim(flights_joined)

Data summary
Name	flights_joined
Number of rows	277690
Number of columns	34
_______________________
Column type frequency:
factor	12
numeric	21
POSIXct	1
________________________
Group variables	None

Variable type: factor

skim_variable	n_missing	complete_rate	ordered	n_unique	top_counts
carrier	0	1.00	FALSE	16	UA: 56489, B6: 52897, EV: 50868, DL: 47356
tailnum	0	1.00	FALSE	3316	N71: 462, N25: 420, N35: 402, N29: 400
origin	0	1.00	FALSE	3	EWR: 111464, JFK: 92834, LGA: 73392
dest	0	1.00	FALSE	104	LAX: 15341, ATL: 14330, BOS: 13524, MCO: 13071
is_late	0	1.00	FALSE	2	0: 233629, 1: 44061
type	0	1.00	FALSE	3	Fix: 275679, Fix: 1611, Rot: 400
manufacturer	0	1.00	FALSE	35	BOE: 81896, EMB: 63210, AIR: 46624, AIR: 40477
model	0	1.00	FALSE	127	A32: 45159, EMB: 26353, ERJ: 23359, 737: 13703
engines	0	1.00	FALSE	4	2: 275618, 1: 1932, 4: 133, 3: 7
engine_type	0	1.00	FALSE	6	Tur: 234949, Tur: 40552, Rec: 1696, Tur: 400
name	6096	0.98	FALSE	100	Los: 15341, Har: 14330, Gen: 13524, Orl: 13071
tzone	0	1.00	FALSE	8	Ame: 157897, Ame: 55498, Ame: 43108, Ame: 9822

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
year	0	1.00	2013.00	0.00	2013.00	2013.00	2013.00	2013.00	2013.00	▁▁▇▁▁
month	0	1.00	6.56	3.40	1.00	4.00	7.00	10.00	12.00	▇▆▆▆▇
day	0	1.00	15.69	8.75	1.00	8.00	16.00	23.00	31.00	▇▇▇▇▆
dep_delay	0	1.00	13.09	40.44	-43.00	-5.00	-1.00	11.00	1301.00	▇▁▁▁▁
flight	0	1.00	1857.28	1620.68	1.00	507.00	1345.00	3052.00	8500.00	▇▂▂▁▁
distance	0	1.00	1073.90	763.60	80.00	529.00	937.00	1416.00	4983.00	▇▃▂▁▁
hour	0	1.00	13.15	4.70	5.00	9.00	13.00	17.00	23.00	▇▆▆▇▃
minute	0	1.00	26.03	19.33	0.00	6.00	29.00	43.00	59.00	▇▃▆▃▅
temp	16	1.00	57.04	17.89	10.94	42.08	57.20	71.96	100.04	▁▇▇▇▂
dewp	16	1.00	41.63	19.33	-9.94	26.06	42.98	57.92	78.08	▁▆▇▇▆
humid	16	1.00	59.44	19.64	12.74	43.92	57.50	74.98	100.00	▂▇▇▆▅
wind_dir	7031	0.97	201.73	104.86	0.00	130.00	220.00	290.00	360.00	▆▃▆▇▇
wind_speed	73	1.00	10.99	5.54	0.00	6.90	10.36	13.81	42.58	▆▇▂▁▁
precip	0	1.00	0.00	0.03	0.00	0.00	0.00	0.00	1.21	▇▁▁▁▁
pressure	29646	0.89	1017.92	7.42	983.80	1012.90	1017.70	1022.90	1042.10	▁▁▇▆▁
visib	0	1.00	9.28	1.99	0.00	10.00	10.00	10.00	10.00	▁▁▁▁▇
mnfr_year	5137	0.98	2001.40	6.40	1956.00	1999.00	2002.00	2006.00	2013.00	▁▁▁▆▇
seats	0	1.00	137.53	71.73	2.00	55.00	149.00	189.00	450.00	▆▇▆▁▁
lat	0	1.00	35.63	6.25	18.01	30.49	36.08	41.41	61.17	▂▆▇▁▁
lon	0	1.00	-89.59	15.89	-157.92	-95.34	-83.35	-80.15	-64.97	▁▁▂▅▇
alt	0	1.00	578.67	986.83	3.00	26.00	285.00	748.00	6602.00	▇▁▁▁▁

Variable type: POSIXct

skim_variable	n_missing	complete_rate	min	max	median	n_unique
time_hour	0	1	2013-01-01 05:00:00	2013-12-30 18:00:00	2013-07-04 08:00:00	6884

full_flights = 
flights_joined %>%
  na.omit() %>%
  select(-year,-month,-day,-hour,-minute,-type,-manufacturer,-model,-engines,-distance,-dewp,-name,-pressure,-dest,-wind_dir,-wind_speed,-humid) %>%
  mutate(   mnfr_year =  as.factor( (mnfr_year %/% 5) * 5),
            mnfr_year  = fct_lump_prop(as.factor(mnfr_year), prop=0.01 ),
            engine_type = fct_lump_prop(engine_type, prop=0.01),
            tzone = fct_lump_prop(tzone, prop=0.01),
            dep_date = as.Date(time_hour)
            )

saveRDS(full_flights,file="Cleaned_Full_Flights.rds")

skim(full_flights)

Data summary
Name	full_flights
Number of rows	232412
Number of columns	19
_______________________
Column type frequency:
Date	1
factor	7
numeric	10
POSIXct	1
________________________
Group variables	None

Variable type: Date

skim_variable	n_missing	complete_rate	min	max	median	n_unique
dep_date	0	1	2013-01-01	2013-12-30	2013-07-06	364

Variable type: factor

skim_variable	complete_rate	ordered	n_unique	top_counts
carrier	1	FALSE	16	UA: 46705, B6: 43035, EV: 42735, DL: 39866
tailnum	1	FALSE	3224	N71: 404, N25: 371, N35: 353, N35: 352
origin	1	FALSE	3	EWR: 91968, JFK: 77973, LGA: 62471
is_late	1	FALSE	2	0: 199198, 1: 33214
mnfr_year	1	FALSE	7	200: 85189, 200: 56473, 199: 38295, 199: 21498
engine_type	1	FALSE	3	Tur: 197038, Tur: 33878, Oth: 1496
tzone	1	FALSE	6	Ame: 134986, Ame: 47286, Ame: 37241, Ame: 8455

Variable type: numeric

skim_variable	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
dep_delay	1	11.62	37.99	-43.00	-5.00	-2.00	10.00	1301.00	▇▁▁▁▁
flight	1	1872.10	1625.73	1.00	505.00	1394.00	3305.00	6181.00	▇▅▁▃▁
temp	1	56.99	18.06	10.94	42.08	57.02	71.96	100.04	▁▇▇▇▂
humid	1	56.33	18.08	13.00	42.48	54.51	69.53	100.00	▂▇▇▆▂
precip	1	0.00	0.01	0.00	0.00	0.00	0.00	0.53	▇▁▁▁▁
visib	1	9.58	1.46	0.00	10.00	10.00	10.00	10.00	▁▁▁▁▇
seats	1	136.82	71.96	2.00	55.00	149.00	189.00	400.00	▆▅▇▁▁
lat	1	36.02	5.76	21.32	32.73	36.10	41.41	61.17	▃▆▇▁▁
lon	1	-90.15	15.70	-157.92	-95.34	-84.22	-80.15	-68.83	▁▁▂▃▇
alt	1	590.66	995.14	3.00	26.00	313.00	748.00	6602.00	▇▁▁▁▁

Variable type: POSIXct

skim_variable	n_missing	complete_rate	min	max	median	n_unique
time_hour	0	1	2013-01-01 05:00:00	2013-12-30 18:00:00	2013-07-06 18:00:00	6344

With this we finally have a dataset that is ready to be modelled on. The other file will continue with modelling.

NYC Flights - Initial Cleaning and Investigation

1 Data

1.1 Flights Data

1.2 Airports Data

1.3 Weather Data

1.4 Planes Data

2 Creating the Full data set

3 Joining all data

4 EDA

5 Final model set