library(tidyverse)
library(lubridate)
library(pander) # for prettier tables
library(scales) # for making prettier axes in plots
library(stringr)
## added
##########################
library(plotly)
library(devtools)
library(maps)
library(corrgram)
library(corrplot)
library(knitr)
library(kableExtra)
library(caTools)
library(modelr)
library(GGally)
library(viridis)
library(factoextra)
##########################
theme_set(theme_bw())
panderOptions('big.mark', ',')
Deadline for hand-in: Jan 3, 2018 at 23:55.
Where: Moodle.
What: Rmd file. Possibly also pdf (or html), but Rmd is mandatory.
Groups: Maximum 3 participants, however the project must be handed in individually.
Here, we focus on the airlines, airports, flights, planes, and weather datasets:
## Warning: package 'nycflights13' was built under R version 3.4.3
## # A tibble: 16 x 2
## carrier name
## <chr> <chr>
## 1 9E Endeavor Air Inc.
## 2 AA American Airlines Inc.
## 3 AS Alaska Airlines Inc.
## 4 B6 JetBlue Airways
## 5 DL Delta Air Lines Inc.
## 6 EV ExpressJet Airlines Inc.
## 7 F9 Frontier Airlines Inc.
## 8 FL AirTran Airways Corporation
## 9 HA Hawaiian Airlines Inc.
## 10 MQ Envoy Air
## 11 OO SkyWest Airlines Inc.
## 12 UA United Air Lines Inc.
## 13 US US Airways Inc.
## 14 VX Virgin America
## 15 WN Southwest Airlines Co.
## 16 YV Mesa Airlines Inc.
## # A tibble: 1,458 x 8
## faa name lat lon alt tz
## <chr> <chr> <dbl> <dbl> <int> <dbl>
## 1 04G Lansdowne Airport 41.13047 -80.61958 1044 -5
## 2 06A Moton Field Municipal Airport 32.46057 -85.68003 264 -6
## 3 06C Schaumburg Regional 41.98934 -88.10124 801 -6
## 4 06N Randall Airport 41.43191 -74.39156 523 -5
## 5 09J Jekyll Island Airport 31.07447 -81.42778 11 -5
## 6 0A9 Elizabethton Municipal Airport 36.37122 -82.17342 1593 -5
## 7 0G6 Williams County Airport 41.46731 -84.50678 730 -5
## 8 0G7 Finger Lakes Regional Airport 42.88356 -76.78123 492 -5
## 9 0P2 Shoestring Aviation Airfield 39.79482 -76.64719 1000 -5
## 10 0S9 Jefferson County Intl 48.05381 -122.81064 108 -8
## # ... with 1,448 more rows, and 2 more variables: dst <chr>, tzone <chr>
## # A tibble: 336,776 x 19
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 1 1 517 515 2 830
## 2 2013 1 1 533 529 4 850
## 3 2013 1 1 542 540 2 923
## 4 2013 1 1 544 545 -1 1004
## 5 2013 1 1 554 600 -6 812
## 6 2013 1 1 554 558 -4 740
## 7 2013 1 1 555 600 -5 913
## 8 2013 1 1 557 600 -3 709
## 9 2013 1 1 557 600 -3 838
## 10 2013 1 1 558 600 -2 753
## # ... with 336,766 more rows, and 12 more variables: sched_arr_time <int>,
## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>
## # A tibble: 3,322 x 9
## tailnum year type manufacturer model
## <chr> <int> <chr> <chr> <chr>
## 1 N10156 2004 Fixed wing multi engine EMBRAER EMB-145XR
## 2 N102UW 1998 Fixed wing multi engine AIRBUS INDUSTRIE A320-214
## 3 N103US 1999 Fixed wing multi engine AIRBUS INDUSTRIE A320-214
## 4 N104UW 1999 Fixed wing multi engine AIRBUS INDUSTRIE A320-214
## 5 N10575 2002 Fixed wing multi engine EMBRAER EMB-145LR
## 6 N105UW 1999 Fixed wing multi engine AIRBUS INDUSTRIE A320-214
## 7 N107US 1999 Fixed wing multi engine AIRBUS INDUSTRIE A320-214
## 8 N108UW 1999 Fixed wing multi engine AIRBUS INDUSTRIE A320-214
## 9 N109UW 1999 Fixed wing multi engine AIRBUS INDUSTRIE A320-214
## 10 N110UW 1999 Fixed wing multi engine AIRBUS INDUSTRIE A320-214
## # ... with 3,312 more rows, and 4 more variables: engines <int>,
## # seats <int>, speed <int>, engine <chr>
## # A tibble: 26,130 x 15
## origin year month day hour temp dewp humid wind_dir wind_speed
## <chr> <dbl> <dbl> <int> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 EWR 2013 1 1 0 37.04 21.92 53.97 230 10.35702
## 2 EWR 2013 1 1 1 37.04 21.92 53.97 230 13.80936
## 3 EWR 2013 1 1 2 37.94 21.92 52.09 230 12.65858
## 4 EWR 2013 1 1 3 37.94 23.00 54.51 230 13.80936
## 5 EWR 2013 1 1 4 37.94 24.08 57.04 240 14.96014
## 6 EWR 2013 1 1 6 39.02 26.06 59.37 270 10.35702
## 7 EWR 2013 1 1 7 39.02 26.96 61.63 250 8.05546
## 8 EWR 2013 1 1 8 39.02 28.04 64.43 240 11.50780
## 9 EWR 2013 1 1 9 39.92 28.04 62.21 250 12.65858
## 10 EWR 2013 1 1 10 39.02 28.04 64.43 260 12.65858
## # ... with 26,120 more rows, and 5 more variables: wind_gust <dbl>,
## # precip <dbl>, pressure <dbl>, visib <dbl>, time_hour <dttm>
Remember to read about the datasets.
Construct a barplot displaying number of flights per month.
# Here, we convert these variables to the categorical ones.
flights$month <- as.factor(flights$month)
flights$origin <- as.factor(flights$origin)
flights$dest <- as.factor(flights$dest)
planes$manufacturer <-as.factor(planes$manufacturer)
# Vector Name of months
v_months <- c("1" = "Jan", "2" = "Feb","3" = "Mar",
"4" = "Apr", "5" = "May", "6" = "Jun",
"7" = "Jul", "8" = "Aug","9" = "sep",
"10" = "Oct","11" = "Nov", "12" = "Dec")
Find the number of flights per month.
flights%>%
group_by(month)%>% na.omit()%>%
summarise(n_of_flight =n())%>%
pander(big.mark = ',', justify = c('left', 'right'))
| month | n_of_flight |
|---|---|
| 1 | 26,398 |
| 2 | 23,611 |
| 3 | 27,902 |
| 4 | 27,564 |
| 5 | 28,128 |
| 6 | 27,075 |
| 7 | 28,293 |
| 8 | 28,756 |
| 9 | 27,010 |
| 10 | 28,618 |
| 11 | 26,971 |
| 12 | 27,020 |
Now we can construct a barplot displaying number of flights per month.
nfly_per_month <- flights%>%
group_by(year, month)%>%
ggplot(aes(x = month))+
geom_bar() +
labs(x = "Months",
y = "Number of flights",
title = "Number of flights per month") +
scale_x_discrete(labels = v_months)
nfly_per_month
# ggplotly(nfly_per_month) # to makee it interactive.
Now, in the barplot (showing number of flights per month), make a separate bar for each origin.
origin_flights <- flights%>%
ggplot(aes( x = month))+
geom_bar(aes(fill = origin), position = "dodge") +
labs(x = "Months",
y = "Number of flights",
title = "flights for each origin per month") +
scale_x_discrete(labels = v_months)
#facet_wrap(~ origin)
# ggplotly(origin_flights) # make it interactive
origin_flights
List of the number of flights for each origin per month
months_list <- flights%>%
group_by(month, origin)%>%
summarise(n_of_flights = n())
# months_list%>%
# pander(big.mark = ',', justify = c('left','center', 'right'))
kable(months_list, "html") %>%
kable_styling("striped","hover") %>%
scroll_box(width = "500px", height = "600px")
| month | origin | n_of_flights |
|---|---|---|
| 1 | EWR | 9893 |
| 1 | JFK | 9161 |
| 1 | LGA | 7950 |
| 2 | EWR | 9107 |
| 2 | JFK | 8421 |
| 2 | LGA | 7423 |
| 3 | EWR | 10420 |
| 3 | JFK | 9697 |
| 3 | LGA | 8717 |
| 4 | EWR | 10531 |
| 4 | JFK | 9218 |
| 4 | LGA | 8581 |
| 5 | EWR | 10592 |
| 5 | JFK | 9397 |
| 5 | LGA | 8807 |
| 6 | EWR | 10175 |
| 6 | JFK | 9472 |
| 6 | LGA | 8596 |
| 7 | EWR | 10475 |
| 7 | JFK | 10023 |
| 7 | LGA | 8927 |
| 8 | EWR | 10359 |
| 8 | JFK | 9983 |
| 8 | LGA | 8985 |
| 9 | EWR | 9550 |
| 9 | JFK | 8908 |
| 9 | LGA | 9116 |
| 10 | EWR | 10104 |
| 10 | JFK | 9143 |
| 10 | LGA | 9642 |
| 11 | EWR | 9707 |
| 11 | JFK | 8710 |
| 11 | LGA | 8851 |
| 12 | EWR | 9922 |
| 12 | JFK | 9146 |
| 12 | LGA | 9067 |
What are the top-10 destinations and how many flights were made to these?
# The top-10 destinations and number of flights to these.
top10 <-flights%>%
group_by(dest)%>% na.omit()%>%
summarise(n_flight= n())%>%
arrange(desc(n_flight))%>%
select( dest, n_flight)%>%
head(10)
## Warning: package 'bindrcpp' was built under R version 3.4.2
# List of top-10 destinations and number of flights to these.
top10%>%
pander(big.mark = '.', decimal.mark = ',', justify = c('left', 'right'))
| dest | n_flight |
|---|---|
| ATL | 16.837 |
| ORD | 16.566 |
| LAX | 16.026 |
| BOS | 15.022 |
| MCO | 13.967 |
| CLT | 13.674 |
| SFO | 13.173 |
| FLL | 11.897 |
| MIA | 11.593 |
| DCA | 9.111 |
For these 10 destinations, make a barplot illustrating the number of flights from origin to top-10 destination.
nfly_to_top10 <-top10%>%
ggplot(aes(dest, n_flight))+
geom_col() +
labs(x = "Destinations",
y = "Number of flights",
title = "flights to top-10 destinations")
# ggplotly(nfly_to_top10) # Make it interactive
nfly_to_top10
Now, order the bars (destinations) according to the total number of flights to these destinations.
nfly_to_top10 <- top10 %>%
ggplot( aes(x= reorder(dest,-n_flight),n_flight))+
geom_bar(stat = "identity")+
labs(x = "Top-10 destinations",
y = "Number of flights",
title = "Number of flights")
nfly_to_top10
#ggplotly(nfly_to_top10)
Is the mean air time from each of the three origins different? Further, are these differences statistically significant?
flight_air_time <- flights %>%
filter(!is.na(air_time)) %>%
select(air_time, origin)
flight_air_time %>%
aov(air_time ~ origin, data = .) %>% summary()
## Df Sum Sq Mean Sq F value Pr(>F)
## origin 2 1.935e+08 96741314 11817 <2e-16 ***
## Residuals 327343 2.680e+09 8186
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Visualize the mean air time for all origins to see the differences between them.
m_air_time_month <- flights%>%
group_by(origin, month, day)%>%
summarise(mean_air = mean(air_time, na.rm = TRUE))%>%
arrange(desc(mean_air))
m_air_plot <- m_air_time_month%>%
ggplot(aes(origin, mean_air))+ theme_bw() +
geom_boxplot()+
labs(x = "Origin",
y = "Mean of air times",
title = "Origin mean time")
m_air_plot
#ggplotly(m_air_time_month)
How many weather observations are there for each origin?
# Compute the number of observations for each origin.
weather%>%
group_by(origin)%>% na.omit()%>%
summarise(n_observ = n())%>% # Number of observations
arrange(desc(n_observ))%>%
pander(big.mark = ',', justify = c('left', 'right'))
| origin | n_observ |
|---|---|
| JFK | 7,836 |
| LGA | 7,624 |
| EWR | 7,561 |
Convert temperature to degrees Celsius. This will be used in the reminder of this miniproject. (We do this for both temp and dewp using mutate_at)
# Convert degree fahrenheit to degree celcius.
fahrenheit_to_celcius <- function(df)
{
t_celcius <- (df - 32)* 5/9
return(t_celcius)
}
# Convert temp and dewp to degree selcius.
weather_dc <- mutate_at(weather, vars(temp,dewp),funs(fahrenheit_to_celcius))
head(weather_dc)
## # A tibble: 6 x 15
## origin year month day hour temp dewp humid wind_dir wind_speed
## <chr> <dbl> <dbl> <int> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 EWR 2013 1 1 0 2.8 -5.6 53.97 230 10.35702
## 2 EWR 2013 1 1 1 2.8 -5.6 53.97 230 13.80936
## 3 EWR 2013 1 1 2 3.3 -5.6 52.09 230 12.65858
## 4 EWR 2013 1 1 3 3.3 -5.0 54.51 230 13.80936
## 5 EWR 2013 1 1 4 3.3 -4.4 57.04 240 14.96014
## 6 EWR 2013 1 1 6 3.9 -3.3 59.37 270 10.35702
## # ... with 5 more variables: wind_gust <dbl>, precip <dbl>,
## # pressure <dbl>, visib <dbl>, time_hour <dttm>
# convert variables origin and month to fator.
weather_dc$origin <- as.factor(weather_dc$origin)
weather_dc$month <- as.factor(weather_dc$month)
Construct a graph displaying the temperature at JFK.
jfk_temp <- weather_dc%>%
group_by(year, month, day)%>%
filter(origin =="JFK")%>%
ggplot(aes(month, temp))+
geom_boxplot() +
labs(x = "Months",
y = "Temperature in degree C",
title = "Graph temperature at JFK") +
scale_x_discrete(labels = v_months)
jfk_temp
# ggplotly(jfk_temp)
Add a red line showing the mean temperature for each day.
l_mean_temp <- weather_dc%>% # with temp in degree celcius
mutate(mydate = make_date(year, month, day))%>%
group_by(year,month, mydate)%>%
filter(origin =="JFK")%>%
summarise(m_temp = mean(temp))%>%
ggplot(aes(mydate, m_temp))+
geom_point()+
geom_smooth(se = FALSE, col = "red", method = 'loess') +
labs(x = "Temperature",
y = "days",
title = "Mean temperature for each day")
l_mean_temp
# ggplotly(l_mean_temp)
Now, visualize the daily mean temperature for each origin airport.
mean_temp_orign <- weather_dc%>%
group_by(origin,year,month, day)%>%
summarise(mean_temp = mean(temp, na.rm = TRUE))%>%
ggplot(aes(origin,mean_temp))+
geom_boxplot(aes(fill = origin))+
#geom_point()+
labs(x = "Origin airport ",
y = "Mean temperature",
title = "Mean temperature ")
mean_temp_orign
#ggplotly(mean_temp_orign)
Investigate if arrival delay is associated with the flight distance (and also departure delay).
In order to investigate if there is any statistical dependence or association between some variables, one can calculate the correlation between these variables. To do so, we will first investigate if these variables are numeric and if there is no missing values.
select_cols <- flights%>%
select(arr_delay, dep_delay, distance) # Select relevant columns from flights
#Take complete cases.
flights_complet_cases <- select_cols[complete.cases(select_cols),]
Calculate the correlation between variables
# Correlation test between different variables or columns.
cor_test1 <- cor(flights_complet_cases)
cor_test_tb <- as.data.frame(cor_test1)
# cor_test_tb
Format the correlation data
kable(cor_test_tb, "html") %>% # format correlation table.
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "bordered", "responsive"))
| arr_delay | dep_delay | distance | |
|---|---|---|---|
| arr_delay | 1.0000000 | 0.9148028 | -0.0618678 |
| dep_delay | 0.9148028 | 1.0000000 | -0.0216809 |
| distance | -0.0618678 | -0.0216809 | 1.0000000 |
We can also plot the correlation data to visualize the relation between variables.
corrplot(cor_test1, method="color", type = "upper",
tl.srt = 45,addCoef.col = "black")
As we can see from the different graphs and correlation parameters , there is a strong correlation between arrival delay and departure delay. In the contrast, there is a negative correlation between the distance and arrival delay. In the next step, I will build a linear regression model using these three variables.
Linear model arrival delay and departure delay
# Model arrival delay and departure delay
del_model1 <- lm(arr_delay ~ dep_delay, data = flights_complet_cases)
coef(del_model1)
## (Intercept) dep_delay
## -5.899493 1.019093
summary(del_model1)
##
## Call:
## lm(formula = arr_delay ~ dep_delay, data = flights_complet_cases)
##
## Residuals:
## Min 1Q Median 3Q Max
## -107.587 -11.005 -1.883 8.938 201.938
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -5.8994935 0.0330195 -178.7 <2e-16 ***
## dep_delay 1.0190929 0.0007864 1295.8 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 18.03 on 327344 degrees of freedom
## Multiple R-squared: 0.8369, Adjusted R-squared: 0.8369
## F-statistic: 1.679e+06 on 1 and 327344 DF, p-value: < 2.2e-16
from this result, one can conclude that departure delay is an important predictor of arrival delay. The changes of departure delay can explain the changes in arrival delay.
pred_plot <- flights_complet_cases%>%
add_predictions(del_model1)%>%
add_residuals(del_model1)%>%
ggplot(aes(dep_delay, pred))+
geom_line()+
geom_line(aes(dep_delay, arr_delay), linetype = "dashed", col = "red")
pred_plot
# Graph arr_delay vs dep_dealy using intercept and slope from the model.
flights_complet_cases%>%
ggplot(aes(arr_delay, dep_delay))+
geom_point()+
geom_abline (intercept = -5.89, slope = 1, col= "red") +
labs(x = "arrivel delay ",
y = "departure delay",
title = "arrival vs departure delay ")
Model arrival delay, departure delay and distance
# Model arrival delay, departure delay and distance
del_model2 <- lm(arr_delay ~ dep_delay+ distance, data = flights_complet_cases)
coef(del_model2)
## (Intercept) dep_delay distance
## -3.212779441 1.018077208 -0.002550586
summary(del_model2)
##
## Call:
## lm(formula = arr_delay ~ dep_delay + distance, data = flights_complet_cases)
##
## Residuals:
## Min 1Q Median 3Q Max
## -107.944 -11.069 -2.019 8.728 205.562
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.213e+00 5.560e-02 -57.78 <2e-16 ***
## dep_delay 1.018e+00 7.823e-04 1301.32 <2e-16 ***
## distance -2.551e-03 4.259e-05 -59.88 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 17.93 on 327343 degrees of freedom
## Multiple R-squared: 0.8386, Adjusted R-squared: 0.8386
## F-statistic: 8.506e+05 on 2 and 327343 DF, p-value: < 2.2e-16
From this result, one can conclude that the changes of arrival delay can not be explained by the distance.
Investigate if departure delay is associated with weather conditions at the origin airport. This includes descriptives (mean departure delay), plotting, regression modelling, considering missing values etc.
# We selct the relevant columns from both flights and weather datasets
# then join the two subsets.
flight_delay <- flights%>%
select(year:day, origin, hour, arr_delay, dep_delay, tailnum)
#flight_delay
Join subset flights and planes datasets
fly_w_cd <- inner_join(flight_delay, weather_dc,
by = c("origin" = "origin",
"year" = "year",
"month" = "month",
"day" = "day",
"hour" = "hour"))
#c("year", "day","month", "hour", "origin")
# Here I consider only the completecases from dataset.
fly_weather_cd <- fly_w_cd[complete.cases(fly_w_cd),]
head(fly_weather_cd)
## # A tibble: 6 x 18
## year month day origin hour arr_delay dep_delay tailnum temp dewp
## <dbl> <fctr> <int> <fctr> <dbl> <dbl> <dbl> <chr> <dbl> <dbl>
## 1 2013 1 1 LGA 6 -25 -6 N668DN 4.4 -3.3
## 2 2013 1 1 EWR 6 19 -5 N516JB 3.9 -3.3
## 3 2013 1 1 LGA 6 -14 -3 N829AS 4.4 -3.3
## 4 2013 1 1 JFK 6 -8 -3 N593JB 3.9 -3.3
## 5 2013 1 1 LGA 6 8 -2 N3ALAA 4.4 -3.3
## 6 2013 1 1 JFK 6 -2 -2 N793JB 3.9 -3.3
## # ... with 8 more variables: humid <dbl>, wind_dir <dbl>,
## # wind_speed <dbl>, wind_gust <dbl>, precip <dbl>, pressure <dbl>,
## # visib <dbl>, time_hour <dttm>
Now we can compute the correlation between numeric relevant variables.
num_cols <- sapply(fly_weather_cd, is.numeric)
fly_weather_cd <- fly_weather_cd[, num_cols] # numeric columns
delay_weather_cd <- fly_weather_cd%>%
select(dep_delay, humid,precip, visib,
wind_dir, wind_speed, temp, dewp)
cor_delay_wc <- cor(delay_weather_cd)
cor_delay_wc
## dep_delay humid precip visib wind_dir
## dep_delay 1.00000000 0.01626939 0.04982871 -0.06474770 -0.01888189
## humid 0.01626939 1.00000000 0.18760812 -0.45766077 -0.30772261
## precip 0.04982871 0.18760812 1.00000000 -0.33617445 -0.07022729
## visib -0.06474770 -0.45766077 -0.33617445 1.00000000 0.17848803
## wind_dir -0.01888189 -0.30772261 -0.07022729 0.17848803 1.00000000
## wind_speed 0.03304020 -0.15572301 0.01767296 0.04653220 0.17852744
## temp 0.09473431 0.07142219 -0.02406459 0.02821394 -0.10464847
## dewp 0.08817979 0.49954113 0.04886785 -0.14709823 -0.22306808
## wind_speed temp dewp
## dep_delay 0.03304020 0.09473431 0.08817979
## humid -0.15572301 0.07142219 0.49954113
## precip 0.01767296 -0.02406459 0.04886785
## visib 0.04653220 0.02821394 -0.14709823
## wind_dir 0.17852744 -0.10464847 -0.22306808
## wind_speed 1.00000000 -0.06833782 -0.12665952
## temp -0.06833782 1.00000000 0.89586845
## dewp -0.12665952 0.89586845 1.00000000
We can Plot the correlation data to see the relations between variables
corrplot(cor_delay_wc, method="color", type = "upper",
tl.srt = 45,addCoef.col = "black")
#corrplot(cor_delay_wc, method = 'number')
According to correlation plots, there is no strong correlation between departure delay and any of the weather conditions variables and there is a tight competition between positive and negative correlations. To get more information we will visualize the relationship between delay and some of the variable such as temp and `precip.
fly_w_cd%>%
group_by(precip)%>%
summarise (dp_delay = mean(dep_delay, na.rm = TRUE))%>%
ggplot(aes(precip, dp_delay))+
geom_line()+
geom_point()
As we can see from the graph, the precip doesn’t provide more information explanation about departure delay and the correlation between these two variables is weak.
fly_w_cd%>%
group_by(temp)%>%na.omit()%>%
summarise (dp_delay = mean(dep_delay, na.rm = TRUE))%>%
ggplot(aes(temp, dp_delay))+
# geom_line(col = "red")+
geom_point()+
geom_smooth( se = FALSE, method = 'loess')
Surprisingly, can that the departure delay increase with temperature but this variation is not significant to provide more explanation about departure delay changes. In the next step I will build the linear regression model.
Modelling weather conditions delay model
set.seed(101)
#delay_weather_cd%>%
wc_delay_sample <- sample.split(delay_weather_cd$dep_delay, SplitRatio = 0.7)
# train set
train <- subset(delay_weather_cd, wc_delay_sample == TRUE)
head(train)
## # A tibble: 6 x 8
## dep_delay humid precip visib wind_dir wind_speed temp dewp
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 -6 57.33 0 10 260 13.80936 4.4 -3.3
## 2 -3 57.33 0 10 260 13.80936 4.4 -3.3
## 3 -3 59.37 0 10 260 12.65858 3.9 -3.3
## 4 -2 59.37 0 10 260 12.65858 3.9 -3.3
## 5 -2 59.37 0 10 260 12.65858 3.9 -3.3
## 6 -1 57.33 0 10 260 13.80936 4.4 -3.3
#Test set
test <- subset(delay_weather_cd, wc_delay_sample == FALSE)
head(test)
## # A tibble: 6 x 8
## dep_delay humid precip visib wind_dir wind_speed temp dewp
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 -5 59.37 0 10 270 10.35702 3.9 -3.3
## 2 -2 57.33 0 10 260 13.80936 4.4 -3.3
## 3 -2 59.37 0 10 260 12.65858 3.9 -3.3
## 4 -2 59.37 0 10 270 10.35702 3.9 -3.3
## 5 0 57.33 0 10 260 13.80936 4.4 -3.3
## 6 0 57.33 0 10 260 13.80936 4.4 -3.3
Build linear model
model <- lm(dep_delay~ ., data = train)
summary(model)
##
## Call:
## lm(formula = dep_delay ~ ., data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -163.36 -16.23 -11.24 -1.52 1297.92
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 46.6275534 2.7821737 16.759 < 2e-16 ***
## humid -0.2886026 0.0281757 -10.243 < 2e-16 ***
## precip 91.4480263 6.2893796 14.540 < 2e-16 ***
## visib -1.8495357 0.0683436 -27.062 < 2e-16 ***
## wind_dir -0.0037708 0.0008162 -4.620 3.84e-06 ***
## wind_speed 0.1414720 0.0073312 19.297 < 2e-16 ***
## temp -0.4577991 0.0963228 -4.753 2.01e-06 ***
## dewp 0.9032992 0.1037279 8.708 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 37.64 on 200090 degrees of freedom
## Multiple R-squared: 0.0172, Adjusted R-squared: 0.01716
## F-statistic: 500.2 on 7 and 200090 DF, p-value: < 2.2e-16
# Residuals
residus <- residuals(model)
residus <- as.data.frame(residus)
#head(residus)
residus%>%
ggplot(aes(residus)) + geom_histogram(fill='blue',alpha=0.5, bins = 20)
Plotting model
Prediction computation
delay_predict <- predict(model, test)
predict_res <- cbind(delay_predict, test$dep_delay)
colnames(predict_res) <- c("predict", "actual")
predict_res<- as.data.frame(predict_res)
head(predict_res)
## predict actual
## 1 6.678672 -5
## 2 7.564640 -2
## 3 7.041987 -2
## 4 6.678672 -2
## 5 7.564640 0
## 6 7.564640 0
head(delay_predict)
## 1 2 3 4 5 6
## 6.678672 7.564640 7.041987 6.678672 7.564640 7.564640
Is the age of the plane associated to delay?
Here, we start by computing the age of each and store it in new column names age.
# This function compute the age the plane as integer.
year_diff <- function(x_year, y_year = as.integer(format(Sys.Date(), "%Y")))
{
x_year <- as.integer(x_year)
return(y_year - x_year)
}
pl_ages <- planes%>%
mutate(age = year_diff(year))%>%
select(tailnum, age, seats)
pl_ages <- pl_ages[complete.cases(pl_ages),]
flight_delay <- flights%>%
select(year:day, origin, hour, arr_delay, dep_delay, tailnum)
# join planes and flights datasets
join_fly_planes <- flight_delay%>%
inner_join(pl_ages, by = "tailnum")
#join_fly_planes
fly_planes <-join_fly_planes%>%
select(age,dep_delay,arr_delay)
fly_planes <-fly_planes[complete.cases(fly_planes),]
fly_planes
## # A tibble: 273,853 x 3
## age dep_delay arr_delay
## <int> <dbl> <dbl>
## 1 19 2 11
## 2 20 4 20
## 3 28 2 33
## 4 6 -1 -18
## 5 27 -6 -25
## 6 6 -4 12
## 7 18 -5 19
## 8 20 -3 -14
## 9 14 -3 -8
## 10 7 -2 -2
## # ... with 273,843 more rows
Now I will perform some plotting to visualize the graphic (age vs delay)
age_delay <- fly_planes%>%
group_by(age)%>%
filter(!is.na(dep_delay)) %>%
summarise(delay = mean(dep_delay)) %>%
ggplot(aes(x = age, y = delay)) +
# geom_point() +
geom_line() +
labs(x = "Age of planes",
y = "Mean of departure delay",
title = "Graph dep_delay vs age")
age_delay
#ggplotly(age_delay)
try correlation between age of planes and arrival delay.
cor_delay_age <-cor(fly_planes)
cor_delay_age
## age dep_delay arr_delay
## age 1.00000000 -0.01513102 -0.01767153
## dep_delay -0.01513102 1.00000000 0.91674159
## arr_delay -0.01767153 0.91674159 1.00000000
According to the correlation table, There is a weak negative correlation between age and both arrival delay and departure delay. Meaning, the age of the planes can not explained the changes of delay.
Format the correlation data
| age | dep_delay | arr_delay | |
|---|---|---|---|
| age | 1.0000000 | -0.0151310 | -0.0176715 |
| dep_delay | -0.0151310 | 1.0000000 | 0.9167416 |
| arr_delay | -0.0176715 | 0.9167416 | 1.0000000 |
It seems like the plane manufacturer could use a cleaning. After that, how many manufacturers have more than 200 planes? And how many flights are each manufacturer with more than 200 planes responsible for?
# Find list of manufacturers that have more than 200 planes.
manufact_200_planes <- planes%>%
filter(!is.na(manufacturer)) %>%
group_by( manufacturer)%>%
summarise(n_planes =n())%>%
arrange(desc(n_planes))%>%
filter(n_planes > 200)
#print list
manufact_200_planes%>%
pander(big.mark = ',', justify = c('left', 'right'))
| manufacturer | n_planes |
|---|---|
| BOEING | 1,630 |
| AIRBUS INDUSTRIE | 400 |
| BOMBARDIER INC | 368 |
| AIRBUS | 336 |
| EMBRAER | 299 |
# Combine the subset manufact_200_planes with flights dataset to find number of flights.
num_planes200 <- planes%>%
inner_join(flights,"tailnum")%>%
filter(manufacturer %in%
manufact_200_planes$manufacturer)%>%
group_by( manufacturer)%>%
summarise(n_flight =n())%>%
arrange(desc(n_flight))
# print list
num_planes200%>%
pander(big.mark = ',', justify = c('left', 'right'))
| manufacturer | n_flight |
|---|---|
| BOEING | 82,912 |
| EMBRAER | 66,068 |
| AIRBUS | 47,302 |
| AIRBUS INDUSTRIE | 40,891 |
| BOMBARDIER INC | 28,272 |
# format number of flights
kable(num_planes200, "html") %>% # format correlation table.
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive", "bordered"))
| manufacturer | n_flight |
|---|---|
| BOEING | 82912 |
| EMBRAER | 66068 |
| AIRBUS | 47302 |
| AIRBUS INDUSTRIE | 40891 |
| BOMBARDIER INC | 28272 |
It turns out that Airbus has several main models, e.g. several A320 (A320-211, A320-212 etc.) and so on. Create a frequency table of the main models for Airbus and how many planes there are in each.
# Find number of model for "AIRBUS"
airbus_models <- planes%>%
group_by(model)%>%
filter(manufacturer =="AIRBUS")%>%
summarise(n_of_planes = n())%>%
arrange(desc(n_of_planes))
# Split column model to find main models
#airbus_models%>%
split_m <- strsplit( airbus_models$model, "-")
main_mdl <- vector("character", length = length(split_m))
for (m in 1: length(split_m)) {
main_mdl[m] <- split_m[[m]][1]
}
# List of main models
m_models <- airbus_models%>%
mutate(main_model = main_mdl)
# m_models %>%
# pander(big.mark = ',', justify = c('left','center', 'right'))
# format list
kable(m_models, "html") %>% # format correlation table.
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive", "bordered"))
| model | n_of_planes | main_model |
|---|---|---|
| A320-232 | 129 | A320 |
| A320-214 | 61 | A320 |
| A321-231 | 49 | A321 |
| A319-114 | 31 | A319 |
| A321-211 | 18 | A321 |
| A319-112 | 14 | A319 |
| A330-243 | 14 | A330 |
| A319-131 | 5 | A319 |
| A319-132 | 5 | A319 |
| A319-115 | 3 | A319 |
| A320-211 | 2 | A320 |
| A320-212 | 2 | A320 |
| A330-223 | 2 | A330 |
| A330-323 | 1 | A330 |
# A Frequency table of main models
m_models%>%
group_by(main_model)%>%
summarise( num_model = n(), num_of_planes = sum(n_of_planes))%>%
pander(big.mark = ',', justify = c('left','center', 'right'))
| main_model | num_model | num_of_planes |
|---|---|---|
| A319 | 5 | 58 |
| A320 | 4 | 194 |
| A321 | 2 | 67 |
| A330 | 3 | 17 |
Are larger planes (measured by number of seats) more or less delayed than smaller planes? To answer this question one can visualize the plot (seats vs arr_delay)
delay_seats <- join_fly_planes%>%
group_by(seats)%>%
summarise(mean_delay = mean(arr_delay, na.rm = TRUE))%>%
arrange(desc(seats))
#delay_seats%>%
#pander(big.mark = ',', justify = c('left', 'right'))
kable(delay_seats, "html") %>%
kable_styling() %>%
scroll_box(width = "500px", height = "600px")
| seats | mean_delay |
|---|---|
| 450 | 120.0000000 |
| 400 | 14.3333333 |
| 379 | 3.2904795 |
| 377 | -6.6354839 |
| 330 | 5.2810127 |
| 300 | 24.2000000 |
| 292 | 6.2159091 |
| 290 | 7.7356322 |
| 275 | 11.9455645 |
| 269 | -4.7500000 |
| 260 | -4.5000000 |
| 255 | -1.1056853 |
| 222 | -8.8192771 |
| 200 | 7.5991997 |
| 199 | 2.6828016 |
| 191 | 0.8033501 |
| 189 | -1.3370618 |
| 182 | 3.2215930 |
| 179 | 4.2978482 |
| 178 | 2.2861217 |
| 172 | -0.8459043 |
| 149 | 3.3884528 |
| 147 | 33.4731183 |
| 145 | 0.3995519 |
| 142 | 5.9049074 |
| 140 | 9.6505463 |
| 139 | 2.3406593 |
| 128 | -3.0526316 |
| 102 | 3.5000000 |
| 100 | 17.7885978 |
| 95 | 8.0389666 |
| 80 | 8.2470027 |
| 55 | 15.8059566 |
| 22 | 7.1729958 |
| 20 | 8.0563663 |
| 16 | -4.2950820 |
| 14 | -1.3461538 |
| 12 | -10.2500000 |
| 11 | 22.4347826 |
| 10 | 5.6666667 |
| 9 | 4.0454545 |
| 8 | 7.8216561 |
| 7 | -3.0338983 |
| 6 | 10.6735395 |
| 5 | 7.1370717 |
| 4 | 6.3654917 |
| 2 | 5.4972527 |
According to this graph, there is no sign that the larger planes are more delayed than the smaller ones.
One can also try to compute the correlation between age, arr_delay and dep_delay.
| seats | arr_delay | dep_delay | |
|---|---|---|---|
| seats | 1.0000000 | -0.0726318 | -0.0541665 |
| arr_delay | -0.0726318 | 1.0000000 | 0.9167416 |
| dep_delay | -0.0541665 | 0.9167416 | 1.0000000 |
There is a weak negative correlation between a number of seats and delay meaning that the number of seats (smaller/ larger) can not explain the arrival delay.
##
## Call:
## lm(formula = seat_delay$seats ~ seat_delay$arr_delay + seat_delay$dep_delay)
##
## Residuals:
## Min 1Q Median 3Q Max
## -145.48 -75.35 10.74 50.37 351.51
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 137.892496 0.151073 912.75 <2e-16 ***
## seat_delay$arr_delay -0.229291 0.007595 -30.19 <2e-16 ***
## seat_delay$dep_delay 0.137780 0.008443 16.32 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 71.33 on 273850 degrees of freedom
## Multiple R-squared: 0.006242, Adjusted R-squared: 0.006234
## F-statistic: 860 on 2 and 273850 DF, p-value: < 2.2e-16
## (Intercept) seat_delay$arr_delay seat_delay$dep_delay
## 137.8924960 -0.2292909 0.1377797
On a map (map_data("usa")), plot the airports that has flights to them.
usa <- map_data("usa")
#airports$faa <- as.factor(airports$faa)
fli_dest <- flights%>%
group_by(dest)%>%
summarise(n = n())%>%
inner_join(airports, by =c(dest = "faa"))
## Warning: Column `dest`/`faa` joining factor and character vector, coercing
## into character vector
fli_dest
## # A tibble: 101 x 9
## dest n name lat lon alt
## <chr> <int> <chr> <dbl> <dbl> <int>
## 1 ABQ 254 Albuquerque International Sunport 35.04022 -106.60919 5355
## 2 ACK 265 Nantucket Mem 41.25305 -70.06018 48
## 3 ALB 439 Albany Intl 42.74827 -73.80169 285
## 4 ANC 8 Ted Stevens Anchorage Intl 61.17436 -149.99636 152
## 5 ATL 17215 Hartsfield Jackson Atlanta Intl 33.63672 -84.42807 1026
## 6 AUS 2439 Austin Bergstrom Intl 30.19453 -97.66989 542
## 7 AVL 275 Asheville Regional Airport 35.43619 -82.54181 2165
## 8 BDL 443 Bradley Intl 41.93889 -72.68322 173
## 9 BGR 375 Bangor Intl 44.80744 -68.82814 192
## 10 BHM 297 Birmingham Intl 33.56294 -86.75355 644
## # ... with 91 more rows, and 3 more variables: tz <dbl>, dst <chr>,
## # tzone <chr>
Make a similar plot, but now points must have size relative to the number of flights each airport is destination for.
map_data <- ggplot() +
geom_polygon(data = usa, aes(long, lat, group = group), fill = "grey") +
geom_point(data =fli_dest, aes(x = lon, y = lat, size = n , color = n)) +
borders("state")+
scale_color_viridis()+
labs(x = "longitude",
y = "latitude",
title = "map flights for destinations")+
coord_quickmap()
map_data
#ggplotly(map_data)
Do a principal component analysis of the weather at JFK using the following columns: temp, dewp, humid, wind_dir, wind_speed, precip, visib (only on complete.cases()). How many principal components should be used to capture the variability in the weather data?
weather_dc <- mutate_at(weather, vars(temp, dewp),funs(fahrenheit_to_celcius))
weather_jfk_df <- weather_dc%>%
group_by(origin)%>%
filter(origin == "JFK") %>%
select(origin,temp, dewp, humid, wind_dir, wind_speed, precip, visib)
head(weather_jfk_df)
## # A tibble: 6 x 8
## # Groups: origin [1]
## origin temp dewp humid wind_dir wind_speed precip visib
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 JFK 3.3 -5.0 54.51 240 16.11092 0 10
## 2 JFK 3.3 -4.4 57.04 250 17.26170 0 10
## 3 JFK 3.9 -3.9 56.77 240 19.56326 0 10
## 4 JFK 3.9 -3.3 59.37 240 18.41248 0 10
## 5 JFK 3.9 -3.9 56.77 260 14.96014 0 10
## 6 JFK 3.9 -3.3 59.37 260 12.65858 0 10
# Here we consider the completecases
weather_jfk<- weather_jfk_df[complete.cases(weather_jfk_df),]
weather_jfk <- weather_jfk[, sapply(weather_jfk, is.numeric)]
summary and correlation plot
Principal component analysis PCA
## # A tibble: 6 x 7
## temp dewp humid wind_dir wind_speed precip visib
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 3.3 -5.0 54.51 240 16.11092 0 10
## 2 3.3 -4.4 57.04 250 17.26170 0 10
## 3 3.9 -3.9 56.77 240 19.56326 0 10
## 4 3.9 -3.3 59.37 240 18.41248 0 10
## 5 3.9 -3.9 56.77 260 14.96014 0 10
## 6 3.9 -3.3 59.37 260 12.65858 0 10
## temp dewp humid wind_dir
## Min. :-11.10 Min. :-23.300 Min. : 15.21 Min. : 0.0
## 1st Qu.: 4.40 1st Qu.: -2.800 1st Qu.: 49.19 1st Qu.:130.0
## Median : 12.20 Median : 6.000 Median : 65.80 Median :220.0
## Mean : 12.43 Mean : 5.441 Mean : 65.11 Mean :202.9
## 3rd Qu.: 20.60 3rd Qu.: 14.400 3rd Qu.: 82.45 3rd Qu.:290.0
## Max. : 36.70 Max. : 25.600 Max. :100.00 Max. :360.0
## wind_speed precip visib
## Min. : 0.000 Min. :0.000000 Min. : 0.000
## 1st Qu.: 6.905 1st Qu.:0.000000 1st Qu.:10.000
## Median :10.357 Median :0.000000 Median :10.000
## Mean :11.390 Mean :0.002731 Mean : 9.125
## 3rd Qu.:14.960 3rd Qu.:0.000000 3rd Qu.:10.000
## Max. :42.579 Max. :0.530000 Max. :10.000
## Importance of components%s:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 1.6176 1.2088 1.0637 0.8780 0.78271 0.63604 0.04986
## Proportion of Variance 0.3738 0.2087 0.1616 0.1101 0.08752 0.05779 0.00036
## Cumulative Proportion 0.3738 0.5826 0.7442 0.8543 0.94185 0.99964 1.00000
Accordingly, to the different plots and summary table, one can see that more than 94% of weather variability can be captured by 5 PCs. So we choose the 5 first components as principal ones.