library(tidyverse)
library(lubridate)
library(pander) # for prettier tables
library(scales) # for making prettier axes in plots
library(stringr)
## added
##########################
library(plotly)
library(devtools)
library(maps)
library(corrgram)
library(corrplot)
library(knitr)
library(kableExtra)
library(caTools)
library(modelr)
library(GGally)
library(viridis)
library(factoextra)
##########################

theme_set(theme_bw())

panderOptions('big.mark', ',')

1 Formalia

Deadline for hand-in: Jan 3, 2018 at 23:55.

Where: Moodle.

What: Rmd file. Possibly also pdf (or html), but Rmd is mandatory.

Groups: Maximum 3 participants, however the project must be handed in individually.

2 Exercises

Here, we focus on the airlines, airports, flights, planes, and weather datasets:

## Warning: package 'nycflights13' was built under R version 3.4.3

## # A tibble: 16 x 2
##    carrier                        name
##      <chr>                       <chr>
##  1      9E           Endeavor Air Inc.
##  2      AA      American Airlines Inc.
##  3      AS        Alaska Airlines Inc.
##  4      B6             JetBlue Airways
##  5      DL        Delta Air Lines Inc.
##  6      EV    ExpressJet Airlines Inc.
##  7      F9      Frontier Airlines Inc.
##  8      FL AirTran Airways Corporation
##  9      HA      Hawaiian Airlines Inc.
## 10      MQ                   Envoy Air
## 11      OO       SkyWest Airlines Inc.
## 12      UA       United Air Lines Inc.
## 13      US             US Airways Inc.
## 14      VX              Virgin America
## 15      WN      Southwest Airlines Co.
## 16      YV          Mesa Airlines Inc.

## # A tibble: 1,458 x 8
##      faa                           name      lat        lon   alt    tz
##    <chr>                          <chr>    <dbl>      <dbl> <int> <dbl>
##  1   04G              Lansdowne Airport 41.13047  -80.61958  1044    -5
##  2   06A  Moton Field Municipal Airport 32.46057  -85.68003   264    -6
##  3   06C            Schaumburg Regional 41.98934  -88.10124   801    -6
##  4   06N                Randall Airport 41.43191  -74.39156   523    -5
##  5   09J          Jekyll Island Airport 31.07447  -81.42778    11    -5
##  6   0A9 Elizabethton Municipal Airport 36.37122  -82.17342  1593    -5
##  7   0G6        Williams County Airport 41.46731  -84.50678   730    -5
##  8   0G7  Finger Lakes Regional Airport 42.88356  -76.78123   492    -5
##  9   0P2   Shoestring Aviation Airfield 39.79482  -76.64719  1000    -5
## 10   0S9          Jefferson County Intl 48.05381 -122.81064   108    -8
## # ... with 1,448 more rows, and 2 more variables: dst <chr>, tzone <chr>

## # A tibble: 336,776 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>
##  1  2013     1     1      517            515         2      830
##  2  2013     1     1      533            529         4      850
##  3  2013     1     1      542            540         2      923
##  4  2013     1     1      544            545        -1     1004
##  5  2013     1     1      554            600        -6      812
##  6  2013     1     1      554            558        -4      740
##  7  2013     1     1      555            600        -5      913
##  8  2013     1     1      557            600        -3      709
##  9  2013     1     1      557            600        -3      838
## 10  2013     1     1      558            600        -2      753
## # ... with 336,766 more rows, and 12 more variables: sched_arr_time <int>,
## #   arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## #   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## #   minute <dbl>, time_hour <dttm>

## # A tibble: 3,322 x 9
##    tailnum  year                    type     manufacturer     model
##      <chr> <int>                   <chr>            <chr>     <chr>
##  1  N10156  2004 Fixed wing multi engine          EMBRAER EMB-145XR
##  2  N102UW  1998 Fixed wing multi engine AIRBUS INDUSTRIE  A320-214
##  3  N103US  1999 Fixed wing multi engine AIRBUS INDUSTRIE  A320-214
##  4  N104UW  1999 Fixed wing multi engine AIRBUS INDUSTRIE  A320-214
##  5  N10575  2002 Fixed wing multi engine          EMBRAER EMB-145LR
##  6  N105UW  1999 Fixed wing multi engine AIRBUS INDUSTRIE  A320-214
##  7  N107US  1999 Fixed wing multi engine AIRBUS INDUSTRIE  A320-214
##  8  N108UW  1999 Fixed wing multi engine AIRBUS INDUSTRIE  A320-214
##  9  N109UW  1999 Fixed wing multi engine AIRBUS INDUSTRIE  A320-214
## 10  N110UW  1999 Fixed wing multi engine AIRBUS INDUSTRIE  A320-214
## # ... with 3,312 more rows, and 4 more variables: engines <int>,
## #   seats <int>, speed <int>, engine <chr>

## # A tibble: 26,130 x 15
##    origin  year month   day  hour  temp  dewp humid wind_dir wind_speed
##     <chr> <dbl> <dbl> <int> <int> <dbl> <dbl> <dbl>    <dbl>      <dbl>
##  1    EWR  2013     1     1     0 37.04 21.92 53.97      230   10.35702
##  2    EWR  2013     1     1     1 37.04 21.92 53.97      230   13.80936
##  3    EWR  2013     1     1     2 37.94 21.92 52.09      230   12.65858
##  4    EWR  2013     1     1     3 37.94 23.00 54.51      230   13.80936
##  5    EWR  2013     1     1     4 37.94 24.08 57.04      240   14.96014
##  6    EWR  2013     1     1     6 39.02 26.06 59.37      270   10.35702
##  7    EWR  2013     1     1     7 39.02 26.96 61.63      250    8.05546
##  8    EWR  2013     1     1     8 39.02 28.04 64.43      240   11.50780
##  9    EWR  2013     1     1     9 39.92 28.04 62.21      250   12.65858
## 10    EWR  2013     1     1    10 39.02 28.04 64.43      260   12.65858
## # ... with 26,120 more rows, and 5 more variables: wind_gust <dbl>,
## #   precip <dbl>, pressure <dbl>, visib <dbl>, time_hour <dttm>

Remember to read about the datasets.

3 Exercises

3.1 Exercise

Construct a barplot displaying number of flights per month.

# Here, we convert these variables to the categorical ones.

flights$month <- as.factor(flights$month)
flights$origin <- as.factor(flights$origin)
flights$dest <- as.factor(flights$dest)

planes$manufacturer <-as.factor(planes$manufacturer)

# Vector Name of months

v_months <- c("1" = "Jan", "2" = "Feb","3" = "Mar", 
              "4" = "Apr", "5" = "May", "6" = "Jun",
              "7" = "Jul", "8" = "Aug","9" = "sep",                 
              "10" = "Oct","11" = "Nov", "12" = "Dec")

Find the number of flights per month.

 flights%>%
  group_by(month)%>% na.omit()%>%
  summarise(n_of_flight =n())%>%
  pander(big.mark = ',', justify = c('left', 'right'))

month	n_of_flight
1	26,398
2	23,611
3	27,902
4	27,564
5	28,128
6	27,075
7	28,293
8	28,756
9	27,010
10	28,618
11	26,971
12	27,020

Now we can construct a barplot displaying number of flights per month.

nfly_per_month <- flights%>%
  group_by(year, month)%>%
  ggplot(aes(x = month))+
  geom_bar() +
  labs(x = "Months", 
       y = "Number of flights",
       title = "Number of flights per month") +
  scale_x_discrete(labels = v_months) 

nfly_per_month

 # ggplotly(nfly_per_month)  # to makee it interactive.

Now, in the barplot (showing number of flights per month), make a separate bar for each origin.

origin_flights <- flights%>%
  ggplot(aes( x = month))+
  geom_bar(aes(fill = origin), position = "dodge") +
  labs(x = "Months",
       y = "Number of flights", 
       title = "flights for each origin per month") +
  scale_x_discrete(labels = v_months)

#facet_wrap(~ origin)

# ggplotly(origin_flights) # make it interactive
  origin_flights

List of the number of flights for each origin per month

months_list <- flights%>%
  group_by(month, origin)%>%
  summarise(n_of_flights = n())


 # months_list%>%
  #  pander(big.mark = ',', justify = c('left','center', 'right'))

kable(months_list, "html") %>%
  kable_styling("striped","hover") %>%
  scroll_box(width = "500px", height = "600px")

month	origin	n_of_flights
1	EWR	9893
1	JFK	9161
1	LGA	7950
2	EWR	9107
2	JFK	8421
2	LGA	7423
3	EWR	10420
3	JFK	9697
3	LGA	8717
4	EWR	10531
4	JFK	9218
4	LGA	8581
5	EWR	10592
5	JFK	9397
5	LGA	8807
6	EWR	10175
6	JFK	9472
6	LGA	8596
7	EWR	10475
7	JFK	10023
7	LGA	8927
8	EWR	10359
8	JFK	9983
8	LGA	8985
9	EWR	9550
9	JFK	8908
9	LGA	9116
10	EWR	10104
10	JFK	9143
10	LGA	9642
11	EWR	9707
11	JFK	8710
11	LGA	8851
12	EWR	9922
12	JFK	9146
12	LGA	9067

3.2 Exercise

What are the top-10 destinations and how many flights were made to these?

# The top-10 destinations and number of flights to these.

top10 <-flights%>%
  group_by(dest)%>% na.omit()%>%
  summarise(n_flight= n())%>%
  arrange(desc(n_flight))%>%
  select( dest, n_flight)%>%
  head(10)

## Warning: package 'bindrcpp' was built under R version 3.4.2

# List of top-10 destinations and number of flights to these.

top10%>%
  pander(big.mark = '.', decimal.mark = ',', justify = c('left', 'right'))

dest	n_flight
ATL	16.837
ORD	16.566
LAX	16.026
BOS	15.022
MCO	13.967
CLT	13.674
SFO	13.173
FLL	11.897
MIA	11.593
DCA	9.111

For these 10 destinations, make a barplot illustrating the number of flights from origin to top-10 destination.

nfly_to_top10 <-top10%>%
 ggplot(aes(dest, n_flight))+
  geom_col() +
  labs(x = "Destinations",
       y = "Number of flights",
       title = "flights to top-10 destinations")

# ggplotly(nfly_to_top10) # Make it interactive
 nfly_to_top10

Now, order the bars (destinations) according to the total number of flights to these destinations.

nfly_to_top10 <- top10 %>% 
ggplot( aes(x= reorder(dest,-n_flight),n_flight))+
  geom_bar(stat = "identity")+
  labs(x = "Top-10 destinations",
       y = "Number of flights",
       title = "Number of flights")
nfly_to_top10

 #ggplotly(nfly_to_top10)

3.3 Exercise

Is the mean air time from each of the three origins different? Further, are these differences statistically significant?

flight_air_time <- flights %>% 
  filter(!is.na(air_time)) %>%
  select(air_time, origin)


flight_air_time %>%
  aov(air_time ~ origin, data = .) %>% summary()

##                 Df    Sum Sq  Mean Sq F value Pr(>F)    
## origin           2 1.935e+08 96741314   11817 <2e-16 ***
## Residuals   327343 2.680e+09     8186                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

# Visualize the mean air time for all origins to see the differences between them.

m_air_time_month <- flights%>%
  group_by(origin, month, day)%>%
  summarise(mean_air =  mean(air_time, na.rm = TRUE))%>%
   arrange(desc(mean_air))

 m_air_plot <- m_air_time_month%>% 
ggplot(aes(origin, mean_air))+ theme_bw() +
  geom_boxplot()+
  labs(x = "Origin",
       y = "Mean of air times",
       title = "Origin mean time")
  m_air_plot

 #ggplotly(m_air_time_month)

3.4 Exercise

How many weather observations are there for each origin?

# Compute the number of observations for each origin.
weather%>%
  group_by(origin)%>% na.omit()%>%
  summarise(n_observ = n())%>% # Number of observations
  arrange(desc(n_observ))%>%
  pander(big.mark = ',', justify = c('left', 'right'))

origin	n_observ
JFK	7,836
LGA	7,624
EWR	7,561

Convert temperature to degrees Celsius. This will be used in the reminder of this miniproject. (We do this for both temp and dewp using mutate_at)

# Convert degree fahrenheit to degree celcius.
fahrenheit_to_celcius <- function(df)
{
  t_celcius <- (df - 32)* 5/9 
    
    return(t_celcius)
}

# Convert temp and dewp to degree selcius.
weather_dc <-  mutate_at(weather,    vars(temp,dewp),funs(fahrenheit_to_celcius))

head(weather_dc)

## # A tibble: 6 x 15
##   origin  year month   day  hour  temp  dewp humid wind_dir wind_speed
##    <chr> <dbl> <dbl> <int> <int> <dbl> <dbl> <dbl>    <dbl>      <dbl>
## 1    EWR  2013     1     1     0   2.8  -5.6 53.97      230   10.35702
## 2    EWR  2013     1     1     1   2.8  -5.6 53.97      230   13.80936
## 3    EWR  2013     1     1     2   3.3  -5.6 52.09      230   12.65858
## 4    EWR  2013     1     1     3   3.3  -5.0 54.51      230   13.80936
## 5    EWR  2013     1     1     4   3.3  -4.4 57.04      240   14.96014
## 6    EWR  2013     1     1     6   3.9  -3.3 59.37      270   10.35702
## # ... with 5 more variables: wind_gust <dbl>, precip <dbl>,
## #   pressure <dbl>, visib <dbl>, time_hour <dttm>

# convert variables  origin and month to fator. 

weather_dc$origin <- as.factor(weather_dc$origin)
weather_dc$month <- as.factor(weather_dc$month)

Construct a graph displaying the temperature at JFK.

 jfk_temp <- weather_dc%>%

  group_by(year, month, day)%>%
  filter(origin =="JFK")%>%
  ggplot(aes(month, temp))+
  geom_boxplot() +
  labs(x = "Months",
       y = "Temperature in degree C",
       title = "Graph temperature at JFK") +
  scale_x_discrete(labels = v_months)

 jfk_temp

 # ggplotly(jfk_temp)

Add a red line showing the mean temperature for each day.

l_mean_temp <- weather_dc%>% # with temp in degree celcius
 mutate(mydate = make_date(year, month, day))%>%
  group_by(year,month, mydate)%>%
 
  filter(origin =="JFK")%>%
  summarise(m_temp = mean(temp))%>%

  ggplot(aes(mydate, m_temp))+
  geom_point()+
 geom_smooth(se = FALSE, col = "red", method = 'loess') +
  labs(x = "Temperature",
       y = "days",
       title = "Mean temperature for each day")
l_mean_temp

# ggplotly(l_mean_temp)

Now, visualize the daily mean temperature for each origin airport.

mean_temp_orign <- weather_dc%>%
  group_by(origin,year,month, day)%>%
  summarise(mean_temp = mean(temp, na.rm = TRUE))%>%
  ggplot(aes(origin,mean_temp))+
  geom_boxplot(aes(fill = origin))+
  #geom_point()+
  
  labs(x = "Origin airport ",
       y = "Mean temperature",
       title = "Mean temperature ")
  
mean_temp_orign

#ggplotly(mean_temp_orign)

3.5 Exercise

Investigate if arrival delay is associated with the flight distance (and also departure delay).

In order to investigate if there is any statistical dependence or association between some variables, one can calculate the correlation between these variables. To do so, we will first investigate if these variables are numeric and if there is no missing values.

 select_cols <- flights%>%
  select(arr_delay, dep_delay, distance)  # Select relevant columns from flights

#Take complete cases. 

 flights_complet_cases <- select_cols[complete.cases(select_cols),]

Calculate the correlation between variables

# Correlation test between different variables or columns.

cor_test1 <- cor(flights_complet_cases)

cor_test_tb <- as.data.frame(cor_test1)

# cor_test_tb

Format the correlation data

   kable(cor_test_tb, "html") %>% # format correlation table.
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "bordered", "responsive"))

	arr_delay	dep_delay	distance
arr_delay	1.0000000	0.9148028	-0.0618678
dep_delay	0.9148028	1.0000000	-0.0216809
distance	-0.0618678	-0.0216809	1.0000000

We can also plot the correlation data to visualize the relation between variables.

corrplot(cor_test1, method="color", type = "upper",
         tl.srt = 45,addCoef.col = "black")

As we can see from the different graphs and correlation parameters , there is a strong correlation between arrival delay and departure delay. In the contrast, there is a negative correlation between the distance and arrival delay. In the next step, I will build a linear regression model using these three variables.

Linear model arrival delay and departure delay

# Model arrival delay and departure delay

 del_model1 <- lm(arr_delay ~ dep_delay, data = flights_complet_cases)
coef(del_model1)

## (Intercept)   dep_delay 
##   -5.899493    1.019093

summary(del_model1)

## 
## Call:
## lm(formula = arr_delay ~ dep_delay, data = flights_complet_cases)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -107.587  -11.005   -1.883    8.938  201.938 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -5.8994935  0.0330195  -178.7   <2e-16 ***
## dep_delay    1.0190929  0.0007864  1295.8   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 18.03 on 327344 degrees of freedom
## Multiple R-squared:  0.8369, Adjusted R-squared:  0.8369 
## F-statistic: 1.679e+06 on 1 and 327344 DF,  p-value: < 2.2e-16

from this result, one can conclude that departure delay is an important predictor of arrival delay. The changes of departure delay can explain the changes in arrival delay.

pred_plot <- flights_complet_cases%>%
  add_predictions(del_model1)%>%
  add_residuals(del_model1)%>%
  
  ggplot(aes(dep_delay, pred))+
  geom_line()+
  geom_line(aes(dep_delay, arr_delay), linetype = "dashed", col = "red")  

pred_plot

# Graph arr_delay vs dep_dealy using intercept and slope from the model.

flights_complet_cases%>%
  ggplot(aes(arr_delay, dep_delay))+
  geom_point()+
 geom_abline (intercept = -5.89, slope = 1, col= "red") +
  
  labs(x = "arrivel delay ",
       y = "departure delay",
       title = "arrival vs departure delay ")

Model arrival delay, departure delay and distance

# Model arrival delay, departure delay and distance
del_model2 <- lm(arr_delay ~ dep_delay+ distance, data = flights_complet_cases)
 coef(del_model2)

##  (Intercept)    dep_delay     distance 
## -3.212779441  1.018077208 -0.002550586

summary(del_model2)

## 
## Call:
## lm(formula = arr_delay ~ dep_delay + distance, data = flights_complet_cases)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -107.944  -11.069   -2.019    8.728  205.562 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -3.213e+00  5.560e-02  -57.78   <2e-16 ***
## dep_delay    1.018e+00  7.823e-04 1301.32   <2e-16 ***
## distance    -2.551e-03  4.259e-05  -59.88   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 17.93 on 327343 degrees of freedom
## Multiple R-squared:  0.8386, Adjusted R-squared:  0.8386 
## F-statistic: 8.506e+05 on 2 and 327343 DF,  p-value: < 2.2e-16

From this result, one can conclude that the changes of arrival delay can not be explained by the distance.

3.6 Exercise

Investigate if departure delay is associated with weather conditions at the origin airport. This includes descriptives (mean departure delay), plotting, regression modelling, considering missing values etc.

# We selct the relevant columns from  both flights and weather datasets
# then join  the two subsets.

flight_delay <- flights%>%
  select(year:day, origin, hour, arr_delay, dep_delay, tailnum)
#flight_delay

Join subset flights and planes datasets

fly_w_cd <-  inner_join(flight_delay, weather_dc,
                        by = c("origin" = "origin",
                                 "year" = "year",
                                "month" = "month",
                                  "day" = "day",
                                 "hour" = "hour"))

#c("year", "day","month", "hour", "origin")

# Here  I consider only the completecases from dataset.

 fly_weather_cd <- fly_w_cd[complete.cases(fly_w_cd),]
 
head(fly_weather_cd)

## # A tibble: 6 x 18
##    year  month   day origin  hour arr_delay dep_delay tailnum  temp  dewp
##   <dbl> <fctr> <int> <fctr> <dbl>     <dbl>     <dbl>   <chr> <dbl> <dbl>
## 1  2013      1     1    LGA     6       -25        -6  N668DN   4.4  -3.3
## 2  2013      1     1    EWR     6        19        -5  N516JB   3.9  -3.3
## 3  2013      1     1    LGA     6       -14        -3  N829AS   4.4  -3.3
## 4  2013      1     1    JFK     6        -8        -3  N593JB   3.9  -3.3
## 5  2013      1     1    LGA     6         8        -2  N3ALAA   4.4  -3.3
## 6  2013      1     1    JFK     6        -2        -2  N793JB   3.9  -3.3
## # ... with 8 more variables: humid <dbl>, wind_dir <dbl>,
## #   wind_speed <dbl>, wind_gust <dbl>, precip <dbl>, pressure <dbl>,
## #   visib <dbl>, time_hour <dttm>

Now we can compute the correlation between numeric relevant variables.

num_cols <- sapply(fly_weather_cd, is.numeric)

fly_weather_cd  <- fly_weather_cd[, num_cols] # numeric columns

delay_weather_cd <- fly_weather_cd%>%
  select(dep_delay, humid,precip, visib,
         wind_dir, wind_speed, temp, dewp)

cor_delay_wc <- cor(delay_weather_cd)

cor_delay_wc

##              dep_delay       humid      precip       visib    wind_dir
## dep_delay   1.00000000  0.01626939  0.04982871 -0.06474770 -0.01888189
## humid       0.01626939  1.00000000  0.18760812 -0.45766077 -0.30772261
## precip      0.04982871  0.18760812  1.00000000 -0.33617445 -0.07022729
## visib      -0.06474770 -0.45766077 -0.33617445  1.00000000  0.17848803
## wind_dir   -0.01888189 -0.30772261 -0.07022729  0.17848803  1.00000000
## wind_speed  0.03304020 -0.15572301  0.01767296  0.04653220  0.17852744
## temp        0.09473431  0.07142219 -0.02406459  0.02821394 -0.10464847
## dewp        0.08817979  0.49954113  0.04886785 -0.14709823 -0.22306808
##             wind_speed        temp        dewp
## dep_delay   0.03304020  0.09473431  0.08817979
## humid      -0.15572301  0.07142219  0.49954113
## precip      0.01767296 -0.02406459  0.04886785
## visib       0.04653220  0.02821394 -0.14709823
## wind_dir    0.17852744 -0.10464847 -0.22306808
## wind_speed  1.00000000 -0.06833782 -0.12665952
## temp       -0.06833782  1.00000000  0.89586845
## dewp       -0.12665952  0.89586845  1.00000000

We can Plot the correlation data to see the relations between variables

corrplot(cor_delay_wc, method="color", type = "upper",
         tl.srt = 45,addCoef.col = "black")

#corrplot(cor_delay_wc, method = 'number')

According to correlation plots, there is no strong correlation between departure delay and any of the weather conditions variables and there is a tight competition between positive and negative correlations. To get more information we will visualize the relationship between delay and some of the variable such as temp and `precip.

fly_w_cd%>%
  group_by(precip)%>%
  summarise (dp_delay = mean(dep_delay, na.rm = TRUE))%>%
  ggplot(aes(precip, dp_delay))+
  geom_line()+
 
  geom_point()

As we can see from the graph, the precip doesn’t provide more information explanation about departure delay and the correlation between these two variables is weak.

fly_w_cd%>%
  group_by(temp)%>%na.omit()%>%
  summarise (dp_delay = mean(dep_delay, na.rm = TRUE))%>%
  ggplot(aes(temp, dp_delay))+
 # geom_line(col = "red")+
 
  geom_point()+
 geom_smooth(  se = FALSE, method = 'loess')

Surprisingly, can that the departure delay increase with temperature but this variation is not significant to provide more explanation about departure delay changes. In the next step I will build the linear regression model.

Modelling weather conditions delay model

set.seed(101)
#delay_weather_cd%>%
wc_delay_sample <- sample.split(delay_weather_cd$dep_delay, SplitRatio = 0.7)

# train set
train <- subset(delay_weather_cd, wc_delay_sample == TRUE)
head(train)

## # A tibble: 6 x 8
##   dep_delay humid precip visib wind_dir wind_speed  temp  dewp
##       <dbl> <dbl>  <dbl> <dbl>    <dbl>      <dbl> <dbl> <dbl>
## 1        -6 57.33      0    10      260   13.80936   4.4  -3.3
## 2        -3 57.33      0    10      260   13.80936   4.4  -3.3
## 3        -3 59.37      0    10      260   12.65858   3.9  -3.3
## 4        -2 59.37      0    10      260   12.65858   3.9  -3.3
## 5        -2 59.37      0    10      260   12.65858   3.9  -3.3
## 6        -1 57.33      0    10      260   13.80936   4.4  -3.3

#Test set
test <- subset(delay_weather_cd, wc_delay_sample == FALSE)
head(test)

## # A tibble: 6 x 8
##   dep_delay humid precip visib wind_dir wind_speed  temp  dewp
##       <dbl> <dbl>  <dbl> <dbl>    <dbl>      <dbl> <dbl> <dbl>
## 1        -5 59.37      0    10      270   10.35702   3.9  -3.3
## 2        -2 57.33      0    10      260   13.80936   4.4  -3.3
## 3        -2 59.37      0    10      260   12.65858   3.9  -3.3
## 4        -2 59.37      0    10      270   10.35702   3.9  -3.3
## 5         0 57.33      0    10      260   13.80936   4.4  -3.3
## 6         0 57.33      0    10      260   13.80936   4.4  -3.3

Build linear model

model <- lm(dep_delay~ ., data = train)
summary(model)

## 
## Call:
## lm(formula = dep_delay ~ ., data = train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -163.36  -16.23  -11.24   -1.52 1297.92 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 46.6275534  2.7821737  16.759  < 2e-16 ***
## humid       -0.2886026  0.0281757 -10.243  < 2e-16 ***
## precip      91.4480263  6.2893796  14.540  < 2e-16 ***
## visib       -1.8495357  0.0683436 -27.062  < 2e-16 ***
## wind_dir    -0.0037708  0.0008162  -4.620 3.84e-06 ***
## wind_speed   0.1414720  0.0073312  19.297  < 2e-16 ***
## temp        -0.4577991  0.0963228  -4.753 2.01e-06 ***
## dewp         0.9032992  0.1037279   8.708  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 37.64 on 200090 degrees of freedom
## Multiple R-squared:  0.0172, Adjusted R-squared:  0.01716 
## F-statistic: 500.2 on 7 and 200090 DF,  p-value: < 2.2e-16

# Residuals

residus <- residuals(model)
residus <- as.data.frame(residus)
#head(residus)

residus%>%
ggplot(aes(residus)) +  geom_histogram(fill='blue',alpha=0.5, bins = 20)

Plotting model

Prediction computation

delay_predict <- predict(model, test)
predict_res  <- cbind(delay_predict, test$dep_delay)
colnames(predict_res) <- c("predict", "actual")
predict_res<- as.data.frame(predict_res)

 head(predict_res)

##    predict actual
## 1 6.678672     -5
## 2 7.564640     -2
## 3 7.041987     -2
## 4 6.678672     -2
## 5 7.564640      0
## 6 7.564640      0

 head(delay_predict)

##        1        2        3        4        5        6 
## 6.678672 7.564640 7.041987 6.678672 7.564640 7.564640

3.7 Exercise

Is the age of the plane associated to delay?

Here, we start by computing the age of each and store it in new column names age.

# This function compute the age the plane as integer.
year_diff <- function(x_year, y_year = as.integer(format(Sys.Date(), "%Y")))
{
   x_year <- as.integer(x_year)
  
    
 return(y_year - x_year)
 
}

pl_ages <- planes%>%
 mutate(age = year_diff(year))%>%
  select(tailnum, age, seats)
pl_ages <- pl_ages[complete.cases(pl_ages),]


flight_delay <- flights%>%
  select(year:day, origin, hour, arr_delay, dep_delay, tailnum)

#  join planes and flights datasets

join_fly_planes <- flight_delay%>%
  inner_join(pl_ages, by = "tailnum")
  #join_fly_planes
  
 fly_planes <-join_fly_planes%>%
   select(age,dep_delay,arr_delay)
 fly_planes <-fly_planes[complete.cases(fly_planes),]
 fly_planes

## # A tibble: 273,853 x 3
##      age dep_delay arr_delay
##    <int>     <dbl>     <dbl>
##  1    19         2        11
##  2    20         4        20
##  3    28         2        33
##  4     6        -1       -18
##  5    27        -6       -25
##  6     6        -4        12
##  7    18        -5        19
##  8    20        -3       -14
##  9    14        -3        -8
## 10     7        -2        -2
## # ... with 273,843 more rows

Now I will perform some plotting to visualize the graphic (age vs delay)

age_delay <- fly_planes%>%
  group_by(age)%>%
  filter(!is.na(dep_delay)) %>%
  summarise(delay = mean(dep_delay)) %>%
  ggplot(aes(x = age, y = delay)) +
 # geom_point() +
  geom_line() +
  labs(x = "Age of planes",
       y = "Mean of departure delay",
       title = "Graph dep_delay vs age")
 age_delay

#ggplotly(age_delay)

try correlation between age of planes and arrival delay.

cor_delay_age  <-cor(fly_planes)

cor_delay_age

##                   age   dep_delay   arr_delay
## age        1.00000000 -0.01513102 -0.01767153
## dep_delay -0.01513102  1.00000000  0.91674159
## arr_delay -0.01767153  0.91674159  1.00000000

According to the correlation table, There is a weak negative correlation between age and both arrival delay and departure delay. Meaning, the age of the planes can not explained the changes of delay.

Format the correlation data

	age	dep_delay	arr_delay
age	1.0000000	-0.0151310	-0.0176715
dep_delay	-0.0151310	1.0000000	0.9167416
arr_delay	-0.0176715	0.9167416	1.0000000

3.8 Exercise

It seems like the plane manufacturer could use a cleaning. After that, how many manufacturers have more than 200 planes? And how many flights are each manufacturer with more than 200 planes responsible for?

# Find list of manufacturers that have more than 200 planes.
manufact_200_planes <- planes%>%
  filter(!is.na(manufacturer)) %>%
group_by( manufacturer)%>%
  summarise(n_planes =n())%>%
  arrange(desc(n_planes))%>%
  filter(n_planes > 200)
#print list 
  manufact_200_planes%>%
    pander(big.mark = ',', justify = c('left', 'right'))

manufacturer	n_planes
BOEING	1,630
AIRBUS INDUSTRIE	400
BOMBARDIER INC	368
AIRBUS	336
EMBRAER	299

 # Combine the subset manufact_200_planes with flights dataset to find number of flights.
  
num_planes200 <- planes%>%
  inner_join(flights,"tailnum")%>% 
  filter(manufacturer %in%
           manufact_200_planes$manufacturer)%>% 
  
  group_by( manufacturer)%>%
  summarise(n_flight =n())%>%
  arrange(desc(n_flight))

  # print list
  num_planes200%>%
     pander(big.mark = ',', justify = c('left', 'right'))

manufacturer	n_flight
BOEING	82,912
EMBRAER	66,068
AIRBUS	47,302
AIRBUS INDUSTRIE	40,891
BOMBARDIER INC	28,272

 # format number of flights
  
 kable(num_planes200, "html") %>% # format correlation table.
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive", "bordered"))

manufacturer	n_flight
BOEING	82912
EMBRAER	66068
AIRBUS	47302
AIRBUS INDUSTRIE	40891
BOMBARDIER INC	28272

3.9 Exercise

It turns out that Airbus has several main models, e.g. several A320 (A320-211, A320-212 etc.) and so on. Create a frequency table of the main models for Airbus and how many planes there are in each.

# Find number of model for "AIRBUS"

airbus_models <- planes%>% 
  group_by(model)%>%
  filter(manufacturer =="AIRBUS")%>%
  summarise(n_of_planes = n())%>%
  arrange(desc(n_of_planes))

# Split column model to find main models
#airbus_models%>%
  
  split_m <- strsplit( airbus_models$model, "-")

main_mdl <- vector("character", length = length(split_m))

for (m in 1: length(split_m)) {
  
  main_mdl[m] <- split_m[[m]][1]
  
}
# List of main models

m_models <- airbus_models%>%
  mutate(main_model =  main_mdl)

# m_models %>%
 # pander(big.mark = ',', justify = c('left','center', 'right'))

 # format list
  
 kable(m_models, "html") %>% # format correlation table.
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive", "bordered"))

model	n_of_planes	main_model
A320-232	129	A320
A320-214	61	A320
A321-231	49	A321
A319-114	31	A319
A321-211	18	A321
A319-112	14	A319
A330-243	14	A330
A319-131	5	A319
A319-132	5	A319
A319-115	3	A319
A320-211	2	A320
A320-212	2	A320
A330-223	2	A330
A330-323	1	A330

# A Frequency table of main models

m_models%>%
  group_by(main_model)%>%
  summarise( num_model = n(), num_of_planes = sum(n_of_planes))%>%
pander(big.mark = ',', justify = c('left','center', 'right'))

main_model	num_model	num_of_planes
A319	5	58
A320	4	194
A321	2	67
A330	3	17

3.10 Exercise

Are larger planes (measured by number of seats) more or less delayed than smaller planes? To answer this question one can visualize the plot (seats vs arr_delay)

delay_seats <- join_fly_planes%>%
  group_by(seats)%>%
  summarise(mean_delay = mean(arr_delay, na.rm = TRUE))%>%
  arrange(desc(seats))

#delay_seats%>%
#pander(big.mark = ',', justify = c('left', 'right'))

kable(delay_seats, "html") %>%
  kable_styling() %>%
  scroll_box(width = "500px", height = "600px")

seats	mean_delay
450	120.0000000
400	14.3333333
379	3.2904795
377	-6.6354839
330	5.2810127
300	24.2000000
292	6.2159091
290	7.7356322
275	11.9455645
269	-4.7500000
260	-4.5000000
255	-1.1056853
222	-8.8192771
200	7.5991997
199	2.6828016
191	0.8033501
189	-1.3370618
182	3.2215930
179	4.2978482
178	2.2861217
172	-0.8459043
149	3.3884528
147	33.4731183
145	0.3995519
142	5.9049074
140	9.6505463
139	2.3406593
128	-3.0526316
102	3.5000000
100	17.7885978
95	8.0389666
80	8.2470027
55	15.8059566
22	7.1729958
20	8.0563663
16	-4.2950820
14	-1.3461538
12	-10.2500000
11	22.4347826
10	5.6666667
9	4.0454545
8	7.8216561
7	-3.0338983
6	10.6735395
5	7.1370717
4	6.3654917
2	5.4972527

According to this graph, there is no sign that the larger planes are more delayed than the smaller ones.

One can also try to compute the correlation between age, arr_delay and dep_delay.

	seats	arr_delay	dep_delay
seats	1.0000000	-0.0726318	-0.0541665
arr_delay	-0.0726318	1.0000000	0.9167416
dep_delay	-0.0541665	0.9167416	1.0000000

There is a weak negative correlation between a number of seats and delay meaning that the number of seats (smaller/ larger) can not explain the arrival delay.

## 
## Call:
## lm(formula = seat_delay$seats ~ seat_delay$arr_delay + seat_delay$dep_delay)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -145.48  -75.35   10.74   50.37  351.51 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          137.892496   0.151073  912.75   <2e-16 ***
## seat_delay$arr_delay  -0.229291   0.007595  -30.19   <2e-16 ***
## seat_delay$dep_delay   0.137780   0.008443   16.32   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 71.33 on 273850 degrees of freedom
## Multiple R-squared:  0.006242,   Adjusted R-squared:  0.006234 
## F-statistic:   860 on 2 and 273850 DF,  p-value: < 2.2e-16

##          (Intercept) seat_delay$arr_delay seat_delay$dep_delay 
##          137.8924960           -0.2292909            0.1377797

3.11 Exercise

On a map (map_data("usa")), plot the airports that has flights to them.

usa <- map_data("usa")


#airports$faa <- as.factor(airports$faa)

fli_dest <- flights%>%
  group_by(dest)%>%
  summarise(n = n())%>%
  inner_join(airports, by =c(dest = "faa"))

## Warning: Column `dest`/`faa` joining factor and character vector, coercing
## into character vector

fli_dest

## # A tibble: 101 x 9
##     dest     n                              name      lat        lon   alt
##    <chr> <int>                             <chr>    <dbl>      <dbl> <int>
##  1   ABQ   254 Albuquerque International Sunport 35.04022 -106.60919  5355
##  2   ACK   265                     Nantucket Mem 41.25305  -70.06018    48
##  3   ALB   439                       Albany Intl 42.74827  -73.80169   285
##  4   ANC     8        Ted Stevens Anchorage Intl 61.17436 -149.99636   152
##  5   ATL 17215   Hartsfield Jackson Atlanta Intl 33.63672  -84.42807  1026
##  6   AUS  2439             Austin Bergstrom Intl 30.19453  -97.66989   542
##  7   AVL   275        Asheville Regional Airport 35.43619  -82.54181  2165
##  8   BDL   443                      Bradley Intl 41.93889  -72.68322   173
##  9   BGR   375                       Bangor Intl 44.80744  -68.82814   192
## 10   BHM   297                   Birmingham Intl 33.56294  -86.75355   644
## # ... with 91 more rows, and 3 more variables: tz <dbl>, dst <chr>,
## #   tzone <chr>

Make a similar plot, but now points must have size relative to the number of flights each airport is destination for.

map_data <- ggplot() + 
  geom_polygon(data = usa, aes(long, lat, group = group), fill = "grey") +

geom_point(data =fli_dest, aes(x = lon, y = lat, size = n , color = n)) + 
  borders("state")+
  scale_color_viridis()+
  
  labs(x = "longitude",
       y = "latitude",
       title = "map flights for destinations")+
coord_quickmap()
map_data

#ggplotly(map_data)

3.12 Exercise

Do a principal component analysis of the weather at JFK using the following columns: temp, dewp, humid, wind_dir, wind_speed, precip, visib (only on complete.cases()). How many principal components should be used to capture the variability in the weather data?

weather_dc <-  mutate_at(weather, vars(temp, dewp),funs(fahrenheit_to_celcius))

weather_jfk_df <- weather_dc%>%
  group_by(origin)%>%
filter(origin == "JFK") %>%
   select(origin,temp, dewp, humid, wind_dir, wind_speed, precip, visib)

head(weather_jfk_df)

## # A tibble: 6 x 8
## # Groups:   origin [1]
##   origin  temp  dewp humid wind_dir wind_speed precip visib
##    <chr> <dbl> <dbl> <dbl>    <dbl>      <dbl>  <dbl> <dbl>
## 1    JFK   3.3  -5.0 54.51      240   16.11092      0    10
## 2    JFK   3.3  -4.4 57.04      250   17.26170      0    10
## 3    JFK   3.9  -3.9 56.77      240   19.56326      0    10
## 4    JFK   3.9  -3.3 59.37      240   18.41248      0    10
## 5    JFK   3.9  -3.9 56.77      260   14.96014      0    10
## 6    JFK   3.9  -3.3 59.37      260   12.65858      0    10

# Here we consider the completecases

weather_jfk<- weather_jfk_df[complete.cases(weather_jfk_df),]
 
weather_jfk <-  weather_jfk[, sapply(weather_jfk, is.numeric)]

summary and correlation plot

Principal component analysis PCA

## # A tibble: 6 x 7
##    temp  dewp humid wind_dir wind_speed precip visib
##   <dbl> <dbl> <dbl>    <dbl>      <dbl>  <dbl> <dbl>
## 1   3.3  -5.0 54.51      240   16.11092      0    10
## 2   3.3  -4.4 57.04      250   17.26170      0    10
## 3   3.9  -3.9 56.77      240   19.56326      0    10
## 4   3.9  -3.3 59.37      240   18.41248      0    10
## 5   3.9  -3.9 56.77      260   14.96014      0    10
## 6   3.9  -3.3 59.37      260   12.65858      0    10

##       temp             dewp             humid           wind_dir    
##  Min.   :-11.10   Min.   :-23.300   Min.   : 15.21   Min.   :  0.0  
##  1st Qu.:  4.40   1st Qu.: -2.800   1st Qu.: 49.19   1st Qu.:130.0  
##  Median : 12.20   Median :  6.000   Median : 65.80   Median :220.0  
##  Mean   : 12.43   Mean   :  5.441   Mean   : 65.11   Mean   :202.9  
##  3rd Qu.: 20.60   3rd Qu.: 14.400   3rd Qu.: 82.45   3rd Qu.:290.0  
##  Max.   : 36.70   Max.   : 25.600   Max.   :100.00   Max.   :360.0  
##    wind_speed         precip             visib       
##  Min.   : 0.000   Min.   :0.000000   Min.   : 0.000  
##  1st Qu.: 6.905   1st Qu.:0.000000   1st Qu.:10.000  
##  Median :10.357   Median :0.000000   Median :10.000  
##  Mean   :11.390   Mean   :0.002731   Mean   : 9.125  
##  3rd Qu.:14.960   3rd Qu.:0.000000   3rd Qu.:10.000  
##  Max.   :42.579   Max.   :0.530000   Max.   :10.000

## Importance of components%s:
##                           PC1    PC2    PC3    PC4     PC5     PC6     PC7
## Standard deviation     1.6176 1.2088 1.0637 0.8780 0.78271 0.63604 0.04986
## Proportion of Variance 0.3738 0.2087 0.1616 0.1101 0.08752 0.05779 0.00036
## Cumulative Proportion  0.3738 0.5826 0.7442 0.8543 0.94185 0.99964 1.00000

Accordingly, to the different plots and summary table, one can see that more than 94% of weather variability can be captured by 5 PCs. So we choose the 5 first components as principal ones.

Miniproject in Introduction to Data Science ITVEST Data Science and Big Data (DSBD)

Youssouf Souare

December 28, 2017