Reinaldo Zezela University of Derby, UK r.zezela1@unimail.derby.ac.uk May 2018
This is an R Markdown Notebook. When you execute code within the notebook, the results appear beneath the code.
- Loading the flights On-time Performance dataset.
- Create Data Frame: CARRIER
CODE = c( "AA", "AS", "B6", "DL", "EV", "F9", "HA", "NK", "OO", "UA", "VX", "WN" )
DESCRIPTION <- c("American Airlines Inc.", "Alaska Airlines Inc.", "JetBlue Airways", "Delta Air Lines Inc.", "ExpressJet Airlines Inc.", "Frontier Airlines Inc.", "Hawaiian Airlines Inc.", "Spirit Air Lines", "SkyWest Airlines Inc.", "United Air Lines Inc.", "Virgin America", "Southwest Airlines Co.")
CARRIER <- data.frame(CODE, DESCRIPTION)
CARRIER
2.0 Exploratory Data Analysis ———————————————–
- The Average Arrival Delays (Minutes) by Airline in 2017
# Prepare data: group mean Arrival delay by Airline in 2017
tbl_avg <- FLIGHTS %>%
group_by(CARRIER) %>%
summarise(avg_Arr_delay = round(mean(ARR_DELAY_NEW, na.rm = TRUE), digits=2)) %>%
arrange(.$avg_Arr_delay)
head(tbl_avg)
** Merge the subset(tbl_avg) with CARRIER table (data frame) in order to get the Airline name
#CARRIER
head(m1)
*Plotting average arrival delay by Airline
ggplot(m1, aes(x=reorder(DESCRIPTION,avg_Arr_delay), y=avg_Arr_delay)) +
geom_bar(stat="identity", fill="tomato") +
xlab("Airline") +
ylab("Average Arrival Delay") +
theme(axis.text.x = element_text(angle=90))

3.0 Cause of Delay
*3.1 Cumulative minutes delayed by cause of delay
*Plotting Causes of delay
ggplot(tbl_delay_type, aes(x=MONTH)) +
geom_line(aes(y=CARRIER_DELAY, color = "darkred")) +
geom_line(aes(y=WEATHER_DELAY, color="steelblue"), linetype="twodash") +
geom_line(aes(y=NAS_DELAY, color = "#999999")) +
geom_line(aes(y=SECURITY_DELAY, color = "#E69F00")) +
geom_line(aes(y=LATE_AIRCRAFT_DELAY, color = "black")) +
#scale_fill_discrete(name="Cause of Delay", labels=c("CARRIER_DELAY", "WEATHER_DELAY", "NAS_DELAY", "SECURITY_DELAY", "LATE_AIRCRAFT_DELAY" )) +
scale_y_continuous(name ="Cumulative minutes delayed by cause of delay", labels = scales::comma ) +
scale_color_discrete(name = "Cause of Delay", labels=c("CARRIER_DELAY", "WEATHER_DELAY", "NAS_DELAY", "SECURITY_DELAY", "LATE_AIRCRAFT_DELAY" )) +
theme_minimal()

5.General Delay Trends

5.1. Plotting facet_wrap - General Delay Trends

6.Heatmap

Add a new chunk by clicking the Insert Chunk button on the toolbar or by pressing Ctrl+Alt+I.
When you save the notebook, an HTML file containing the code and output will be saved alongside it (click the Preview button or press Ctrl+Shift+K to preview the HTML file).
---
title: 'Data Analysis Airline On-time Performance '
output:
  html_notebook: default
  html_document: default
  pdf_document: default
  word_document: default
---
Reinaldo Zezela 
University of Derby, UK
r.zezela1@unimail.derby.ac.uk 
May 2018


This is an [R Markdown](http://rmarkdown.rstudio.com) Notebook. When you execute code within the notebook, the results appear beneath the code. 


* Loading the flights On-time Performance dataset.

```{r}
library(dplyr)
library(ggplot2)
library(ggthemes)
library(lubridate)

# Read the flight records from FLIGHTS.csv (relative path) into a data frame.
FLIGHTS <- read.csv("FLIGHTS.csv")

# Show the first rows as a quick check of the import.
head(FLIGHTS)

```

* Create Data Frame: CARRIER

```{r}
# Lookup table from two-letter carrier code to airline name.
# CODE and DESCRIPTION are parallel vectors: the i-th name belongs to the
# i-th code.
CODE <- c(
  "AA", "AS", "B6", "DL", "EV", "F9",
  "HA", "NK", "OO", "UA", "VX", "WN"
)
DESCRIPTION <- c(
  "American Airlines Inc.",
  "Alaska Airlines Inc.",
  "JetBlue Airways",
  "Delta Air Lines Inc.",
  "ExpressJet Airlines Inc.",
  "Frontier Airlines Inc.",
  "Hawaiian Airlines Inc.",
  "Spirit Air Lines",
  "SkyWest Airlines Inc.",
  "United Air Lines Inc.",
  "Virgin America",
  "Southwest Airlines Co."
)
CARRIER <- data.frame(CODE, DESCRIPTION)

# Print the table so it appears in the rendered notebook.
CARRIER

```

  
# 2.0 Exploratory Data Analysis -----------------------------------------------

* The Average Arrival Delays (Minutes) by Airline in 2017

```{r}
# Mean of ARR_DELAY_NEW per carrier code (missing values ignored), rounded to
# two decimals and sorted from the smallest to the largest average.
tbl_avg <- FLIGHTS %>%
  group_by(CARRIER) %>%
  summarise(
    avg_Arr_delay = round(mean(ARR_DELAY_NEW, na.rm = TRUE), digits = 2)
  ) %>%
  arrange(avg_Arr_delay)

head(tbl_avg)
```


* Merge the summary table (tbl_avg) with the CARRIER data frame in order to get the airline name
```{r}
# Attach the airline name to each row of tbl_avg by matching its CARRIER
# column against CARRIER$CODE. With merge()'s defaults only codes present in
# both tables are kept.
m1 <- merge(x = tbl_avg, y = CARRIER, by.x = "CARRIER", by.y = "CODE")

head(m1)
```

* Plotting average arrival delay by airline

```{r}
# Bar chart of the average arrival delay per airline. Bars are ordered by
# increasing delay and each bar is annotated with its value.
ggplot(
  m1,
  aes(x = reorder(DESCRIPTION, avg_Arr_delay), y = avg_Arr_delay)
) +
  geom_bar(stat = "identity", fill = "tomato") +
  # vjust above 1 places the label just under the top edge of the bar
  geom_text(
    aes(label = avg_Arr_delay),
    vjust = 1.6,
    color = "white",
    size = 3.5
  ) +
  xlab("Airline") +
  ylab("Average Arrival Delay") +
  theme(axis.text.x = element_text(angle = 90))
```



# 3.0	Cause of Delay

* 3.1 Cumulative minutes delayed by cause of delay

```{r}
# Parse the flight date (day/month/year text) into a Date so that it can be
# truncated to whole months below.
FLIGHTS$FL_DATE <- as.Date(FLIGHTS$FL_DATE, format = "%d/%m/%Y")

head(FLIGHTS)

# Total of each of the five delay-cause columns per calendar month; missing
# values are skipped when summing.
tbl_delay_type <- FLIGHTS %>%
  mutate(MONTH = floor_date(FL_DATE, "month")) %>%
  group_by(MONTH) %>%
  summarise(
    CARRIER_DELAY = sum(CARRIER_DELAY, na.rm = TRUE),
    WEATHER_DELAY = sum(WEATHER_DELAY, na.rm = TRUE),
    NAS_DELAY = sum(NAS_DELAY, na.rm = TRUE),
    SECURITY_DELAY = sum(SECURITY_DELAY, na.rm = TRUE),
    LATE_AIRCRAFT_DELAY = sum(LATE_AIRCRAFT_DELAY, na.rm = TRUE)
  )

head(tbl_delay_type)
```


* Plotting causes of delay
```{r}
# Monthly delay totals, one line per cause of delay.
#
# The colour aesthetic is mapped to the *name* of the cause, and the actual
# colours are assigned in scale_color_manual(). Putting colour strings such as
# "darkred" inside aes() makes ggplot2 treat them as category labels: the
# requested colours are ignored and the legend entries are ordered
# alphabetically by those strings, so hand-written legend labels end up
# attached to the wrong lines.
delay_colours <- c(
  CARRIER_DELAY = "darkred",
  WEATHER_DELAY = "steelblue",
  NAS_DELAY = "#999999",
  SECURITY_DELAY = "#E69F00",
  LATE_AIRCRAFT_DELAY = "black"
)

ggplot(tbl_delay_type, aes(x = MONTH)) +
  geom_line(aes(y = CARRIER_DELAY, color = "CARRIER_DELAY")) +
  geom_line(
    aes(y = WEATHER_DELAY, color = "WEATHER_DELAY"),
    linetype = "twodash"
  ) +
  geom_line(aes(y = NAS_DELAY, color = "NAS_DELAY")) +
  geom_line(aes(y = SECURITY_DELAY, color = "SECURITY_DELAY")) +
  geom_line(aes(y = LATE_AIRCRAFT_DELAY, color = "LATE_AIRCRAFT_DELAY")) +
  scale_y_continuous(
    name = "Cumulative minutes delayed by cause of delay",
    labels = scales::comma
  ) +
  # breaks fixes the legend order; values gives each cause its own colour
  scale_color_manual(
    name = "Cause of Delay",
    values = delay_colours,
    breaks = names(delay_colours)
  ) +
  theme_minimal()
```


# 4.0 ON TIME PERFORMANCE: Arrived flights


ON TIME PERFORMANCE: In 2017, 79.28 percent of the flights at US airports arrived within 15 minutes of the scheduled arrival time.

Percent of Arrival within 15 minutes of schedule

```{r}
library(ggrepel)

# Per-carrier on-time share. A flight counts as on time when DELAY_STATUS is 0;
# flights with a missing DELAY_STATUS are left out of both counts.
df <- FLIGHTS %>%
  filter(!is.na(DELAY_STATUS)) %>%
  group_by(CARRIER) %>%
  summarise(
    Num_flights = n(),
    Num_Ontime = sum(DELAY_STATUS == 0),
    PCT = round(Num_Ontime / Num_flights, 4) * 100
  )

# Reference level: unweighted mean of the carriers' on-time percentages.
ref_avg <- round(mean(df$PCT), 2)

# Flag each carrier as above / below the reference level.
df$delay_type <- ifelse(df$PCT < ref_avg, "below", "above")

# Add the airline name used for the point labels.
m2 <- merge(df, CARRIER, by.x = "CARRIER", by.y = "CODE")

# Scatter of on-time percentage against flight volume, one labelled point per
# airline, with a dashed line at the reference level.
# NOTE(review): the percentage in the title is hard-coded; confirm it matches
# the data whenever FLIGHTS.csv changes.
ggplot(
  m2,
  aes(x = Num_flights, y = PCT, label = DESCRIPTION, color = delay_type)
) +
  geom_point(aes(size = Num_flights)) +
  geom_text_repel() +
  scale_x_continuous(name = "Number of flights", labels = scales::comma) +
  ylab("Percent of Arrival within 15 minutes") +
  ggtitle("In 2017, 79.28% of the flights arrived within 15 minutes.") +
  # Passed as a parameter, not via aes(): a mapped yintercept is evaluated
  # against the plot data and draws one identical line per row of m2.
  geom_hline(yintercept = ref_avg, lty = 2) +
  scale_color_discrete(name = "Delay_Type") +
  theme_minimal()
```



# 5. General Delay Trends

```{r}
# scales supplies date_breaks() / date_format() for the date axis. library()
# stops with an error if the package is missing, whereas require() only
# returns FALSE and lets the chunk fail later with a less obvious message.
library(scales)

# Average departure delay per flight date, day of week and carrier.
df <- FLIGHTS %>%
  group_by(FL_DATE, DAY_OF_WEEK, CARRIER) %>%
  summarise(AVG_DEP_DELAY = mean(DEP_DELAY_NEW, na.rm = TRUE)) %>%
  ungroup() # summarise() removes only the last grouping level

# Make sure FL_DATE is a Date, which scale_x_date() requires.
df$FL_DATE <- as.Date(df$FL_DATE, "%d/%m/%Y")

# Daily averages as points with a loess trend line; x axis labelled by month.
ggplot(df, aes(x = FL_DATE, y = AVG_DEP_DELAY)) +
  geom_point() +
  geom_smooth(color = "Red", method = "loess", se = FALSE) +
  scale_x_date(
    breaks = date_breaks("months"),
    labels = date_format("%b")
  ) +
  theme(axis.text.x = element_text(angle = 90))
```

## 5.1. Plotting facet_wrap - General Delay Trends

```{r}

# Add the airline name (DESCRIPTION) to the daily averages held in df.
m_TREND_DELAY <- merge(df, CARRIER, by.x = "CARRIER", by.y = "CODE")

# General delay trend, one panel per airline: daily average departure delay
# with a loess trend line.
# The scales helpers are namespace-qualified so that this chunk does not depend
# on an earlier chunk having attached the package.
ggplot(m_TREND_DELAY, aes(x = FL_DATE, y = AVG_DEP_DELAY)) +
  geom_point() +
  geom_smooth(color = "Red", method = "loess", se = FALSE) +
  facet_wrap(~ DESCRIPTION) +
  scale_x_date(
    breaks = scales::date_breaks("months"),
    labels = scales::date_format("%b")
  ) +
  theme(axis.text.x = element_text(angle = 90))
```





# 6. Heatmap

```{r}
# Average departure delay for every (departure time block, day of week) cell.
# The table must hold exactly one row per cell: with a finer grouping (for
# example by date and carrier as well) geom_tile() draws many tiles on top of
# each other and only the last one drawn is visible, so the colours would not
# represent the average for the cell.
# To facet by carrier or month, add that variable to group_by() and to the
# facet specification together.
df <- FLIGHTS %>%
  group_by(DEP_TIME_BLK, DAY_OF_WEEK) %>%
  summarise(AVG_DEP_DELAY = mean(DEP_DELAY_NEW, na.rm = TRUE)) %>%
  ungroup()

# Treat both axes as discrete categories.
df$DAY_OF_WEEK <- as.factor(df$DAY_OF_WEEK)
df$DEP_TIME_BLK <- as.factor(df$DEP_TIME_BLK)

# Heatmap: darker tiles mean a higher average departure delay.
ggplot(df, aes(x = DEP_TIME_BLK, y = DAY_OF_WEEK, fill = AVG_DEP_DELAY)) +
  geom_tile(color = "white") +
  scale_fill_gradient(low = "#00AFBB", high = "black") +
  theme(axis.text.x = element_text(angle = 90))
```


















Add a new chunk by clicking the *Insert Chunk* button on the toolbar or by pressing *Ctrl+Alt+I*.

When you save the notebook, an HTML file containing the code and output will be saved alongside it (click the *Preview* button or press *Ctrl+Shift+K* to preview the HTML file).
