Data607 HW5

Loading the required packages

library(tidyr)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(ggplot2)

AlaskaAM<-read.csv("AlaskaAM.csv", header=TRUE, sep=",")
head(AlaskaAM)

##         X     X.1 Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1  ALASKA on time         497     221       212           503    1841
## 2         delayed          62      12        20           102     305
## 3                          NA      NA        NA            NA      NA
## 4 AM WEST on time         694    4840       383           320     201
## 5         delayed         117     415        65           129      61

Renaming the first two columns X,X.1 to Airline and Status

Filter out the blank row separating the two airlines

AlaskaAM2<-AlaskaAM %>% 
  rename(Airline=X, Status=X.1) %>% 
  filter(Status!="")

Fill the blank row with the correct Airline

AlaskaAM2$Airline<-as.character(AlaskaAM2$Airline)
AlaskaAM2$Airline[AlaskaAM2$Airline==""]<-c("ALASKA", "AM WEST")
AlaskaAM2

##   Airline  Status Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1  ALASKA on time         497     221       212           503    1841
## 2  ALASKA delayed          62      12        20           102     305
## 3 AM WEST on time         694    4840       383           320     201
## 4 AM WEST delayed         117     415        65           129      61

Gather the cities under one column City and number of flights column under Number_Flights

Split the Number_Flights column into two delayed and on time.

tidydata1<-AlaskaAM2 %>% 
  gather(City, Number_Flights, Los.Angeles:Seattle) 
tidydata2<-tidydata1%>% 
  spread(Status, Number_Flights)
tidydata2

##    Airline          City delayed on time
## 1   ALASKA   Los.Angeles      62     497
## 2   ALASKA       Phoenix      12     221
## 3   ALASKA     San.Diego      20     212
## 4   ALASKA San.Francisco     102     503
## 5   ALASKA       Seattle     305    1841
## 6  AM WEST   Los.Angeles     117     694
## 7  AM WEST       Phoenix     415    4840
## 8  AM WEST     San.Diego      65     383
## 9  AM WEST San.Francisco     129     320
## 10 AM WEST       Seattle      61     201

##Group the tidydata by Airline and then use summarise function to compute summaries of interest

tidydata2 %>% 
  group_by(Airline) %>% 
  summarise(Min_Delays=min(delayed),
            Max_Delays=max(delayed),
            Avg_Delays=mean(delayed),
            Total_Delays=sum(delayed))

## # A tibble: 2 x 5
##   Airline Min_Delays Max_Delays Avg_Delays Total_Delays
##   <chr>        <int>      <int>      <dbl>        <int>
## 1 ALASKA          12        305       100.          501
## 2 AM WEST         61        415       157.          787

Using the summarise function we can calculate the total delays of both airlines. We can tell from the total delays what AM West have more delays compared to Alaska Airline. We also noticed AM West have majority of the delays in the city Phoenix.

Let’s put this in a graph because I love graphs.

Graph Analysis

ggplot(data = tidydata1, aes(x=Airline,y=Number_Flights))+
  #geom_point(alpha = 0.5, size = 5, color ='blue')
  geom_bar(stat = 'identity',aes(fill=Airline))+
  geom_text(aes(x = Airline, y = Number_Flights, 
                label = paste(Number_Flights),
                group = Airline,
                vjust = -0.4)) +
  labs(title = "Delays of Airlines & City", 
       x = "Airline", 
       y = "Delay Flight Count") +
  facet_wrap(~City, ncol = 5)+
  theme_bw()