1.Load all the required libraries

library(tidyverse)
library(plotly)

2.Read CSV from github

df <- read_csv("https://raw.githubusercontent.com/qixing810/CUNYSPS-DataScience/master/DS607/dataset/w5.csv")
df
## # A tibble: 5 x 7
##   X1     X2      `Los Angeles` Phoenix `San Diego` `San Francisco` Seattle
##   <chr>  <chr>           <int>   <dbl>       <int>           <int>   <dbl>
## 1 ALASKA on time           497     221         212             503    1841
## 2 <NA>   delayed            62      12          20             102     305
## 3 <NA>   <NA>               NA      NA          NA              NA      NA
## 4 AMWEST on time           694    4840         383             320     201
## 5 <NA>   delayed           117     415          65             129      61

3. Clean the dataset

# remove the null row
df <- df[-3,]

# replace the remain null value
df[2,1] <- "ALASKA"
df[4,1] <- "AMWEST"

# add column names for the first two columns
colnames(df)[1] <- "Airline"
colnames(df)[2] <- "Status"

df
## # A tibble: 4 x 7
##   Airline Status  `Los Angeles` Phoenix `San Diego` `San Francisco` Seattle
##   <chr>   <chr>           <int>   <dbl>       <int>           <int>   <dbl>
## 1 ALASKA  on time           497     221         212             503    1841
## 2 ALASKA  delayed            62      12          20             102     305
## 3 AMWEST  on time           694    4840         383             320     201
## 4 AMWEST  delayed           117     415          65             129      61

4. Spread and gather dataset

# gather the cities into a single column destination
# spread the value in status column to separate columns
df1 <- df %>%
  gather('Los Angeles','Phoenix', 'San Diego', 'San Francisco', 'Seattle',
         key = 'Destination',value = 'count') %>%
  spread(Status,count)
df1
## # A tibble: 10 x 4
##    Airline Destination   delayed `on time`
##    <chr>   <chr>           <dbl>     <dbl>
##  1 ALASKA  Los Angeles        62       497
##  2 ALASKA  Phoenix            12       221
##  3 ALASKA  San Diego          20       212
##  4 ALASKA  San Francisco     102       503
##  5 ALASKA  Seattle           305      1841
##  6 AMWEST  Los Angeles       117       694
##  7 AMWEST  Phoenix           415      4840
##  8 AMWEST  San Diego          65       383
##  9 AMWEST  San Francisco     129       320
## 10 AMWEST  Seattle            61       201

5. Data Analysis

  • Compare the arrival delays of the two airlines
airline_delay<- df %>%
  mutate(total = rowSums(df[,4:7])) %>%
  ggplot(aes(Airline,total))+
  geom_bar(stat = 'identity',aes(fill = Status), position='dodge')+
  labs(title = "Arrival delays & on time")+
  xlab("Airlines")+
  ylab("Number of flights")
ggplotly(airline_delay)
  • Compare the arrival delays of the four cities
city_delay <- df1 %>%
  group_by(Destination,Airline) %>%
  summarise(total = sum(delayed)) %>%
  ggplot(aes(Destination,total))+
  geom_bar(stat = 'identity',aes(fill = Airline))+  
  labs(title = "Arrival delays")+
  xlab("Destination")+
  ylab("Number of delays")
ggplotly(city_delay)