1.Load all the required libraries
library(tidyverse)
library(plotly)
2.Read CSV from github
df <- read_csv("https://raw.githubusercontent.com/qixing810/CUNYSPS-DataScience/master/DS607/dataset/w5.csv")
df
## # A tibble: 5 x 7
## X1 X2 `Los Angeles` Phoenix `San Diego` `San Francisco` Seattle
## <chr> <chr> <int> <dbl> <int> <int> <dbl>
## 1 ALASKA on time 497 221 212 503 1841
## 2 <NA> delayed 62 12 20 102 305
## 3 <NA> <NA> NA NA NA NA NA
## 4 AMWEST on time 694 4840 383 320 201
## 5 <NA> delayed 117 415 65 129 61
3. Clean the dataset
# remove the null row
df <- df[-3,]
# replace the remain null value
df[2,1] <- "ALASKA"
df[4,1] <- "AMWEST"
# add column names for the first two columns
colnames(df)[1] <- "Airline"
colnames(df)[2] <- "Status"
df
## # A tibble: 4 x 7
## Airline Status `Los Angeles` Phoenix `San Diego` `San Francisco` Seattle
## <chr> <chr> <int> <dbl> <int> <int> <dbl>
## 1 ALASKA on time 497 221 212 503 1841
## 2 ALASKA delayed 62 12 20 102 305
## 3 AMWEST on time 694 4840 383 320 201
## 4 AMWEST delayed 117 415 65 129 61
4. Spread and gather dataset
# gather the cities into a single column destination
# spread the value in status column to separate columns
df1 <- df %>%
gather('Los Angeles','Phoenix', 'San Diego', 'San Francisco', 'Seattle',
key = 'Destination',value = 'count') %>%
spread(Status,count)
df1
## # A tibble: 10 x 4
## Airline Destination delayed `on time`
## <chr> <chr> <dbl> <dbl>
## 1 ALASKA Los Angeles 62 497
## 2 ALASKA Phoenix 12 221
## 3 ALASKA San Diego 20 212
## 4 ALASKA San Francisco 102 503
## 5 ALASKA Seattle 305 1841
## 6 AMWEST Los Angeles 117 694
## 7 AMWEST Phoenix 415 4840
## 8 AMWEST San Diego 65 383
## 9 AMWEST San Francisco 129 320
## 10 AMWEST Seattle 61 201
5. Data Analysis
- Compare the arrival delays of the two airlines
airline_delay<- df %>%
mutate(total = rowSums(df[,4:7])) %>%
ggplot(aes(Airline,total))+
geom_bar(stat = 'identity',aes(fill = Status), position='dodge')+
labs(title = "Arrival delays & on time")+
xlab("Airlines")+
ylab("Number of flights")
ggplotly(airline_delay)
- Compare the arrival delays of the four cities
city_delay <- df1 %>%
group_by(Destination,Airline) %>%
summarise(total = sum(delayed)) %>%
ggplot(aes(Destination,total))+
geom_bar(stat = 'identity',aes(fill = Airline))+
labs(title = "Arrival delays")+
xlab("Destination")+
ylab("Number of delays")
ggplotly(city_delay)