library(dplyr)
## Warning: package 'dplyr' was built under R version 3.5.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
## Warning: package 'tidyr' was built under R version 3.5.3
library(ggplot2)
Airlines <- read.csv("https://raw.githubusercontent.com/uplotnik/Data607/master/Airlines.csv", na.strings = c("", "NA"),
sep = ",", header = TRUE)
Airlines
## X X.1 Los.Angeles Phonexi San.Diego San.Francisco Seattle
## 1 ALASKA on time 497 221 21 503 1841
## 2 <NA> delayed 62 12 20 102 305
## 3 <NA> <NA> NA NA NA NA NA
## 4 AM WEST on time 694 4840 383 320 201
## 5 <NA> delayed 117 415 65 129 61
names(Airlines)[1] <- "Carrier"
names(Airlines)[2] <- "Flight_status"
Airlines
## Carrier Flight_status Los.Angeles Phonexi San.Diego San.Francisco
## 1 ALASKA on time 497 221 21 503
## 2 <NA> delayed 62 12 20 102
## 3 <NA> <NA> NA NA NA NA
## 4 AM WEST on time 694 4840 383 320
## 5 <NA> delayed 117 415 65 129
## Seattle
## 1 1841
## 2 305
## 3 NA
## 4 201
## 5 61
Airlines <- Airlines %>% fill(Carrier)
Airlines
## Carrier Flight_status Los.Angeles Phonexi San.Diego San.Francisco
## 1 ALASKA on time 497 221 21 503
## 2 ALASKA delayed 62 12 20 102
## 3 ALASKA <NA> NA NA NA NA
## 4 AM WEST on time 694 4840 383 320
## 5 AM WEST delayed 117 415 65 129
## Seattle
## 1 1841
## 2 305
## 3 NA
## 4 201
## 5 61
We can gather each city within one column variable and also gather the values associated with each city in a second column variable.
Airlines <- Airlines %>% gather(City, Number, Los.Angeles:Seattle)
head(Airlines)
## Carrier Flight_status City Number
## 1 ALASKA on time Los.Angeles 497
## 2 ALASKA delayed Los.Angeles 62
## 3 ALASKA <NA> Los.Angeles NA
## 4 AM WEST on time Los.Angeles 694
## 5 AM WEST delayed Los.Angeles 117
## 6 ALASKA on time Phonexi 221
Airline <- spread(Airlines,"Flight_status", "Number")
head(Airline)
## Carrier City delayed on time <NA>
## 1 ALASKA Los.Angeles 62 497 NA
## 2 ALASKA Phonexi 12 221 NA
## 3 ALASKA San.Diego 20 21 NA
## 4 ALASKA San.Francisco 102 503 NA
## 5 ALASKA Seattle 305 1841 NA
## 6 AM WEST Los.Angeles 117 694 NA
(in order to avoid errors with future manipulations)
Airline<- dplyr::rename(Airline, 'Timely'='on time')
Airline
## Carrier City delayed Timely <NA>
## 1 ALASKA Los.Angeles 62 497 NA
## 2 ALASKA Phonexi 12 221 NA
## 3 ALASKA San.Diego 20 21 NA
## 4 ALASKA San.Francisco 102 503 NA
## 5 ALASKA Seattle 305 1841 NA
## 6 AM WEST Los.Angeles 117 694 NA
## 7 AM WEST Phonexi 415 4840 NA
## 8 AM WEST San.Diego 65 383 NA
## 9 AM WEST San.Francisco 129 320 NA
## 10 AM WEST Seattle 61 201 NA
## Ave delay for each city
delay_avg <- Airline %>%
group_by(City) %>%
summarise(Ave_City_Delay= round(mean(delayed, na.rm = TRUE), digits=5)) %>%
##summarise(Ave_City_Timely= round(mean(Timely, na.rm = TRUE), digits=2))%>%
arrange(.$Ave_City_Delay)
delay_avg
## # A tibble: 5 x 2
## City Ave_City_Delay
## <chr> <dbl>
## 1 San.Diego 42.5
## 2 Los.Angeles 89.5
## 3 San.Francisco 116.
## 4 Seattle 183
## 5 Phonexi 214.
## Ave arrival on time for each city
Timely_avg <- Airline %>%
group_by(City) %>%
summarise(Ave_City_Timely= round(mean(Timely, na.rm = TRUE), digits=5)) %>%
arrange(.$Ave_City_Timely)
Timely_avg
## # A tibble: 5 x 2
## City Ave_City_Timely
## <chr> <dbl>
## 1 San.Diego 202
## 2 San.Francisco 412.
## 3 Los.Angeles 596.
## 4 Seattle 1021
## 5 Phonexi 2530.
City_tab <- left_join(delay_avg, Timely_avg, by = c('City'))%>%
arrange(.$City)
City_tab
## # A tibble: 5 x 3
## City Ave_City_Delay Ave_City_Timely
## <chr> <dbl> <dbl>
## 1 Los.Angeles 89.5 596.
## 2 Phonexi 214. 2530.
## 3 San.Diego 42.5 202
## 4 San.Francisco 116. 412.
## 5 Seattle 183 1021
## Find average of delays for each Airline
delay_avg_carrier <- Airline %>%
group_by(Carrier) %>%
summarise(Ave_delayed= round(mean(delayed, na.rm = TRUE), digits=2)) %>%
arrange(.$Ave_delayed)
delay_avg_carrier
## # A tibble: 2 x 2
## Carrier Ave_delayed
## <fct> <dbl>
## 1 ALASKA 100.
## 2 AM WEST 157.
timely_avg_carrier <- Airline %>%
group_by(Carrier) %>%
summarise(Ave_timely= round(mean(Timely, na.rm = TRUE), digits=5)) %>%
arrange(.$Ave_timely)
head(timely_avg_carrier)
## # A tibble: 2 x 2
## Carrier Ave_timely
## <fct> <dbl>
## 1 ALASKA 617.
## 2 AM WEST 1288.
Carrier1 <- left_join(delay_avg_carrier, timely_avg_carrier, by = c('Carrier'))%>%
arrange(.$Carrier)
Carrier1
## # A tibble: 2 x 3
## Carrier Ave_delayed Ave_timely
## <fct> <dbl> <dbl>
## 1 ALASKA 100. 617.
## 2 AM WEST 157. 1288.
Airlines1 <- Carrier1 %>% gather(Ave_status, Flights, Ave_delayed:Ave_timely)
head(Airlines1)
## # A tibble: 4 x 3
## Carrier Ave_status Flights
## <fct> <chr> <dbl>
## 1 ALASKA Ave_delayed 100.
## 2 AM WEST Ave_delayed 157.
## 3 ALASKA Ave_timely 617.
## 4 AM WEST Ave_timely 1288.
ggplot(
Airlines1, aes(x = Carrier, y = Flights,fill=Ave_status)) +
geom_bar(stat="identity") +
ggtitle("Airlines: AVE DELAY vs AVE ON Time")+
theme(axis.text=element_text(angle=90))+
labs(x="City",y="Flights")
ggplot(
City_tab, aes(x = City, y = Ave_City_Timely ,fill=Ave_City_Timely)) +
geom_bar(stat="identity") +
ggtitle("City: Ave ON TIME")+
theme(axis.text=element_text(angle=90))+
labs(x="City",y="Ave_City_Timely")