Libaries used: library(knitr) library(png) library(tidyr) library(dplyr)
The chart below describes arrival delays for two airlines across five destinations.
Your task is to:
You’re encouraged to use a “wide” structure similar to how the information appears above, so that you can practice tidying and transformations as described below.
Read the information from your .CSV file into R, and use tidyr and dplyr as needed to tidy and transform your data.
Perform analysis to compare the arrival delays for the two airlines.
Your code should be in an R Markdown file, posted to rpubs.com, and should include narrative descriptions of your data cleanup work, analysis, and conclusions.
imgage <- "C:/Users/jpsim/Documents/DATA Acquisition and Management/4.png"
include_graphics(imgage)
flight_times <- rbind(c(NA, NA, "Los Angeles", "Phoenix", "San Diego", "San Francisco", "Seattle"),
c("ALASKA", "On Time", 497, 221, 212, 503, 1841),
c(NA, "Delayed", 62, 12, 20, 102, 305),
c("AM WEST", "On Time", 694, 4840, 383, 320, 201),
c(NA, "Delayed", 117, 415, 65, 129, 61))
write.csv(flight_times,'C:/Users/jpsim/Documents/DATA Acquisition and Management/flight_times.csv', row.names = FALSE)
raw_data <- read.csv(paste0("https://raw.githubusercontent.com/josephsimone/DATA607/master/flight_times.csv"),
stringsAsFactors = F); raw_data
## NA. NA..1 Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1 ALASKA On Time 497 221 212 503 1841
## 2 <NA> Delayed 62 12 20 102 305
## 3 AM WEST On Time 694 4840 383 320 201
## 4 <NA> Delayed 117 415 65 129 61
clean_data <- raw_data %>%
mutate(Airlines1 = NA., Airlines2=lag(NA.)) %>%
mutate(Airline = coalesce(Airlines1,Airlines2), Status = NA..1) %>%
gather("Destination", "Flights", 3:7) %>%
select(Airline:Flights) %>%
arrange(Airline, desc(Status), Destination); clean_data
## Airline Status Destination Flights
## 1 ALASKA On Time Los.Angeles 497
## 2 ALASKA On Time Phoenix 221
## 3 ALASKA On Time San.Diego 212
## 4 ALASKA On Time San.Francisco 503
## 5 ALASKA On Time Seattle 1841
## 6 ALASKA Delayed Los.Angeles 62
## 7 ALASKA Delayed Phoenix 12
## 8 ALASKA Delayed San.Diego 20
## 9 ALASKA Delayed San.Francisco 102
## 10 ALASKA Delayed Seattle 305
## 11 AM WEST On Time Los.Angeles 694
## 12 AM WEST On Time Phoenix 4840
## 13 AM WEST On Time San.Diego 383
## 14 AM WEST On Time San.Francisco 320
## 15 AM WEST On Time Seattle 201
## 16 AM WEST Delayed Los.Angeles 117
## 17 AM WEST Delayed Phoenix 415
## 18 AM WEST Delayed San.Diego 65
## 19 AM WEST Delayed San.Francisco 129
## 20 AM WEST Delayed Seattle 61
flights_aggr_df <- clean_data %>% select(Airline, Status, Flights) %>%
group_by(Airline, Status) %>%
summarise(total = sum(Flights)) %>%
mutate(percent = round(total/sum(total) * 100, 2),
percent_formatted = paste0(round(total/sum(total) * 100, 1), "%"),
total_flights = sum(total))
tidy_df <- as.data.frame(flights_aggr_df)
tidy_df
## Airline Status total percent percent_formatted total_flights
## 1 ALASKA Delayed 501 13.27 13.3% 3775
## 2 ALASKA On Time 3274 86.73 86.7% 3775
## 3 AM WEST Delayed 787 10.89 10.9% 7225
## 4 AM WEST On Time 6438 89.11 89.1% 7225
tidy_df %>% filter(Status == "Delayed") %>%
ggplot( aes(x=Airline, y=percent, fill=Airline)) +
geom_bar(stat="identity", position=position_dodge(), colour="black", width = 0.5) +
coord_flip() +
ggtitle("Overall Percentage of Flights Delayed by Airline") +
xlab("Airline") + ylab("Percentage of Flights Delayed") +
geom_text(aes(label=paste(percent_formatted, "\n n =", total_flights)), vjust=0.5, hjust=1.1,color="black")
flights_aggr_df <- clean_data %>% select(Airline, Status, Flights, Destination) %>%
group_by(Airline, Status, Destination) %>%
summarise(total = sum(Flights)) %>%
mutate(percent = round(total/sum(total) * 100, 2),
percent_formatted = paste0(round(total/sum(total) * 100, 1), "%"),
total_flights = sum(total))
flights_by_city_df <- flights_aggr_df %>%
group_by(Airline, Destination) %>%
mutate(percent = round(total/sum(total)*100, 2),
percent_formatted=paste0(round(total/sum(total)*100, 1), "%"))
flights_by_city_df <- as.data.frame(flights_by_city_df)
flights_by_city_df
## Airline Status Destination total percent percent_formatted
## 1 ALASKA Delayed Los.Angeles 62 11.09 11.1%
## 2 ALASKA Delayed Phoenix 12 5.15 5.2%
## 3 ALASKA Delayed San.Diego 20 8.62 8.6%
## 4 ALASKA Delayed San.Francisco 102 16.86 16.9%
## 5 ALASKA Delayed Seattle 305 14.21 14.2%
## 6 ALASKA On Time Los.Angeles 497 88.91 88.9%
## 7 ALASKA On Time Phoenix 221 94.85 94.8%
## 8 ALASKA On Time San.Diego 212 91.38 91.4%
## 9 ALASKA On Time San.Francisco 503 83.14 83.1%
## 10 ALASKA On Time Seattle 1841 85.79 85.8%
## 11 AM WEST Delayed Los.Angeles 117 14.43 14.4%
## 12 AM WEST Delayed Phoenix 415 7.90 7.9%
## 13 AM WEST Delayed San.Diego 65 14.51 14.5%
## 14 AM WEST Delayed San.Francisco 129 28.73 28.7%
## 15 AM WEST Delayed Seattle 61 23.28 23.3%
## 16 AM WEST On Time Los.Angeles 694 85.57 85.6%
## 17 AM WEST On Time Phoenix 4840 92.10 92.1%
## 18 AM WEST On Time San.Diego 383 85.49 85.5%
## 19 AM WEST On Time San.Francisco 320 71.27 71.3%
## 20 AM WEST On Time Seattle 201 76.72 76.7%
## total_flights
## 1 501
## 2 501
## 3 501
## 4 501
## 5 501
## 6 3274
## 7 3274
## 8 3274
## 9 3274
## 10 3274
## 11 787
## 12 787
## 13 787
## 14 787
## 15 787
## 16 6438
## 17 6438
## 18 6438
## 19 6438
## 20 6438
flights_by_city_df %>% filter(Status == "Delayed") %>%
ggplot( aes(x=Destination, y=percent, fill=Airline)) +
geom_bar(stat="identity", position=position_dodge(), colour="black") +
geom_text(aes(label=percent_formatted), vjust=.5, hjust=1,position= position_dodge(width=0.9), color="black") +
ggtitle("Percentage of Flights Delayed by Airline by City") +
xlab("Destination") + ylab("Percentage of Flights Delayed") +
coord_flip()