1)Create a .CSV file that includes all of the information on the flights table. You’re encouraged to use a “wide” structure similar to how the information appears above, so that you can practice tidying and transformations.
library(tidyr)
library(dplyr)
library("ggplot2")
csv <- rbind(c("Airlines", "Status", "Los Angeles", "Phoenix", "San Diego", "San Francisco", "Seattle"),
c("ALASKA", "On Time", 497, 221, 212, 503, 1841),
c("ALASKA", "Delayed", 62, 12, 20, 102, 305),
c("AM WEST", "On Time", 694, 4840, 383, 320, 201),
c("AM WEST", "Delayed", 117, 415, 65, 129, 61))
write.table(csv, file = "flightss.csv", sep = ",", col.names=F, row.names=F)
2)Read the information from your .CSV file into R, and use tidyr and dplyr as needed to tidy and transform your data.
raw_data <- read.csv(paste0("https://raw.githubusercontent.com/ErindaB/Data607_Assignment/master/flights.csv"),stringsAsFactors = F);
raw_data
## Airlines Status Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1 ALASKA On Time 497 221 212 503 1841
## 2 ALASKA Delayed 62 12 20 102 305
## 3 AM WEST On Time 694 4840 383 320 201
## 4 AM WEST Delayed 117 415 65 129 61
There are 5 columns for each city.Using gather from tidyr we can have them in only 1 column.
tidyr_data <- gather(raw_data, City, Total, Los.Angeles:Seattle) %>%
mutate(City = gsub( "\\.", " ", City)) %>% ## using mutate to remove . from city names
arrange(Airlines)
tidyr_data
## Airlines Status City Total
## 1 ALASKA On Time Los Angeles 497
## 2 ALASKA Delayed Los Angeles 62
## 3 ALASKA On Time Phoenix 221
## 4 ALASKA Delayed Phoenix 12
## 5 ALASKA On Time San Diego 212
## 6 ALASKA Delayed San Diego 20
## 7 ALASKA On Time San Francisco 503
## 8 ALASKA Delayed San Francisco 102
## 9 ALASKA On Time Seattle 1841
## 10 ALASKA Delayed Seattle 305
## 11 AM WEST On Time Los Angeles 694
## 12 AM WEST Delayed Los Angeles 117
## 13 AM WEST On Time Phoenix 4840
## 14 AM WEST Delayed Phoenix 415
## 15 AM WEST On Time San Diego 383
## 16 AM WEST Delayed San Diego 65
## 17 AM WEST On Time San Francisco 320
## 18 AM WEST Delayed San Francisco 129
## 19 AM WEST On Time Seattle 201
## 20 AM WEST Delayed Seattle 61
Use of dplyr to tidy and transform the data
spread_data <- spread(tidyr_data, "Status", "Total")
spread_data
## Airlines City Delayed On Time
## 1 ALASKA Los Angeles 62 497
## 2 ALASKA Phoenix 12 221
## 3 ALASKA San Diego 20 212
## 4 ALASKA San Francisco 102 503
## 5 ALASKA Seattle 305 1841
## 6 AM WEST Los Angeles 117 694
## 7 AM WEST Phoenix 415 4840
## 8 AM WEST San Diego 65 383
## 9 AM WEST San Francisco 129 320
## 10 AM WEST Seattle 61 201
#arrange rows by City
spread_data2 <- spread_data %>%
arrange(City, Airlines)
spread_data2
## Airlines City Delayed On Time
## 1 ALASKA Los Angeles 62 497
## 2 AM WEST Los Angeles 117 694
## 3 ALASKA Phoenix 12 221
## 4 AM WEST Phoenix 415 4840
## 5 ALASKA San Diego 20 212
## 6 AM WEST San Diego 65 383
## 7 ALASKA San Francisco 102 503
## 8 AM WEST San Francisco 129 320
## 9 ALASKA Seattle 305 1841
## 10 AM WEST Seattle 61 201
#calculate the percentages of delayed and on time flights for each airline and destination
spread_data3 <- spread_data2 %>%
mutate(total = Delayed + `On Time`, percentDelayed = round(Delayed / total * 100, 2),
percentOnTime = round(100 * `On Time` / total, 2))
spread_data3
## Airlines City Delayed On Time total percentDelayed
## 1 ALASKA Los Angeles 62 497 559 11.09
## 2 AM WEST Los Angeles 117 694 811 14.43
## 3 ALASKA Phoenix 12 221 233 5.15
## 4 AM WEST Phoenix 415 4840 5255 7.90
## 5 ALASKA San Diego 20 212 232 8.62
## 6 AM WEST San Diego 65 383 448 14.51
## 7 ALASKA San Francisco 102 503 605 16.86
## 8 AM WEST San Francisco 129 320 449 28.73
## 9 ALASKA Seattle 305 1841 2146 14.21
## 10 AM WEST Seattle 61 201 262 23.28
## percentOnTime
## 1 88.91
## 2 85.57
## 3 94.85
## 4 92.10
## 5 91.38
## 6 85.49
## 7 83.14
## 8 71.27
## 9 85.79
## 10 76.72
# Compare percentage of total delayed flights for each airline by creating a new summary table
sumDf <- spread_data3 %>%
group_by(Airlines) %>%
summarise(totalDelayed = sum(Delayed), totalOnTime = sum(`On Time`), totalFlights = sum(total)) %>%
mutate(percentDelayed = round(totalDelayed/totalFlights * 100, 2))
sumDf
## # A tibble: 2 x 5
## Airlines totalDelayed totalOnTime totalFlights percentDelayed
## <chr> <int> <int> <int> <dbl>
## 1 ALASKA 501 3274 3775 13.3
## 2 AM WEST 787 6438 7225 10.9
#Total Percentage of Delayed Flights by Airline
TotPercDelayedFlight <- ggplot(sumDf, aes(Airlines, percentDelayed)) +
geom_bar(aes(fill = Airlines), position = "dodge", stat = "identity") +
geom_text(aes(label = percentDelayed), vjust = -.5) +
ylab("Percentage of Delayed Flights") +
ggtitle("Total Percentage of Delayed Flights by Airline")
TotPercDelayedFlight
#Compare the percentages of delayed flights for each airline by cities in a bar graph
PercDelayedAirCities <- ggplot(spread_data3, aes(y=percentDelayed, x = Airlines, color = Airlines, fill = Airlines)) +
geom_bar( stat = "identity") +
geom_text(aes(label = percentDelayed), vjust = -.5) +
facet_wrap(~City) +
ylim(0, 55) +
ylab("Percentage of Delayed Flights") +
ggtitle("Percentage of Delayed Flights by Airline and Cities")
PercDelayedAirCities
#Total number of Flights by Airline and Destination
PercDelayedAirCities1 <- ggplot(spread_data3, aes(y = total, x = Airlines, color = Airlines, fill = Airlines)) + geom_bar( stat = "identity") +
geom_text(aes(label = total), vjust = -.3) +
facet_wrap(~City) +
ylim(0, 5500) +
ylab("Number of Flights") +
ggtitle("Number of Fights by Airline and Cities")
PercDelayedAirCities1
#Compare the percentages of delayed flights for each airline by cities in a line graph
PercDelayedAirCities1<- ggplot(spread_data3, aes(x = City, y = percentDelayed, group = Airlines, color = Airlines, shape = Airlines)) +
geom_point(aes(shape = Airlines)) +
geom_line() +
geom_text(aes(label = percentDelayed), vjust = -.4) +
labs(x = "Destination", y = "Percentage of Delayed flights") +
theme(legend.title=element_blank()) +
ggtitle("Percentage of Delayed Flights by Airline and Cities")
PercDelayedAirCities1
To sum up, AM West Airlines has a lower percentage of total delayed flights than Alaska Airlines. However, Alaska has lower percentages of delayed flights than AM West by destination. AM West has 5255 flights to Phoenix with 7.9% of those flights being delayed, while Alaska has only 233 Phoenix flights with 5.15% of flights delayed.Also AM West has 262 flights to Seattle with 23.28% of flights delayed, while Alaska has 2146 Seattle flights with 14.21% delayed.