library(knitr)
library(stringr)
library(ggplot2)
library(tidyr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
airlines = matrix(c("Alaska","on time", 497, 221,212,503,1841,
"", "delayed",62,12,20,102,305,
"","","","","","","",
"AMWest","on time",694,4840,383,320,201,
"","delayed",117,415,65,129,61
),ncol=7,byrow=TRUE)
colnames(airlines) = c("","", "Los Angeles", "Phoenix", "San Diego", "San Francisco", "Seattle")
airlines = data.frame(airlines)
airlines
## V1 V2 Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1 Alaska on time 497 221 212 503 1841
## 2 delayed 62 12 20 102 305
## 3
## 4 AMWest on time 694 4840 383 320 201
## 5 delayed 117 415 65 129 61
# getwd()
write.csv(airlines, file = "Airlines.csv", row.names = F)
planes = read.csv("Airlines.csv")
planes
## V1 V2 Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1 Alaska on time 497 221 212 503 1841
## 2 delayed 62 12 20 102 305
## 3 NA NA NA NA NA
## 4 AMWest on time 694 4840 383 320 201
## 5 delayed 117 415 65 129 61
Remove empty row and fill in empty cells.
planes = filter(planes, !is.na(Los.Angeles))
planes$V1[2] = "Alaska"
planes$V1[4] = "AMWest"
planes
## V1 V2 Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1 Alaska on time 497 221 212 503 1841
## 2 Alaska delayed 62 12 20 102 305
## 3 AMWest on time 694 4840 383 320 201
## 4 AMWest delayed 117 415 65 129 61
Rearrange columns and rows by stacking the cities into one column and then making the Delayed and Timely flights into two columns, each with its own values. Rename columns. Edit city names to replace the “.” with a space. Calculate percentages and means.
# Rearrange columns and rows
planes = planes %>% gather("Cities", "Flights", 3:7) %>%
spread(V2 , Flights)
# Rename columns
colnames(planes) = c("Airline", "City", "Delayed", "Timely")
# Edit city names
planes$City = str_replace_all(planes$City, "[.]", " ")
# Ratio of Delayed to Timely arrivals
planes = mutate(planes, Ratio = round((Delayed/Timely),2))
# Percent of Delayed and Timely arrivals
planes = mutate(planes, Percent_Delayed = round((Delayed/(Delayed+Timely))*100,2))
planes = mutate(planes, Percent_Timely = round((Timely/(Delayed+Timely))*100,2))
kable(planes)
| Airline | City | Delayed | Timely | Ratio | Percent_Delayed | Percent_Timely |
|---|---|---|---|---|---|---|
| Alaska | Los Angeles | 62 | 497 | 0.12 | 11.09 | 88.91 |
| Alaska | Phoenix | 12 | 221 | 0.05 | 5.15 | 94.85 |
| Alaska | San Diego | 20 | 212 | 0.09 | 8.62 | 91.38 |
| Alaska | San Francisco | 102 | 503 | 0.20 | 16.86 | 83.14 |
| Alaska | Seattle | 305 | 1841 | 0.17 | 14.21 | 85.79 |
| AMWest | Los Angeles | 117 | 694 | 0.17 | 14.43 | 85.57 |
| AMWest | Phoenix | 415 | 4840 | 0.09 | 7.90 | 92.10 |
| AMWest | San Diego | 65 | 383 | 0.17 | 14.51 | 85.49 |
| AMWest | San Francisco | 129 | 320 | 0.40 | 28.73 | 71.27 |
| AMWest | Seattle | 61 | 201 | 0.30 | 23.28 | 76.72 |
Create a table of means and means of percentages of delays and timely arrivals.
# Means and Sums of Delayed vs Timely by Airline
kable(planes %>%
group_by(Airline) %>%
summarise(Mean_Delays = mean(Delayed),
Mean_Timely = mean(Timely),
Mean_Percent_Delays = mean(Percent_Delayed),
Mean_Percent_Timely = mean(Percent_Timely),
Sum_Delays = sum(Delayed),
Sum_Timely = sum(Timely),
Total_Flights = sum(Sum_Delays, Sum_Timely)))
| Airline | Mean_Delays | Mean_Timely | Mean_Percent_Delays | Mean_Percent_Timely | Sum_Delays | Sum_Timely | Total_Flights |
|---|---|---|---|---|---|---|---|
| Alaska | 100.2 | 654.8 | 11.186 | 88.814 | 501 | 3274 | 3775 |
| AMWest | 157.4 | 1287.6 | 17.770 | 82.230 | 787 | 6438 | 7225 |
Here, we see that AMWest has twice as many total flights than Alaska airlines, however Alaska airlines has a higher mean of percent of timely arrivals, meaning that despite having fewer flights, Alaska is less likely to have delays than AMWest.
PLOTS
Comparison of the number of delayed flights and on-time flights between the two airlines based on the city.
deltim = gather(planes, "Arrival", "n", 3:4 )
ggplot(deltim, aes(x=Arrival, y=n, colour = Airline)) +
geom_bar( aes(fill= Airline), stat="identity", position=position_dodge())+
facet_grid(~ planes$City)
In this plot, we see that the Alaska airline has fewer delays than AMWest in each city except for Seattle. However, it also has fewer on-time arrivals, the exceptions being San Francisco and Seattle. Simply put, AMWest has more traffic than Alaska, especially in Phoenix.
Comparison of the percentage of delayed flights and on-time flights between the two airlines based on the city.
percents = gather(planes, "Percents", "n", 6:7 )
ggplot(percents, aes(x=Airline, y=n, colour = Percents)) +
geom_bar( aes(fill= Percents), stat="identity", position=position_dodge())+
facet_grid(~ planes$City)
Because the previous plot could be misleading due to the airplane traffic in each city, viewing the percentages of flight arrivals could be more helpful. In this plot, we see that the Alaska airline has a lower percentage of delays than AMWest in every city. Regardless of which city is picked (one with a lot of traffic like Phoenix or one with fewer flights like San Diego), the Alaska airline is more likely to arrive on time than AMWest.
Comparison of the number of delayed and on-time flights between cities based on the airline.
ggplot(deltim, aes(x=Arrival, y=n, colour = City)) +
geom_bar( aes(fill= City), stat="identity", position=position_dodge())+
facet_grid(~ planes$Airline)
Here we see that the Alaska airline has a stronger base in Seattle than AMWest, while AMWest has a stronger base in Phoenix. On the other hand, the Alaska airline has a weak base in Phoenix and San Diego, while AMWest has the fewest flights in Seattle.
Comparison of the percentages of delayed and on-time flights between cities based on the airline.
ggplot(percents, aes(x=Percents, y=n, colour = City)) +
geom_bar( aes(fill= City), stat="identity", position=position_dodge())+
facet_grid(~ planes$Airline)
When viewing the percentages of delayed and timely arrivals among the cities, we see that the Alaska airline has its highest Timely percentage in Phoenix and its higher percentage of delays in San Francisco. AMWest also has its higher Timely percentage in Phoenix (even if it is not as high as Alaska’s percentage) and has its highest percentage of delays in San Francisco (a higher percentage than Alaska).
To summarize: Alaska airlines has its most flights in Seattle but its highest percentage of timely arrivals in Phoenix. Conversly, it has its fewest flights in Phoenix and San Diego even though its highest percentage of delays are in San Francisco. AMWest, on the other hand has its most flights and its highest percentagy of timely arrivals in Phoenix (though Alaska has a higher timely percentage comparatively). On the other hand, it has the least traffic in Seattle and the highest percentage of delays in San Francisco (higher comparatively even to the Alaska airline). Overall, though, when viewing the timely and delayed percentages side by side, even though the Alaska airline is consistently higher than AMWest, AMWest is not too far behind in terms of on-time percentages.