library(tidyr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(stringr)
library(sqldf)
## Loading required package: gsubfn
## Loading required package: proto
## Loading required package: RSQLite
## Loading required package: DBI
library(ggplot2)
flights = read.csv(file="flights.csv", header=TRUE, sep=",")
flights = filter(flights, X.1 != "")
print(flights)
## X X.1 Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1 Alaska on time 497 221 212 503 1841
## 2 delayed 62 12 20 102 305
## 3 AM West on time 694 4840 383 320 201
## 4 delayed 117 415 65 129 61
for (i in 1:nrow(flights)) {
if (flights$X[i] == "") {
flights$X[i] = flights$X[i-1]
}
}
print(flights)
## X X.1 Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1 Alaska on time 497 221 212 503 1841
## 2 Alaska delayed 62 12 20 102 305
## 3 AM West on time 694 4840 383 320 201
## 4 AM West delayed 117 415 65 129 61
flights_df = gather(flights, "City", "OnTime_Delayed", 3, 4, 5, 6, 7 , na.rm = FALSE, convert = FALSE)
colnames(flights_df) = c("Airlines", "OnTime_Delayed", "City", "Count")
print (flights_df)
## Airlines OnTime_Delayed City Count
## 1 Alaska on time Los.Angeles 497
## 2 Alaska delayed Los.Angeles 62
## 3 AM West on time Los.Angeles 694
## 4 AM West delayed Los.Angeles 117
## 5 Alaska on time Phoenix 221
## 6 Alaska delayed Phoenix 12
## 7 AM West on time Phoenix 4840
## 8 AM West delayed Phoenix 415
## 9 Alaska on time San.Diego 212
## 10 Alaska delayed San.Diego 20
## 11 AM West on time San.Diego 383
## 12 AM West delayed San.Diego 65
## 13 Alaska on time San.Francisco 503
## 14 Alaska delayed San.Francisco 102
## 15 AM West on time San.Francisco 320
## 16 AM West delayed San.Francisco 129
## 17 Alaska on time Seattle 1841
## 18 Alaska delayed Seattle 305
## 19 AM West on time Seattle 201
## 20 AM West delayed Seattle 61
flights_df$City = str_replace_all(flights_df$City, "[[.]]", " ")
print (flights_df)
## Airlines OnTime_Delayed City Count
## 1 Alaska on time Los Angeles 497
## 2 Alaska delayed Los Angeles 62
## 3 AM West on time Los Angeles 694
## 4 AM West delayed Los Angeles 117
## 5 Alaska on time Phoenix 221
## 6 Alaska delayed Phoenix 12
## 7 AM West on time Phoenix 4840
## 8 AM West delayed Phoenix 415
## 9 Alaska on time San Diego 212
## 10 Alaska delayed San Diego 20
## 11 AM West on time San Diego 383
## 12 AM West delayed San Diego 65
## 13 Alaska on time San Francisco 503
## 14 Alaska delayed San Francisco 102
## 15 AM West on time San Francisco 320
## 16 AM West delayed San Francisco 129
## 17 Alaska on time Seattle 1841
## 18 Alaska delayed Seattle 305
## 19 AM West on time Seattle 201
## 20 AM West delayed Seattle 61
flights_df = spread(flights_df, OnTime_Delayed, Count)
colnames(flights_df) = c("Airlines", "City", "Delayed", "OnTime")
print (flights_df)
## Airlines City Delayed OnTime
## 1 Alaska Los Angeles 62 497
## 2 Alaska Phoenix 12 221
## 3 Alaska San Diego 20 212
## 4 Alaska San Francisco 102 503
## 5 Alaska Seattle 305 1841
## 6 AM West Los Angeles 117 694
## 7 AM West Phoenix 415 4840
## 8 AM West San Diego 65 383
## 9 AM West San Francisco 129 320
## 10 AM West Seattle 61 201
calcPercentage = function(value, totalValue) {
round((value/totalValue)*100, 2)
}
flights_df = mutate(flights_df, TotalFlights=(Delayed+OnTime), PercentDelayed=calcPercentage(Delayed, TotalFlights), PercentOnTime=calcPercentage(OnTime, TotalFlights))
print (flights_df)
## Airlines City Delayed OnTime TotalFlights PercentDelayed PercentOnTime
## 1 Alaska Los Angeles 62 497 559 11.09 88.91
## 2 Alaska Phoenix 12 221 233 5.15 94.85
## 3 Alaska San Diego 20 212 232 8.62 91.38
## 4 Alaska San Francisco 102 503 605 16.86 83.14
## 5 Alaska Seattle 305 1841 2146 14.21 85.79
## 6 AM West Los Angeles 117 694 811 14.43 85.57
## 7 AM West Phoenix 415 4840 5255 7.90 92.10
## 8 AM West San Diego 65 383 448 14.51 85.49
## 9 AM West San Francisco 129 320 449 28.73 71.27
## 10 AM West Seattle 61 201 262 23.28 76.72
flights_city_group_df=sqldf("select City, sum(TotalFlights) As TotalFlights from flights_df group by City")
## Loading required package: tcltk
print(flights_city_group_df)
## City TotalFlights
## 1 Los Angeles 1370
## 2 Phoenix 5488
## 3 San Diego 680
## 4 San Francisco 1054
## 5 Seattle 2408
ggplot(flights_city_group_df, aes(x=City, y=TotalFlights)) + geom_bar(fill="#8877aa", color="black", stat="identity")
flights_airline_group_df=sqldf("select Airlines, sum(TotalFlights) As TotalFlights from flights_df group by Airlines")
print(flights_airline_group_df)
## Airlines TotalFlights
## 1 AM West 7225
## 2 Alaska 3775
ggplot(flights_airline_group_df, aes(x=Airlines, y=TotalFlights)) + geom_bar(fill="#87ffaa", color="black", stat="identity")
ggplot(data=flights_df, aes(x=City, y=PercentDelayed, fill=Airlines)) + geom_bar(stat="identity", position="dodge") +
ylab("% of Flights Delayed") + ggtitle("% of Flights Delayed")
ggplot(data=flights_df, aes(x=City, y=PercentOnTime, fill=Airlines)) + geom_bar(stat="identity", position="dodge") +
ylab("% of Flights OnTime") + ggtitle("% of Flights OnTime")
ggplot(data=flights_df, aes(x=City, y=PercentDelayed, group=Airlines, color=Airlines)) + geom_line() +
geom_point(color = "Red") + ylab("% of Flights Delayed") + ggtitle("% of Flights Delayed")
ggplot(data=flights_df, aes(x=City, y=PercentOnTime, group=Airlines, color=Airlines)) + geom_line() +
geom_point(color = "Blue") + ylab("% of Flights OnTime") + ggtitle("% of Flights OnTime")
(4.a) From the graphs we can conclude that Alaska airlines has better OnTime performance compared to AMWest airlines though the total number of flights by AM West airlines is more than 2 times as many as Alaska airlines.
AM West airlines has a worse performance compared to Alaska Airlines as percentage of flights delayed for each airport is more than Alaska Airlines. Also, we notice that the highest delays by AM West are for San Franciso and the lowest are for Phoenix.