raw.flights <- read.csv("https://raw.githubusercontent.com/aliceafriedman/DATA607_HW4/master/a_flights.csv", sep=",", header=TRUE, stringsAsFactors=FALSE)
glimpse(raw.flights)
## Observations: 5
## Variables: 7
## $ X <chr> "Alasaka", "", "", "AM WEST", ""
## $ X.1 <chr> "On time", "Delayed", "", "On time", "Delayed"
## $ Los.Angeles <int> 497, 62, NA, 694, 117
## $ Phoenix <int> 221, 12, NA, 4840, 415
## $ San.Diego <int> 212, 20, NA, 383, 65
## $ San.Francisco <int> 503, 102, NA, 320, 129
## $ Seattle <int> 1841, 305, NA, 201, 61
# Add column names where missing, remove blank row
flights <- raw.flights %>%
rename("Airline"=X, "Status"=X.1) %>%
na.omit() %>%
glimpse()
## Observations: 4
## Variables: 7
## $ Airline <chr> "Alasaka", "", "AM WEST", ""
## $ Status <chr> "On time", "Delayed", "On time", "Delayed"
## $ Los.Angeles <int> 497, 62, 694, 117
## $ Phoenix <int> 221, 12, 4840, 415
## $ San.Diego <int> 212, 20, 383, 65
## $ San.Francisco <int> 503, 102, 320, 129
## $ Seattle <int> 1841, 305, 201, 61
#Note: This is an easy place to introduce errors into the dataset!
#This would also not work wel for a larger data set.
flights$Airline[2] <- "Alaska"
flights$Airline[4] <- "AM WEST"
glimpse(flights)
## Observations: 4
## Variables: 7
## $ Airline <chr> "Alasaka", "Alaska", "AM WEST", "AM WEST"
## $ Status <chr> "On time", "Delayed", "On time", "Delayed"
## $ Los.Angeles <int> 497, 62, 694, 117
## $ Phoenix <int> 221, 12, 4840, 415
## $ San.Diego <int> 212, 20, 383, 65
## $ San.Francisco <int> 503, 102, 320, 129
## $ Seattle <int> 1841, 305, 201, 61
The above is an easy way to introduce errors into the data*, and won’t work well in larger dataset. Is there a better way to do it?
i <- 2
while(i <= length(flights$Airline)) {
if(flights$Status[i]=="Delayed"){
flights$Airline[i] <- flights$Airline[i-1]
}
i <- i+2
}
glimpse(flights)
## Observations: 4
## Variables: 7
## $ Airline <chr> "Alasaka", "Alasaka", "AM WEST", "AM WEST"
## $ Status <chr> "On time", "Delayed", "On time", "Delayed"
## $ Los.Angeles <int> 497, 62, 694, 117
## $ Phoenix <int> 221, 12, 4840, 415
## $ San.Diego <int> 212, 20, 383, 65
## $ San.Francisco <int> 503, 102, 320, 129
## $ Seattle <int> 1841, 305, 201, 61
*Note, in fact, that the data in flights$Airline doesn’t match! There is an error in the CSV, which we will correct below.
flights$Airline <- gsub("Alasaka", "Alaska", flights$Airline)
flights.long <- flights %>%
gather(key = city, value = count, Los.Angeles, Phoenix, San.Diego, San.Francisco, Seattle) %>%
glimpse()
## Observations: 20
## Variables: 4
## $ Airline <chr> "Alaska", "Alaska", "AM WEST", "AM WEST", "Alaska", "A...
## $ Status <chr> "On time", "Delayed", "On time", "Delayed", "On time",...
## $ city <chr> "Los.Angeles", "Los.Angeles", "Los.Angeles", "Los.Ange...
## $ count <int> 497, 62, 694, 117, 221, 12, 4840, 415, 212, 20, 383, 6...
First, we will compare the overall record for each airline. Overall, AM WEST flys substantially more flights overall, and has more flights with delays, but as a percentage of all flights does slightly better than Alaska. This seems like AM WEST is your better bet to avoid delays!
compare.all <- flights.long %>%
select(-city) %>%
group_by(Airline) %>%
mutate(All.Flights = sum(count)) %>%
group_by(Airline, Status) %>%
mutate(Flights = sum(count), Percent.By.Status = Flights/All.Flights*100) %>%
group_by(Airline, Status, Flights, Percent.By.Status) %>%
summarise()
#Graph number of flights by status, airline
ggplot(compare.all, aes(x=Status, y=Flights, fill=Airline))+geom_col(position = "Dodge")+
ggtitle("Total Number of Flights by Airline, Status")
#Graph proportion of all flights by status, airline
compare.all %>% filter(Status=="Delayed") %>%
ggplot(aes(x=Airline, y=Percent.By.Status, fill=Airline))+geom_col()+
ggtitle("Frequency of Delayed Flights, by Airline")+
ylab("% of All Flights Delayed")
Just to be sure, we will compare the frequency of delayed flights arriving in each city. Per the data below, your best bet is actually always to take Alaska, unless you are flying equally to all cities!
Looks like the high number of delayed flights going into Seattle may be skewing the overall numbers.
compare.by.city <- flights.long %>%
group_by(Airline, city) %>%
mutate(
City.Flights = sum(count),
Status.Freq.By.City = count/City.Flights*100,
Delayed.Flights = count) %>%
filter(Status == "Delayed") %>%
select(-Status, -City.Flights, -count) %>%
arrange(city) %>%
as_tibble()
compare.by.city
## # A tibble: 10 x 4
## # Groups: Airline, city [10]
## Airline city Status.Freq.By.City Delayed.Flights
## <chr> <chr> <dbl> <int>
## 1 Alaska Los.Angeles 11.1 62
## 2 AM WEST Los.Angeles 14.4 117
## 3 Alaska Phoenix 5.15 12
## 4 AM WEST Phoenix 7.90 415
## 5 Alaska San.Diego 8.62 20
## 6 AM WEST San.Diego 14.5 65
## 7 Alaska San.Francisco 16.9 102
## 8 AM WEST San.Francisco 28.7 129
## 9 Alaska Seattle 14.2 305
## 10 AM WEST Seattle 23.3 61
ggplot(compare.by.city,
aes(x=city, y=Status.Freq.By.City, fill=Airline))+
geom_col(position = "dodge")+
ggtitle("Percent Flights Delayed by City")+
xlab("City")+
ylab("% Delayed")