require(dplyr)
require(tidyr)
require(utils)
require(ggplot2)
library(dplyr)
library(tidyr)
library(utils)
library(ggplot2)
Read the flights CSV file from URL.
flights <- read.csv(url("https://raw.githubusercontent.com/v-sinha/data607/week_05/flights.csv"))
Display the structure and leading rows of the frame.
# Structure
str(flights)
## 'data.frame': 4 obs. of 7 variables:
## $ Airline : Factor w/ 2 levels "ALASKA","AM WEST": 1 1 2 2
## $ flight.status: Factor w/ 2 levels " delayed"," on time": 2 1 2 1
## $ Los.Angeles : int 497 62 694 117
## $ Phoenix : int 221 12 4840 415
## $ San.Diego : int 212 20 383 65
## $ San.Francisco: int 503 102 320 129
## $ Seattle : int 1841 305 201 61
# Alternative to str()
glimpse(flights)
## Observations: 4
## Variables: 7
## $ Airline <fctr> ALASKA, ALASKA, AM WEST, AM WEST
## $ flight.status <fctr> on time, delayed, on time, delayed
## $ Los.Angeles <int> 497, 62, 694, 117
## $ Phoenix <int> 221, 12, 4840, 415
## $ San.Diego <int> 212, 20, 383, 65
## $ San.Francisco <int> 503, 102, 320, 129
## $ Seattle <int> 1841, 305, 201, 61
# Leading rows us
head(flights)
## Airline flight.status Los.Angeles Phoenix San.Diego San.Francisco
## 1 ALASKA on time 497 221 212 503
## 2 ALASKA delayed 62 12 20 102
## 3 AM WEST on time 694 4840 383 320
## 4 AM WEST delayed 117 415 65 129
## Seattle
## 1 1841
## 2 305
## 3 201
## 4 61
The flight information appears to be a wide data frame, because columns 3-7 contain values for each destination, i.e. the column fields are really values. We can use tidyr’s gather() function to transform the wide dataframe into a long one.
# Select columns 3-7 and consolidate (gather) their values into one
# column labeled "Flightcount"
flights_long <- flights %>% gather(Destination, Flightcount, 3:7)
# Display the new structure and the first few rows.
str(flights_long)
## 'data.frame': 20 obs. of 4 variables:
## $ Airline : Factor w/ 2 levels "ALASKA","AM WEST": 1 1 2 2 1 1 2 2 1 1 ...
## $ flight.status: Factor w/ 2 levels " delayed"," on time": 2 1 2 1 2 1 2 1 2 1 ...
## $ Destination : chr "Los.Angeles" "Los.Angeles" "Los.Angeles" "Los.Angeles" ...
## $ Flightcount : int 497 62 694 117 221 12 4840 415 212 20 ...
glimpse(flights_long)
## Observations: 20
## Variables: 4
## $ Airline <fctr> ALASKA, ALASKA, AM WEST, AM WEST, ALASKA, ALASK...
## $ flight.status <fctr> on time, delayed, on time, delayed, on tim...
## $ Destination <chr> "Los.Angeles", "Los.Angeles", "Los.Angeles", "Lo...
## $ Flightcount <int> 497, 62, 694, 117, 221, 12, 4840, 415, 212, 20, ...
head(flights_long)
## Airline flight.status Destination Flightcount
## 1 ALASKA on time Los.Angeles 497
## 2 ALASKA delayed Los.Angeles 62
## 3 AM WEST on time Los.Angeles 694
## 4 AM WEST delayed Los.Angeles 117
## 5 ALASKA on time Phoenix 221
## 6 ALASKA delayed Phoenix 12
Compare the dimensions of the original and the transformed frames.
dim(flights)
## [1] 4 7
dim(flights_long)
## [1] 20 4
names(flights)
## [1] "Airline" "flight.status" "Los.Angeles" "Phoenix"
## [5] "San.Diego" "San.Francisco" "Seattle"
names(flights_long)
## [1] "Airline" "flight.status" "Destination" "Flightcount"
Compare the flights of the two airlines. For this we filter based on Airline and FlightStatus (i.e. “on time” or “delayed”).
# Extract the information for each of the two airlines.
flights_alaska <- flights_long %>% filter(Airline == "ALASKA")
flights_amwest <- flights_long %>% filter(Airline == "AM WEST")
# Extract the on-time and delayed information for both airlines.
ontime_alaska <- flights_alaska %>% filter(flight.status == " on time")
ontime_amwest <- flights_amwest %>% filter(flight.status == " on time")
delayed_alaska <- flights_alaska %>% filter(flight.status == " delayed")
delayed_amwest <- flights_amwest %>% filter(flight.status == " delayed")
print(ontime_alaska)
## Airline flight.status Destination Flightcount
## 1 ALASKA on time Los.Angeles 497
## 2 ALASKA on time Phoenix 221
## 3 ALASKA on time San.Diego 212
## 4 ALASKA on time San.Francisco 503
## 5 ALASKA on time Seattle 1841
print(ontime_amwest)
## Airline flight.status Destination Flightcount
## 1 AM WEST on time Los.Angeles 694
## 2 AM WEST on time Phoenix 4840
## 3 AM WEST on time San.Diego 383
## 4 AM WEST on time San.Francisco 320
## 5 AM WEST on time Seattle 201
print(delayed_alaska)
## Airline flight.status Destination Flightcount
## 1 ALASKA delayed Los.Angeles 62
## 2 ALASKA delayed Phoenix 12
## 3 ALASKA delayed San.Diego 20
## 4 ALASKA delayed San.Francisco 102
## 5 ALASKA delayed Seattle 305
print(delayed_amwest)
## Airline flight.status Destination Flightcount
## 1 AM WEST delayed Los.Angeles 117
## 2 AM WEST delayed Phoenix 415
## 3 AM WEST delayed San.Diego 65
## 4 AM WEST delayed San.Francisco 129
## 5 AM WEST delayed Seattle 61
Compare the delay as a percentage for both airlines.
percent_alaska <- 100 * sum(delayed_alaska$Flightcount) / (sum(delayed_alaska$Flightcount) + sum(ontime_alaska$Flightcount))
percent_amwest <- 100 * sum(delayed_amwest$Flightcount) / (sum(delayed_amwest$Flightcount) + sum(ontime_amwest$Flightcount))
delays_percent <- data.frame(airline = c("ALASKA", "AM WEST"),
delays = c(percent_alaska, percent_amwest))
p <- ggplot(delays_percent, aes(airline, delays))
p +geom_bar(stat = "identity", width=.5)
The conclusion is that, for the selected destination cities, Alaska Airlines was, compared to America West, somewhat more prone to delays (13.2% vs 10.9%), with delays computed as a percentage of total flights completed for each airline.