Import data from CSV
path <- 'https://raw.githubusercontent.com/dhairavc/DATA607/master/Flight_Delay.csv'
Flight_Delays <- read.csv(path)
Flight_Delays
## X X.1 Los.Angeles Phonex San.Diego San.Francisco Seattle
## 1 ALASKA on time 497 221 212 503 1841
## 2 delayed 62 12 20 102 305
## 3 NA NA NA NA NA
## 4 AM WEST on time 694 4840 383 320 201
## 5 delayed 117 415 65 129 61
Tidy data to a usable point, but not in completely tidy format
library(tidyr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(stringr)
Flight_Delays[2,1] <- "ALASKA"
Flight_Delays[5,1] <- "AM WEST"
tidy_flights <- Flight_Delays %>% na.omit()
tidy_flights <- tidy_flights %>% gather("City", "Count", 3:7)
colnames(tidy_flights) <- c("Airline", "Status", "City", "Count")
tidy_flights$City <- str_replace(tidy_flights$City, "\\.", " ")
tidy_flights %>% arrange(Airline, City)
## Airline Status City Count
## 1 ALASKA on time Los Angeles 497
## 2 ALASKA delayed Los Angeles 62
## 3 ALASKA on time Phonex 221
## 4 ALASKA delayed Phonex 12
## 5 ALASKA on time San Diego 212
## 6 ALASKA delayed San Diego 20
## 7 ALASKA on time San Francisco 503
## 8 ALASKA delayed San Francisco 102
## 9 ALASKA on time Seattle 1841
## 10 ALASKA delayed Seattle 305
## 11 AM WEST on time Los Angeles 694
## 12 AM WEST delayed Los Angeles 117
## 13 AM WEST on time Phonex 4840
## 14 AM WEST delayed Phonex 415
## 15 AM WEST on time San Diego 383
## 16 AM WEST delayed San Diego 65
## 17 AM WEST on time San Francisco 320
## 18 AM WEST delayed San Francisco 129
## 19 AM WEST on time Seattle 201
## 20 AM WEST delayed Seattle 61
Looking at the data just by the city we notice
library(ggplot2)
library(tibble)
Total_OnTime <- tidy_flights %>% filter(Status == "on time") %>% select("City", "Count") %>% group_by(City) %>% summarise(Total_OnTime = sum(Count))
Total_Delayed <- tidy_flights %>% filter(Status == "delayed") %>% select("City", "Count") %>% group_by(City) %>% summarise(Total_Delayed = sum(Count))
City_Totals <- inner_join(Total_OnTime, Total_Delayed, by = "City")
City_Totals$Ontime_Rate <- City_Totals$Total_OnTime/(City_Totals$Total_OnTime + City_Totals$Total_Delayed)
City_Totals
## # A tibble: 5 x 4
## City Total_OnTime Total_Delayed Ontime_Rate
## <chr> <int> <int> <dbl>
## 1 Los Angeles 1191 179 0.869
## 2 Phonex 5061 427 0.922
## 3 San Diego 595 85 0.875
## 4 San Francisco 823 231 0.781
## 5 Seattle 2042 366 0.848
City_Totals_Sum <- City_Totals %>% gather("Status", "Count", 2:3)
ggplot(City_Totals_Sum, aes(x = City_Totals_Sum$City, y = City_Totals_Sum$Count)) +
geom_bar(aes(fill = City_Totals_Sum$Status), position = "dodge", stat="identity") +
xlab("Airports") +
ylab("Count") +
labs(fill = "Legend")
Looking at the data just by Airline, we learn
library(tibble)
tidy_flights2 <- tidy_flights %>% spread(Status, Count)
colnames(tidy_flights2) <- c("Airline", "City", "Delayed", "OnTime")
alaska <- tidy_flights2 %>% filter(Airline == "ALASKA")
amwest <- tidy_flights2 %>% filter(Airline == "AM WEST")
Airline_Totals <- rbind(alaska %>% select("Airline", "Delayed", "OnTime") %>% summarise(Airline = unique(Airline), Delayed = sum(Delayed), OnTime = sum(OnTime)),
amwest %>% select("Airline", "Delayed", "OnTime") %>% summarise(Airline = unique(Airline), Delayed = sum(Delayed), OnTime = sum(OnTime))
)
Airline_Totals %>% gather(Status, Count, -Airline) %>%
ggplot(aes(Airline, Count, fill = Status)) +
geom_col(position = "dodge", color = "black") + #adding black outline to bar
facet_wrap(~Airline, scales = "free_x") +
scale_fill_manual(values = c("plum1", "slategray1")) + #adding custom colors
theme_bw() + #plot background theme
geom_text(aes(label = Count), position = position_dodge(.9), vjust = 1.5) #bar counts
add_column(Airline_Totals, Delay_Percent = Airline_Totals$Delayed/(rowSums(Airline_Totals[2:3])))
## Airline Delayed OnTime Delay_Percent
## 1 ALASKA 501 3274 0.1327152
## 2 AM WEST 787 6438 0.1089273
When comparing both Airline and City we learn that
library(dplyr)
library(tibble)
library(RColorBrewer)
tidy_flights3 <- add_column(bind_rows(alaska,amwest), Delay_Percent = tidy_flights2$Delayed/rowSums(tidy_flights2[3:4])) %>% arrange(Delay_Percent)
tidy_flights3
## Airline City Delayed OnTime Delay_Percent
## 1 ALASKA Phonex 12 221 0.05150215
## 2 AM WEST Phonex 415 4840 0.07897241
## 3 ALASKA San Diego 20 212 0.08620690
## 4 ALASKA Los Angeles 62 497 0.11091234
## 5 ALASKA Seattle 305 1841 0.14212488
## 6 AM WEST Los Angeles 117 694 0.14426634
## 7 AM WEST San Diego 65 383 0.14508929
## 8 ALASKA San Francisco 102 503 0.16859504
## 9 AM WEST Seattle 61 201 0.23282443
## 10 AM WEST San Francisco 129 320 0.28730512
tidy_flights3 %>% ggplot(aes(x = City, y = Delay_Percent, color=Airline)) +
geom_point(aes(fill = Airline), size=5) +
scale_color_brewer(palette = "Accent")
Based on the limited analysis