library(dplyr)
library(tidyr)
library(ggplot2)
library(stringr)
I manually typed the dataset into a .csv file via vim which I then pushed to GitHub. I kept the formatting of the table to reflect the ‘wide’ structure it was presented in. Reading into our R dataframe from the remote repo.
data_url <- "https://raw.githubusercontent.com/andrewbowen19/cunyDATA607/main/data/flight-arrival-delays.csv"
df <- read.delim(data_url, header=TRUE, sep=",")
df
## Carrier Status Los.Angeles Phoenix San.Diego San.Fransisco Seattle
## 1 Alaska on time 497 221 212 503 1841
## 2 delayed 62 12 20 102 305
## 3 NA NA NA NA NA
## 4 AMWEST on time 694 4840 383 320 201
## 5 delayed 117 415 65 129 61
Removing/re-formatting some of the missing values in our raw dataset.
# Removing empty row with NA values
df <- df %>%
drop_na(Los.Angeles, Phoenix, San.Diego, San.Fransisco, Seattle)
# Filling in missing carrier vals
df[2, "Carrier"] <- "Alaska"
df[4, "Carrier"] <- "AMWEST"
df
## Carrier Status Los.Angeles Phoenix San.Diego San.Fransisco Seattle
## 1 Alaska on time 497 221 212 503 1841
## 2 Alaska delayed 62 12 20 102 305
## 3 AMWEST on time 694 4840 383 320 201
## 4 AMWEST delayed 117 415 65 129 61
It looks like we’d be able to describe this dataset across 4 key dimensions: - Airline (ALASKA or AMWEST) - Status (on time vs delayed) - Destination (e.g, Los Angeles, Phoenix, etc.) - Arrival Delay Count (how many flights arrived at those airports)
Using tidyr’s
gather function which transforms our dataset into
key-value pairs across multiple columns. This transforms our dataframe
into a nice, clean ‘long’ format which we can use more easily in our
analysis
df <- tidyr::gather(df, "dest", "arrival_delays", 3:7)
head(df)
## Carrier Status dest arrival_delays
## 1 Alaska on time Los.Angeles 497
## 2 Alaska delayed Los.Angeles 62
## 3 AMWEST on time Los.Angeles 694
## 4 AMWEST delayed Los.Angeles 117
## 5 Alaska on time Phoenix 221
## 6 Alaska delayed Phoenix 12
# Renaming our columns to snake_case
df <- df %>% rename(carrier=Carrier, status=Status, destination = dest)
# Replacing city names with proper spelling
df <- df %>%
mutate(across('destination', str_replace, "\\.", " "))
df
## carrier status destination arrival_delays
## 1 Alaska on time Los Angeles 497
## 2 Alaska delayed Los Angeles 62
## 3 AMWEST on time Los Angeles 694
## 4 AMWEST delayed Los Angeles 117
## 5 Alaska on time Phoenix 221
## 6 Alaska delayed Phoenix 12
## 7 AMWEST on time Phoenix 4840
## 8 AMWEST delayed Phoenix 415
## 9 Alaska on time San Diego 212
## 10 Alaska delayed San Diego 20
## 11 AMWEST on time San Diego 383
## 12 AMWEST delayed San Diego 65
## 13 Alaska on time San Fransisco 503
## 14 Alaska delayed San Fransisco 102
## 15 AMWEST on time San Fransisco 320
## 16 AMWEST delayed San Fransisco 129
## 17 Alaska on time Seattle 1841
## 18 Alaska delayed Seattle 305
## 19 AMWEST on time Seattle 201
## 20 AMWEST delayed Seattle 61
# Let's see the total delays for each airline
delays_by_airline <- df %>%
filter(status == "delayed") %>%
group_by(carrier) %>%
summarise(
total_delays=sum(arrival_delays)
)
delays_by_airline
## # A tibble: 2 × 2
## carrier total_delays
## <chr> <int>
## 1 Alaska 501
## 2 AMWEST 787
Plotting the delayed arrivals by destination
delays <- df %>% filter(status == "delayed") %>%
group_by(destination) %>%
summarise(total_delays = sum(arrival_delays))
ggplot(delays, aes(x=destination, y=total_delays, fill=factor(destination))) + geom_bar(stat='identity')
Plotting the all arrivals (delayed and on time) for all flights across
each airline.
delays_by_airline_status <- df %>%
group_by(carrier, status) %>%
summarise(total_delays = sum(arrival_delays))
## `summarise()` has grouped output by 'carrier'. You can override using the
## `.groups` argument.
delays_by_airline_status
## # A tibble: 4 × 3
## # Groups: carrier [2]
## carrier status total_delays
## <chr> <chr> <int>
## 1 Alaska delayed 501
## 2 Alaska on time 3274
## 3 AMWEST delayed 787
## 4 AMWEST on time 6438
ggplot(delays_by_airline_status, aes(x=interaction(carrier, status), y=total_delays, fill=factor(carrier))) + geom_bar(stat='identity')
Let’s also look at delayed flights at each destination by airline
delays2 <- df %>%
filter(status=='delayed')
ggplot(delays2, aes( x=destination, y=arrival_delays, fill=carrier)) +
geom_bar(position="dodge", stat="identity")