Import the Necessary Libraries and Import the Data
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.3 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.3 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
library(tidyr)
library(readr)
df <- read_csv("https://raw.githubusercontent.com/sphill12/DATA607/main/Week_5_data_607.csv")
## New names:
## Rows: 5 Columns: 7
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (2): ...1, ...2 dbl (5): Los Angeles, Phoenix, San Diego, San Francisco,
## Seattle
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`
## • `` -> `...2`
head(df)
## # A tibble: 5 × 7
## ...1 ...2 `Los Angeles` Phoenix `San Diego` `San Francisco` Seattle
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 ALASKA on time 497 221 212 503 1841
## 2 <NA> delayed 62 12 20 102 305
## 3 <NA> <NA> NA NA NA NA NA
## 4 AM WEST on time 694 4840 383 320 201
## 5 <NA> delayed 117 415 65 129 61
Rather than have a column for each location, I made all locations
into one column, and added a “total” column to display the values that
were in each column.
longer_df <- pivot_longer(df, cols = 3:7, names_to = "Location", values_to = "total")
head(longer_df)
## # A tibble: 6 × 4
## airline delay Location total
## <chr> <chr> <chr> <dbl>
## 1 ALASKA on time Los Angeles 497
## 2 ALASKA on time Phoenix 221
## 3 ALASKA on time San Diego 212
## 4 ALASKA on time San Francisco 503
## 5 ALASKA on time Seattle 1841
## 6 ALASKA delayed Los Angeles 62
For visualizations, I first displayed the difference in total delays
and on times between the two airlines
airline_group <- data.frame(longer_df %>% group_by(airline, delay) %>% summarize(total = sum(total)))
## `summarise()` has grouped output by 'airline'. You can override using the
## `.groups` argument.
airline_group
## airline delay total
## 1 ALASKA delayed 501
## 2 ALASKA on time 3274
## 3 AM WEST delayed 787
## 4 AM WEST on time 6438
ggplot(data = airline_group, aes(x = airline, y = total, fill = delay)) +
geom_bar(stat = "identity", position = position_dodge()) +
geom_text(aes(label= airline_group$total), position=position_dodge(width=0.9), vjust=-0.25)
## Warning: Use of `airline_group$total` is discouraged.
## ℹ Use `total` instead.

airline_number <- airline_group[airline_group$total == "delayed"]
I wanted to show the ratio of delays to on time counts. Alaska has a
higher proportion of delays.
airline_group %>%
group_by(airline) %>%
summarise(delayed_to_ontime_ratio = sum(total[delay == "delayed"]) / sum(total[delay == "on time"]))
## # A tibble: 2 × 2
## airline delayed_to_ontime_ratio
## <chr> <dbl>
## 1 ALASKA 0.153
## 2 AM WEST 0.122
airline_number <- airline_group$total[airline_group$delay == "delayed"] / airline_group$total[airline_group$delay == "on time"]
airline_number
## [1] 0.1530238 0.1222429
Here I showed the delays and on times for each location. We can see
that Pheonix contains much more traffic than other locations. Overall
San Fransisco appears to be the worst location to have flights, as the
delays are quite high compared to on time flights.
location_group <- data.frame(longer_df %>% group_by(Location, delay) %>% summarise(total = sum(total)))
## `summarise()` has grouped output by 'Location'. You can override using the
## `.groups` argument.
ggplot(data = location_group, aes(x = Location, y = total, fill = delay)) +
geom_bar(stat = "identity", position = position_dodge()) +
geom_text(aes(label=total), position=position_dodge(width=0.9), vjust=-0.25)

location_group
## Location delay total
## 1 Los Angeles delayed 179
## 2 Los Angeles on time 1191
## 3 Phoenix delayed 427
## 4 Phoenix on time 5061
## 5 San Diego delayed 85
## 6 San Diego on time 595
## 7 San Francisco delayed 231
## 8 San Francisco on time 823
## 9 Seattle delayed 366
## 10 Seattle on time 2042
I separated the data into two plots for locations. One that shows
the delays, and one that shows the on time for each airline. We can see
that Pheonix is a hot area for AM west, while Seattle is a hot area for
Alaska.
delayed = longer_df %>% filter(delay == "delayed")
ggplot(data = delayed, aes(x = Location, y = total, fill = airline)) +
geom_bar(stat = "identity", position = position_dodge()) +
geom_text(aes(label=total), position=position_dodge(width=0.9), vjust=-0.25)

on_time = longer_df %>% filter(delay == "on time")
ggplot(data = on_time, aes(x = Location, y = total, fill = airline)) +
geom_bar(stat = "identity", position = position_dodge()) +
geom_text(aes(label=total), position=position_dodge(width=0.9), vjust=-0.25)

location_ratios <- data.frame(longer_df %>%
group_by(Location, airline) %>%
summarise(delayed_to_ontime_ratio = (sum(total[delay == "delayed"]) / sum(total[delay == "on time"]))))
## `summarise()` has grouped output by 'Location'. You can override using the
## `.groups` argument.