Install and load in the neccesary libraries
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(openintro)
## Loading required package: airports
## Loading required package: cherryblossom
## Loading required package: usdata
library(dplyr)
library(ggplot2)
Load the data into a data frame and display
flights_data_wide <- read.csv('https://raw.githubusercontent.com/ErickH1/DATA607Assignment4/refs/heads/main/data607hw4data.csv')
head(flights_data_wide)
## X X.1 Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1 ALASKA on time 497 221 212 503 1841
## 2 delayed 62 12 20 102 305
## 3 NA NA NA NA NA
## 4 AM WEST on time 694 4840 383 320 201
## 5 delayed 117 415 65 129 61
glimpse(flights_data_wide)
## Rows: 5
## Columns: 7
## $ X <chr> "ALASKA", "", "", "AM WEST", ""
## $ X.1 <chr> "on time", "delayed", "", "on time", "delayed"
## $ Los.Angeles <int> 497, 62, NA, 694, 117
## $ Phoenix <int> 221, 12, NA, 4840, 415
## $ San.Diego <int> 212, 20, NA, 383, 65
## $ San.Francisco <int> 503, 102, NA, 320, 129
## $ Seattle <int> 1841, 305, NA, 201, 61
Tidying up the data by removing na values
flights_data_wide <- flights_data_wide %>%
drop_na() %>%
rename(airline = X, status = X.1)
flights_data_wide <- flights_data_wide %>%
mutate(airline = na_if(airline, '')) %>%
fill(airline, .direction = c('down'))
Making the data frame long
flights_data_long <- flights_data_wide %>%
pivot_longer(cols = c("Los.Angeles", "Phoenix", "San.Diego", "San.Francisco", "Seattle"),
names_to = "city",
values_to = "count")
head(flights_data_long)
## # A tibble: 6 × 4
## airline status city count
## <chr> <chr> <chr> <int>
## 1 ALASKA on time Los.Angeles 497
## 2 ALASKA on time Phoenix 221
## 3 ALASKA on time San.Diego 212
## 4 ALASKA on time San.Francisco 503
## 5 ALASKA on time Seattle 1841
## 6 ALASKA delayed Los.Angeles 62
Finally tidying the data to the desired dataframe
flights_data_tidy <- flights_data_long %>%
pivot_wider(names_from = status,
values_from = count)
names(flights_data_tidy)[3] <- "on_time"
head(flights_data_tidy)
## # A tibble: 6 × 4
## airline city on_time delayed
## <chr> <chr> <int> <int>
## 1 ALASKA Los.Angeles 497 62
## 2 ALASKA Phoenix 221 12
## 3 ALASKA San.Diego 212 20
## 4 ALASKA San.Francisco 503 102
## 5 ALASKA Seattle 1841 305
## 6 AM WEST Los.Angeles 694 117
Adding delay and on time rate as their own columns
flights_data_tidy <- flights_data_tidy %>%
mutate(delayed_rate = delayed / (on_time + delayed))
flights_data_tidy <- flights_data_tidy %>%
mutate(ontime_rate = on_time / (on_time + delayed))
glimpse(flights_data_tidy)
## Rows: 10
## Columns: 6
## $ airline <chr> "ALASKA", "ALASKA", "ALASKA", "ALASKA", "ALASKA", "AM WES…
## $ city <chr> "Los.Angeles", "Phoenix", "San.Diego", "San.Francisco", "…
## $ on_time <int> 497, 221, 212, 503, 1841, 694, 4840, 383, 320, 201
## $ delayed <int> 62, 12, 20, 102, 305, 117, 415, 65, 129, 61
## $ delayed_rate <dbl> 0.11091234, 0.05150215, 0.08620690, 0.16859504, 0.1421248…
## $ ontime_rate <dbl> 0.8890877, 0.9484979, 0.9137931, 0.8314050, 0.8578751, 0.…
Create and display plots
ggplot(flights_data_tidy, aes(fill= airline, y = delayed_rate, x = city)) +
geom_bar(position="dodge", stat = "identity")
ggplot(flights_data_tidy, aes(fill= airline, y = ontime_rate, x = city)) +
geom_bar(position="dodge", stat = "identity")
In conclusion, based on the analysis on the two airlines, AM West has a higher delayed rate in every arrival in a city.