library (tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.2 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.2 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(readr)
Using Github to extra my data to be analyze
data <- read.csv("https://raw.githubusercontent.com/MAB592/Data-607-Assignments/main/Airline_wk5%20-%20Sheet1.csv")
print (data)
## Airline Flight.Status Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1 ALASKA on time 497 221 212 503 1841
## 2 <NA> delayed 62 12 20 102 305
## 3 AM WEST on time 694 4840 383 320 201
## 4 <NA> delayed 117 415 65 129 61
Here I am renaming the columns where I see fit
rename_data <- data %>%
rename("Flight Status" = Flight.Status,
"Los Angeles" = Los.Angeles,
"San Diego" = San.Diego,
"San Francisco" = San.Francisco
)
print(rename_data)
## Airline Flight Status Los Angeles Phoenix San Diego San Francisco Seattle
## 1 ALASKA on time 497 221 212 503 1841
## 2 <NA> delayed 62 12 20 102 305
## 3 AM WEST on time 694 4840 383 320 201
## 4 <NA> delayed 117 415 65 129 61
I am now converting my data to a more longer format to make it easier to graph my data
long_data <- rename_data %>%
pivot_longer (
cols = c("Los Angeles", "Phoenix", "San Diego", "San Francisco", "Seattle"),
names_to = "Location",
values_to = "Amount of flights"
)
print(long_data)
## # A tibble: 20 × 4
## Airline `Flight Status` Location `Amount of flights`
## <chr> <chr> <chr> <int>
## 1 ALASKA "on time " Los Angeles 497
## 2 ALASKA "on time " Phoenix 221
## 3 ALASKA "on time " San Diego 212
## 4 ALASKA "on time " San Francisco 503
## 5 ALASKA "on time " Seattle 1841
## 6 <NA> "delayed " Los Angeles 62
## 7 <NA> "delayed " Phoenix 12
## 8 <NA> "delayed " San Diego 20
## 9 <NA> "delayed " San Francisco 102
## 10 <NA> "delayed " Seattle 305
## 11 AM WEST "on time " Los Angeles 694
## 12 AM WEST "on time " Phoenix 4840
## 13 AM WEST "on time " San Diego 383
## 14 AM WEST "on time " San Francisco 320
## 15 AM WEST "on time " Seattle 201
## 16 <NA> "delayed " Los Angeles 117
## 17 <NA> "delayed " Phoenix 415
## 18 <NA> "delayed " San Diego 65
## 19 <NA> "delayed " San Francisco 129
## 20 <NA> "delayed " Seattle 61
Filling the missing data from the Airline column that contains NA using the mutate function
final_data <- long_data %>%
mutate(Airline = as.character(na_if(Airline, 'NA'))) %>%
fill(Airline, .direction = 'down')
print(final_data)
## # A tibble: 20 × 4
## Airline `Flight Status` Location `Amount of flights`
## <chr> <chr> <chr> <int>
## 1 ALASKA "on time " Los Angeles 497
## 2 ALASKA "on time " Phoenix 221
## 3 ALASKA "on time " San Diego 212
## 4 ALASKA "on time " San Francisco 503
## 5 ALASKA "on time " Seattle 1841
## 6 ALASKA "delayed " Los Angeles 62
## 7 ALASKA "delayed " Phoenix 12
## 8 ALASKA "delayed " San Diego 20
## 9 ALASKA "delayed " San Francisco 102
## 10 ALASKA "delayed " Seattle 305
## 11 AM WEST "on time " Los Angeles 694
## 12 AM WEST "on time " Phoenix 4840
## 13 AM WEST "on time " San Diego 383
## 14 AM WEST "on time " San Francisco 320
## 15 AM WEST "on time " Seattle 201
## 16 AM WEST "delayed " Los Angeles 117
## 17 AM WEST "delayed " Phoenix 415
## 18 AM WEST "delayed " San Diego 65
## 19 AM WEST "delayed " San Francisco 129
## 20 AM WEST "delayed " Seattle 61
Now filtering for delayed flights. There was an error when I was just using the filter function where it returned 0, so I used the function trimws in order to produce my required table.
final_data$`Flight Status` <- trimws(final_data$`Flight Status`)
delayed_data <- final_data %>%
filter(`Flight Status` == "delayed")
print(delayed_data)
## # A tibble: 10 × 4
## Airline `Flight Status` Location `Amount of flights`
## <chr> <chr> <chr> <int>
## 1 ALASKA delayed Los Angeles 62
## 2 ALASKA delayed Phoenix 12
## 3 ALASKA delayed San Diego 20
## 4 ALASKA delayed San Francisco 102
## 5 ALASKA delayed Seattle 305
## 6 AM WEST delayed Los Angeles 117
## 7 AM WEST delayed Phoenix 415
## 8 AM WEST delayed San Diego 65
## 9 AM WEST delayed San Francisco 129
## 10 AM WEST delayed Seattle 61
Using ggplot to look at the data for our delayed flights we see that AM West has the larger amount of delayed flights
g <- ggplot(delayed_data,aes(x = `Airline`,y = `Amount of flights`,fill = Airline ))+
geom_bar(stat='identity' ) +
labs(
title = "Airlines with Delayed Flights",
x = "Airline",
y = "Amount of Delayed Flights"
)
print (g)
Now in order to gain some insights using the statistical packages in R I
made the delayed text have its own column in order to look at the
different locations and see location has the most delays. As we can see
Phoenix has the highest average delays among both airlines.
delayed_statistics <- delayed_data %>%
pivot_wider(names_from = `Flight Status`, values_from = `Amount of flights`)
print(delayed_statistics)
## # A tibble: 10 × 3
## Airline Location delayed
## <chr> <chr> <int>
## 1 ALASKA Los Angeles 62
## 2 ALASKA Phoenix 12
## 3 ALASKA San Diego 20
## 4 ALASKA San Francisco 102
## 5 ALASKA Seattle 305
## 6 AM WEST Los Angeles 117
## 7 AM WEST Phoenix 415
## 8 AM WEST San Diego 65
## 9 AM WEST San Francisco 129
## 10 AM WEST Seattle 61
delayed_statistics %>%
group_by(Location) %>%
dplyr::summarise(max = max(delayed),
min=min(delayed),
mean=mean(delayed),
median=median(delayed)
)