Open csv file from github
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.4.4 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
library(ggplot2)
Flights <- read.csv("https://raw.githubusercontent.com/Angelogallardo05/assignment-05/main/DATA605_05.csv")
head(Flights)
## X X.1 Los.Angeles Phenix San.Diego San.Francisco Seattle
## 1 ALASKA on time 497 221 212 503 1841
## 2 delayed 62 12 20 102 305
## 3 NA NA NA NA NA
## 4 AM WEST on time 694 4840 383 320 201
## 5 delayed 117 415 65 129 61
Flights_clean <- na.omit(Flights)
head(Flights_clean)
## X X.1 Los.Angeles Phenix San.Diego San.Francisco Seattle
## 1 ALASKA on time 497 221 212 503 1841
## 2 delayed 62 12 20 102 305
## 4 AM WEST on time 694 4840 383 320 201
## 5 delayed 117 415 65 129 61
Add the Airlines name to the missing rows
# Duplicate Alaska into the second row
Flights_clean[2, "X"] <- Flights_clean[1, "X"]
# Duplicate AM WEST into the fourth row
Flights_clean[4, "X"] <- Flights_clean[3, "X"]
# Display the modified dataset
Flights_clean
## X X.1 Los.Angeles Phenix San.Diego San.Francisco Seattle
## 1 ALASKA on time 497 221 212 503 1841
## 2 ALASKA delayed 62 12 20 102 305
## 4 AM WEST on time 694 4840 383 320 201
## 5 AM WEST delayed 117 415 65 129 61
COnvert the coloumns of cities into rows and values into one column
# Convert columns 3-7 into rows
Flights_data <- Flights_clean %>%
pivot_longer(cols = c(Los.Angeles:Seattle),
names_to = "City",
values_to = "Value")
# Display the modified dataset
Flights_data
## # A tibble: 20 × 4
## X X.1 City Value
## <chr> <chr> <chr> <int>
## 1 ALASKA on time Los.Angeles 497
## 2 ALASKA on time Phenix 221
## 3 ALASKA on time San.Diego 212
## 4 ALASKA on time San.Francisco 503
## 5 ALASKA on time Seattle 1841
## 6 ALASKA delayed Los.Angeles 62
## 7 ALASKA delayed Phenix 12
## 8 ALASKA delayed San.Diego 20
## 9 ALASKA delayed San.Francisco 102
## 10 ALASKA delayed Seattle 305
## 11 AM WEST on time Los.Angeles 694
## 12 AM WEST on time Phenix 4840
## 13 AM WEST on time San.Diego 383
## 14 AM WEST on time San.Francisco 320
## 15 AM WEST on time Seattle 201
## 16 AM WEST delayed Los.Angeles 117
## 17 AM WEST delayed Phenix 415
## 18 AM WEST delayed San.Diego 65
## 19 AM WEST delayed San.Francisco 129
## 20 AM WEST delayed Seattle 61
Rename first two columns
Flights_data <- Flights_data %>%
rename(
Airline = X,
Status = X.1,
Amount = Value)
Flights_data
## # A tibble: 20 × 4
## Airline Status City Amount
## <chr> <chr> <chr> <int>
## 1 ALASKA on time Los.Angeles 497
## 2 ALASKA on time Phenix 221
## 3 ALASKA on time San.Diego 212
## 4 ALASKA on time San.Francisco 503
## 5 ALASKA on time Seattle 1841
## 6 ALASKA delayed Los.Angeles 62
## 7 ALASKA delayed Phenix 12
## 8 ALASKA delayed San.Diego 20
## 9 ALASKA delayed San.Francisco 102
## 10 ALASKA delayed Seattle 305
## 11 AM WEST on time Los.Angeles 694
## 12 AM WEST on time Phenix 4840
## 13 AM WEST on time San.Diego 383
## 14 AM WEST on time San.Francisco 320
## 15 AM WEST on time Seattle 201
## 16 AM WEST delayed Los.Angeles 117
## 17 AM WEST delayed Phenix 415
## 18 AM WEST delayed San.Diego 65
## 19 AM WEST delayed San.Francisco 129
## 20 AM WEST delayed Seattle 61
Create a bar chart of delayed to total by airline in %
Flights_data <- Flights_data %>%
group_by( Airline) %>%
mutate(Percentage = Amount / sum(Amount) * 100)
ggplot(data= Flights_data, aes(x= Airline, y = Percentage, fill = Status))+
geom_bar(stat = "identity")
# Find the city with the most delays for each airline
Most_delayed_city <- Flights_data %>%
filter(Status == "delayed") %>%
group_by(Airline, City) %>%
summarise(Total_delays = Amount) %>%
top_n(3, Total_delays) # Select the city with the most delays for each airline
## `summarise()` has grouped output by 'Airline'. You can override using the
## `.groups` argument.
Most_delayed_city
## # A tibble: 6 × 3
## # Groups: Airline [2]
## Airline City Total_delays
## <chr> <chr> <int>
## 1 ALASKA Los.Angeles 62
## 2 ALASKA San.Francisco 102
## 3 ALASKA Seattle 305
## 4 AM WEST Los.Angeles 117
## 5 AM WEST Phenix 415
## 6 AM WEST San.Francisco 129
# Create the bar chart
ggplot(data = Most_delayed_city, aes(x = City, y = Total_delays, fill = Airline)) +
geom_bar(stat = "identity") +
labs(title = "Most Delayed City by Airline",
x = "City",
y = "Amount") +
theme_minimal()