library(tidyr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
raw_data_path<-("https://raw.githubusercontent.com/amedina613/Data-607-Week-5-Assignment/main/Data%20607%20week%205%20raw.csv")
raw_data <- read.csv(raw_data_path, header=T)
print(raw_data)
## X X.1 Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1 Alaska on time 497 221 212 503 1841
## 2 delayed 62 12 20 102 305
## 3 NA NA NA NA NA
## 4 AM West on time 694 4840 383 320 201
## 5 delayed 117 415 65 129 61
My goal is to have a final data set with the following column names: Airline, Destination, On time, and Delayed.
I will start off by removing that 3rd row.
raw_data <- slice(raw_data, -3)
print(raw_data)
## X X.1 Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1 Alaska on time 497 221 212 503 1841
## 2 delayed 62 12 20 102 305
## 3 AM West on time 694 4840 383 320 201
## 4 delayed 117 415 65 129 61
names(raw_data)[1:2] <- c("Airline", "Arrival")
raw_data[2, "Airline"] <- raw_data[1, "Airline"]
raw_data[4, "Airline"] <- raw_data[3, "Airline"]
print(raw_data)
## Airline Arrival Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1 Alaska on time 497 221 212 503 1841
## 2 Alaska delayed 62 12 20 102 305
## 3 AM West on time 694 4840 383 320 201
## 4 AM West delayed 117 415 65 129 61
I want to transpose the destination information and make a new column called “Destination”. I’ll use pivot_longer from tidyr for this.
raw_data <- raw_data %>%
pivot_longer(cols = c(Los.Angeles, Phoenix, San.Diego, San.Francisco, Seattle), names_to = "Destination", values_to = "num_on_time")
print(raw_data)
## # A tibble: 20 × 4
## Airline Arrival Destination num_on_time
## <chr> <chr> <chr> <int>
## 1 Alaska on time Los.Angeles 497
## 2 Alaska on time Phoenix 221
## 3 Alaska on time San.Diego 212
## 4 Alaska on time San.Francisco 503
## 5 Alaska on time Seattle 1841
## 6 Alaska delayed Los.Angeles 62
## 7 Alaska delayed Phoenix 12
## 8 Alaska delayed San.Diego 20
## 9 Alaska delayed San.Francisco 102
## 10 Alaska delayed Seattle 305
## 11 AM West on time Los.Angeles 694
## 12 AM West on time Phoenix 4840
## 13 AM West on time San.Diego 383
## 14 AM West on time San.Francisco 320
## 15 AM West on time Seattle 201
## 16 AM West delayed Los.Angeles 117
## 17 AM West delayed Phoenix 415
## 18 AM West delayed San.Diego 65
## 19 AM West delayed San.Francisco 129
## 20 AM West delayed Seattle 61
raw_data<- raw_data %>%
pivot_wider(names_from = Arrival, values_from = num_on_time)
print(raw_data)
## # A tibble: 10 × 4
## Airline Destination `on time` delayed
## <chr> <chr> <int> <int>
## 1 Alaska Los.Angeles 497 62
## 2 Alaska Phoenix 221 12
## 3 Alaska San.Diego 212 20
## 4 Alaska San.Francisco 503 102
## 5 Alaska Seattle 1841 305
## 6 AM West Los.Angeles 694 117
## 7 AM West Phoenix 4840 415
## 8 AM West San.Diego 383 65
## 9 AM West San.Francisco 320 129
## 10 AM West Seattle 201 61
airline_del <- ggplot(raw_data, aes(x = Airline, y = delayed, fill = Airline)) +
geom_bar(stat = "identity") +
labs(title = "Comparison of Delayed Flights for Different Airlines",
x = "Airline", y = "Number of Delayed Flights") +
theme_minimal()
print(airline_del)
AM West has significantly more flight delays than Alaska Airlines
airline_ot <- ggplot(raw_data, aes(x = Airline, y = `on time`, fill = Airline)) +
geom_bar(stat = "identity") +
labs(title = "Comparison of On Time Flights for Different Airlines",
x = "Airline", y = "On-time Flights") +
theme_minimal()
print(airline_ot)
AM West has significantly more on time arrivals than Alaska airlines.
airline_data <- raw_data %>%
group_by(Airline) %>%
summarise(
max = max(delayed),
min = min(delayed),
mean = mean(delayed),
median = median(delayed),
standard_deviation = sd(delayed)
)
print(airline_data)
## # A tibble: 2 × 6
## Airline max min mean median standard_deviation
## <chr> <int> <int> <dbl> <int> <dbl>
## 1 AM West 415 61 157. 117 147.
## 2 Alaska 305 12 100. 62 120.
AM west has much greater mean and median of delayed flights than Alaska airlines as well as higher maximum and minimum of delayed flights.