library(tidyr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ readr 2.1.5
## ✔ ggplot2 3.4.4 ✔ stringr 1.5.1
## ✔ lubridate 1.9.3 ✔ tibble 3.2.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
Creating a .CSV file and Read data
#create CSV
csv <- rbind(c(NA,NA , "Los Angeles", "Phoenix", "San Diego", "San Francisco", "Seattle"),
c("ALASKA", "On Time", 497, 221, 212, 503, 1841),
c(NA, "Delayed", 62, 12, 20, 102, 305),
c("AM WEST", "On Time", 694, 4840, 383, 320, 201),
c(NA, "Delayed", 117, 415, 65, 129, 61))
write.table(csv, file = "Wk3Assignment_Data_607.csv", sep = ",", col.names=F, row.names=F)
#read data using dplyr and tidyr
flight_data <- read.csv(paste0("Wk3Assignment_Data_607.csv"),
stringsAsFactors = F); flight_data
## NA. NA..1 Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1 ALASKA On Time 497 221 212 503 1841
## 2 <NA> Delayed 62 12 20 102 305
## 3 AM WEST On Time 694 4840 383 320 201
## 4 <NA> Delayed 117 415 65 129 61
Tidying Data
#tidy by Airline and Status (delay or on time) with use of tidyr and dplyr
long_data2 <- flight_data %>%
mutate(Airlines1 = NA., Airlines2=lag(NA.)) %>%
mutate(Airline = coalesce(Airlines1,Airlines2), Status = NA..1) %>%
gather("Destination", "Flights", 3:7) %>%
select(Airline:Flights) %>%
arrange(Airline, desc(Status), Destination); long_data2
## Airline Status Destination Flights
## 1 ALASKA On Time Los.Angeles 497
## 2 ALASKA On Time Phoenix 221
## 3 ALASKA On Time San.Diego 212
## 4 ALASKA On Time San.Francisco 503
## 5 ALASKA On Time Seattle 1841
## 6 ALASKA Delayed Los.Angeles 62
## 7 ALASKA Delayed Phoenix 12
## 8 ALASKA Delayed San.Diego 20
## 9 ALASKA Delayed San.Francisco 102
## 10 ALASKA Delayed Seattle 305
## 11 AM WEST On Time Los.Angeles 694
## 12 AM WEST On Time Phoenix 4840
## 13 AM WEST On Time San.Diego 383
## 14 AM WEST On Time San.Francisco 320
## 15 AM WEST On Time Seattle 201
## 16 AM WEST Delayed Los.Angeles 117
## 17 AM WEST Delayed Phoenix 415
## 18 AM WEST Delayed San.Diego 65
## 19 AM WEST Delayed San.Francisco 129
## 20 AM WEST Delayed Seattle 61
#tidy by cities
flightdata2 <- flight_data %>% gather("city","no.flights", 3:7)
flightdata2
## NA. NA..1 city no.flights
## 1 ALASKA On Time Los.Angeles 497
## 2 <NA> Delayed Los.Angeles 62
## 3 AM WEST On Time Los.Angeles 694
## 4 <NA> Delayed Los.Angeles 117
## 5 ALASKA On Time Phoenix 221
## 6 <NA> Delayed Phoenix 12
## 7 AM WEST On Time Phoenix 4840
## 8 <NA> Delayed Phoenix 415
## 9 ALASKA On Time San.Diego 212
## 10 <NA> Delayed San.Diego 20
## 11 AM WEST On Time San.Diego 383
## 12 <NA> Delayed San.Diego 65
## 13 ALASKA On Time San.Francisco 503
## 14 <NA> Delayed San.Francisco 102
## 15 AM WEST On Time San.Francisco 320
## 16 <NA> Delayed San.Francisco 129
## 17 ALASKA On Time Seattle 1841
## 18 <NA> Delayed Seattle 305
## 19 AM WEST On Time Seattle 201
## 20 <NA> Delayed Seattle 61
Performing analysis to compare the arrival delays for the two airlines
Perform <- long_data2 %>%
group_by(Airline) %>%
mutate(A_Total = sum(Flights)) %>%
group_by(Airline, Status) %>%
mutate(S_Total = sum(Flights), A_Perf = S_Total / A_Total) %>%
group_by(Airline, Destination) %>%
mutate(D_Total = sum(Flights), D_Perf = Flights / D_Total)
# both airlines AL and AM Overall Performance
data.frame(Perform[c(1,10,11,20), c(1,2,7)])
## Airline Status A_Perf
## 1 ALASKA On Time 0.8672848
## 2 ALASKA Delayed 0.1327152
## 3 AM WEST On Time 0.8910727
## 4 AM WEST Delayed 0.1089273
# Alaska Airlines and AM West by Destination
Perf_by_Destination <- data.frame(Perform %>%
filter(Status == "On Time") %>%
group_by(Destination) %>%
unite(temp, D_Total, D_Perf) %>%
spread(Status, temp) %>%
select(Destination, Airline, On_Time = `On Time`) %>%
spread(Airline, On_Time) %>%
separate(ALASKA, into=c("Flights.AL", "Perf.AL"), sep = "_", convert=T) %>%
separate(`AM WEST`, into=c("Flights.AM", "Perf.AM"), sep = "_", convert=T) %>%
mutate(Difference = Perf.AL - Perf.AM) %>%
arrange(Destination)); Perf_by_Destination
## Destination Flights.AL Perf.AL Flights.AM Perf.AM Difference
## 1 Los.Angeles 559 0.8890877 811 0.8557337 0.03335399
## 2 Phoenix 233 0.9484979 5255 0.9210276 0.02747026
## 3 San.Diego 232 0.9137931 448 0.8549107 0.05888239
## 4 San.Francisco 605 0.8314050 449 0.7126949 0.11871008
## 5 Seattle 2146 0.8578751 262 0.7671756 0.09069954