1. Create a .CSV file (or optionally, a MySQL database!) that includes all of the information above. You’re encouraged to use a “wide” structure similar to how the information appears above, so that you can practice tidying and transformations as described below.
  2. Read the information from your .CSV file into R, and use tidyr and dplyr as needed to tidy and transform your data.
  3. Perform analysis to compare the arrival delays for the two airlines.
  4. Your code should be in an R Markdown file, posted to rpubs.com, and should include narrative descriptions of your data cleanup work, analysis, and conclusions. Please include in your homework submission:
library(tidyr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ readr     2.1.5
## ✔ ggplot2   3.4.4     ✔ stringr   1.5.1
## ✔ lubridate 1.9.3     ✔ tibble    3.2.1
## ✔ purrr     1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

Creating a .CSV file and Read data

#create CSV
csv <- rbind(c(NA,NA , "Los Angeles", "Phoenix", "San Diego", "San Francisco", "Seattle"),
             c("ALASKA", "On Time", 497, 221, 212, 503, 1841),
             c(NA, "Delayed", 62, 12, 20, 102, 305),
             c("AM WEST", "On Time", 694, 4840, 383, 320, 201),
             c(NA, "Delayed", 117, 415, 65, 129, 61))

write.table(csv, file = "Wk3Assignment_Data_607.csv", sep = ",", col.names=F, row.names=F)

#read data using dplyr and tidyr
flight_data <- read.csv(paste0("Wk3Assignment_Data_607.csv"), 
                             stringsAsFactors = F); flight_data
##       NA.   NA..1 Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1  ALASKA On Time         497     221       212           503    1841
## 2    <NA> Delayed          62      12        20           102     305
## 3 AM WEST On Time         694    4840       383           320     201
## 4    <NA> Delayed         117     415        65           129      61

 

Tidying Data

#tidy by Airline and Status (delay or on time) with use of tidyr and dplyr
long_data2 <- flight_data %>%
mutate(Airlines1 = NA., Airlines2=lag(NA.)) %>% 
mutate(Airline = coalesce(Airlines1,Airlines2), Status = NA..1)  %>%
gather("Destination", "Flights", 3:7) %>% 
select(Airline:Flights) %>% 
arrange(Airline, desc(Status), Destination); long_data2
##    Airline  Status   Destination Flights
## 1   ALASKA On Time   Los.Angeles     497
## 2   ALASKA On Time       Phoenix     221
## 3   ALASKA On Time     San.Diego     212
## 4   ALASKA On Time San.Francisco     503
## 5   ALASKA On Time       Seattle    1841
## 6   ALASKA Delayed   Los.Angeles      62
## 7   ALASKA Delayed       Phoenix      12
## 8   ALASKA Delayed     San.Diego      20
## 9   ALASKA Delayed San.Francisco     102
## 10  ALASKA Delayed       Seattle     305
## 11 AM WEST On Time   Los.Angeles     694
## 12 AM WEST On Time       Phoenix    4840
## 13 AM WEST On Time     San.Diego     383
## 14 AM WEST On Time San.Francisco     320
## 15 AM WEST On Time       Seattle     201
## 16 AM WEST Delayed   Los.Angeles     117
## 17 AM WEST Delayed       Phoenix     415
## 18 AM WEST Delayed     San.Diego      65
## 19 AM WEST Delayed San.Francisco     129
## 20 AM WEST Delayed       Seattle      61
#tidy by cities
flightdata2 <- flight_data %>% gather("city","no.flights", 3:7)
flightdata2
##        NA.   NA..1          city no.flights
## 1   ALASKA On Time   Los.Angeles        497
## 2     <NA> Delayed   Los.Angeles         62
## 3  AM WEST On Time   Los.Angeles        694
## 4     <NA> Delayed   Los.Angeles        117
## 5   ALASKA On Time       Phoenix        221
## 6     <NA> Delayed       Phoenix         12
## 7  AM WEST On Time       Phoenix       4840
## 8     <NA> Delayed       Phoenix        415
## 9   ALASKA On Time     San.Diego        212
## 10    <NA> Delayed     San.Diego         20
## 11 AM WEST On Time     San.Diego        383
## 12    <NA> Delayed     San.Diego         65
## 13  ALASKA On Time San.Francisco        503
## 14    <NA> Delayed San.Francisco        102
## 15 AM WEST On Time San.Francisco        320
## 16    <NA> Delayed San.Francisco        129
## 17  ALASKA On Time       Seattle       1841
## 18    <NA> Delayed       Seattle        305
## 19 AM WEST On Time       Seattle        201
## 20    <NA> Delayed       Seattle         61


Performing analysis to compare the arrival delays for the two airlines

Perform <- long_data2 %>%
  group_by(Airline) %>%
  mutate(A_Total = sum(Flights)) %>% 
  group_by(Airline, Status) %>%
  mutate(S_Total = sum(Flights), A_Perf = S_Total / A_Total) %>% 
  group_by(Airline, Destination) %>%
  mutate(D_Total = sum(Flights), D_Perf = Flights / D_Total)

# both airlines AL and AM Overall Performance
data.frame(Perform[c(1,10,11,20), c(1,2,7)])
##   Airline  Status    A_Perf
## 1  ALASKA On Time 0.8672848
## 2  ALASKA Delayed 0.1327152
## 3 AM WEST On Time 0.8910727
## 4 AM WEST Delayed 0.1089273
# Alaska Airlines and AM West by Destination
Perf_by_Destination <- data.frame(Perform %>%
  filter(Status == "On Time") %>% 
  group_by(Destination) %>% 
  unite(temp, D_Total, D_Perf) %>%
  spread(Status, temp) %>%
  select(Destination, Airline, On_Time = `On Time`) %>% 
  spread(Airline, On_Time) %>%
  separate(ALASKA, into=c("Flights.AL", "Perf.AL"), sep = "_", convert=T) %>% 
  separate(`AM WEST`, into=c("Flights.AM", "Perf.AM"), sep = "_", convert=T) %>% 
  mutate(Difference = Perf.AL - Perf.AM)  %>% 
  arrange(Destination)); Perf_by_Destination
##     Destination Flights.AL   Perf.AL Flights.AM   Perf.AM Difference
## 1   Los.Angeles        559 0.8890877        811 0.8557337 0.03335399
## 2       Phoenix        233 0.9484979       5255 0.9210276 0.02747026
## 3     San.Diego        232 0.9137931        448 0.8549107 0.05888239
## 4 San.Francisco        605 0.8314050        449 0.7126949 0.11871008
## 5       Seattle       2146 0.8578751        262 0.7671756 0.09069954