library(tidyr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(stringr)
# Get data from file I uploaded to github
theURL <- "https://raw.githubusercontent.com/bpersaud104/Data607/master/Data%20607%20Week%205%20Assignment.csv"
Airlines <- read.table(file = theURL, header = TRUE, sep = ",")
head(Airlines)
##        X1      X2 Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1  ALASKA on time         497     221       212           503    1841
## 2         delayed          62      12        20           102     305
## 3                          NA      NA        NA            NA      NA
## 4 AM WEST on time         694    4840       383           320     201
## 5         delayed         117     415        65           129      61

Data Tidying and Transformations

Airlines_2 <- Airlines[-3, ]
Airlines_2[2,1] <- 'ALASKA'
Airlines_2[4,1] <- 'AM WEST'
Airlines_2
##        X1      X2 Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1  ALASKA on time         497     221       212           503    1841
## 2  ALASKA delayed          62      12        20           102     305
## 4 AM WEST on time         694    4840       383           320     201
## 5 AM WEST delayed         117     415        65           129      61

I removed the blank space seperating the two airlines and added the airlines names in the first column for the delayed rows since they were blank.

Airlines_tidy <- Airlines_2 %>% 
rename('Airline' = 'X1', 'Airline_Status' = 'X2')
Airlines_tidy <- Airlines_tidy %>%
gather(key = Location, value = Flight_Count, 3:7)
# Get rid of the periods in the location's name
Airlines_tidy$Location <- str_replace_all(Airlines_tidy$Location,"\\.", " ") 
Airlines_tidy
##    Airline Airline_Status      Location Flight_Count
## 1   ALASKA        on time   Los Angeles          497
## 2   ALASKA        delayed   Los Angeles           62
## 3  AM WEST        on time   Los Angeles          694
## 4  AM WEST        delayed   Los Angeles          117
## 5   ALASKA        on time       Phoenix          221
## 6   ALASKA        delayed       Phoenix           12
## 7  AM WEST        on time       Phoenix         4840
## 8  AM WEST        delayed       Phoenix          415
## 9   ALASKA        on time     San Diego          212
## 10  ALASKA        delayed     San Diego           20
## 11 AM WEST        on time     San Diego          383
## 12 AM WEST        delayed     San Diego           65
## 13  ALASKA        on time San Francisco          503
## 14  ALASKA        delayed San Francisco          102
## 15 AM WEST        on time San Francisco          320
## 16 AM WEST        delayed San Francisco          129
## 17  ALASKA        on time       Seattle         1841
## 18  ALASKA        delayed       Seattle          305
## 19 AM WEST        on time       Seattle          201
## 20 AM WEST        delayed       Seattle           61

I renamed X1 and X2 to Airline and Airline_Status to give them a proper column name that fits the data. The rest of the column names were all different locations, so to tidy the data I took all of them and put them into a single column that I named Location. Each row then had a different number to show the flights that were either on time or delayed. To tidy the data even more I took all the numbers and put them into a single column that I named Flight_Count.

Data Analysis

Airlines_tidy %>%
group_by(Airline, Airline_Status) %>%
summarise(Total = sum(Flight_Count))
## # A tibble: 4 x 3
## # Groups:   Airline [2]
##   Airline Airline_Status Total
##   <fct>   <fct>          <int>
## 1 ALASKA  delayed          501
## 2 ALASKA  on time         3274
## 3 AM WEST delayed          787
## 4 AM WEST on time         6438

We can see the number of delayed flights for both airlines. Alaska Airlines has less delays than AM West Airlines but AM West Airlines has way more flights. Let’s find the ratio of delayed flights for both airlines to get a better analysis.

Airlines_ratio <- Airlines_tidy %>%
group_by(Airline, Airline_Status) %>%
summarise(Total = sum(Flight_Count)) %>%
mutate(Ratio = Total / sum(Total))
Airlines_ratio
## # A tibble: 4 x 4
## # Groups:   Airline [2]
##   Airline Airline_Status Total Ratio
##   <fct>   <fct>          <int> <dbl>
## 1 ALASKA  delayed          501 0.133
## 2 ALASKA  on time         3274 0.867
## 3 AM WEST delayed          787 0.109
## 4 AM WEST on time         6438 0.891

As you can see, the ratio of delayed flights is lower for Alaska Airlines compared to AM West Airlines. Alaska Airline has a delayed ratio of 13.27% and AM West has a delayed ratio of 10.90%.

Let’s look at the ratios by location.

Airlines_delay <- Airlines_tidy %>%
group_by(Airline, Airline_Status, Location) %>%
summarise(Total = sum(Flight_Count)) %>%
mutate(Ratio = Total / sum(Total))
Airlines_delay
## # A tibble: 20 x 5
## # Groups:   Airline, Airline_Status [4]
##    Airline Airline_Status Location      Total  Ratio
##    <fct>   <fct>          <chr>         <int>  <dbl>
##  1 ALASKA  delayed        Los Angeles      62 0.124 
##  2 ALASKA  delayed        Phoenix          12 0.0240
##  3 ALASKA  delayed        San Diego        20 0.0399
##  4 ALASKA  delayed        San Francisco   102 0.204 
##  5 ALASKA  delayed        Seattle         305 0.609 
##  6 ALASKA  on time        Los Angeles     497 0.152 
##  7 ALASKA  on time        Phoenix         221 0.0675
##  8 ALASKA  on time        San Diego       212 0.0648
##  9 ALASKA  on time        San Francisco   503 0.154 
## 10 ALASKA  on time        Seattle        1841 0.562 
## 11 AM WEST delayed        Los Angeles     117 0.149 
## 12 AM WEST delayed        Phoenix         415 0.527 
## 13 AM WEST delayed        San Diego        65 0.0826
## 14 AM WEST delayed        San Francisco   129 0.164 
## 15 AM WEST delayed        Seattle          61 0.0775
## 16 AM WEST on time        Los Angeles     694 0.108 
## 17 AM WEST on time        Phoenix        4840 0.752 
## 18 AM WEST on time        San Diego       383 0.0595
## 19 AM WEST on time        San Francisco   320 0.0497
## 20 AM WEST on time        Seattle         201 0.0312

Conclusion

Based on the data analysis, the number of delays vary by the number of flights the airline does and the ratio of delayed flights. Alaska Airlines has less delays throughout every city except for Seattle. But the ratio for Seattle is higher for AM West Airlines than Alaska Airlines. The total ratio for delays is higher for Alaska Airlines than AM West Airlines, despite Alaska Airlines having less total delays. In conclusion, it is safe to say that the more flights you have, the more delays you should expect to see as shown in the data. AM West Airlines has way more flights than Alaska Airlines and you can see that it has more delays as well.