library(tidyr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(stringr)
# Get data from file I uploaded to github
theURL <- "https://raw.githubusercontent.com/bpersaud104/Data607/master/Data%20607%20Week%205%20Assignment.csv"
Airlines <- read.table(file = theURL, header = TRUE, sep = ",")
head(Airlines)
## X1 X2 Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1 ALASKA on time 497 221 212 503 1841
## 2 delayed 62 12 20 102 305
## 3 NA NA NA NA NA
## 4 AM WEST on time 694 4840 383 320 201
## 5 delayed 117 415 65 129 61
Airlines_2 <- Airlines[-3, ]
Airlines_2[2,1] <- 'ALASKA'
Airlines_2[4,1] <- 'AM WEST'
Airlines_2
## X1 X2 Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1 ALASKA on time 497 221 212 503 1841
## 2 ALASKA delayed 62 12 20 102 305
## 4 AM WEST on time 694 4840 383 320 201
## 5 AM WEST delayed 117 415 65 129 61
I removed the blank space seperating the two airlines and added the airlines names in the first column for the delayed rows since they were blank.
Airlines_tidy <- Airlines_2 %>%
rename('Airline' = 'X1', 'Airline_Status' = 'X2')
Airlines_tidy <- Airlines_tidy %>%
gather(key = Location, value = Flight_Count, 3:7)
# Get rid of the periods in the location's name
Airlines_tidy$Location <- str_replace_all(Airlines_tidy$Location,"\\.", " ")
Airlines_tidy
## Airline Airline_Status Location Flight_Count
## 1 ALASKA on time Los Angeles 497
## 2 ALASKA delayed Los Angeles 62
## 3 AM WEST on time Los Angeles 694
## 4 AM WEST delayed Los Angeles 117
## 5 ALASKA on time Phoenix 221
## 6 ALASKA delayed Phoenix 12
## 7 AM WEST on time Phoenix 4840
## 8 AM WEST delayed Phoenix 415
## 9 ALASKA on time San Diego 212
## 10 ALASKA delayed San Diego 20
## 11 AM WEST on time San Diego 383
## 12 AM WEST delayed San Diego 65
## 13 ALASKA on time San Francisco 503
## 14 ALASKA delayed San Francisco 102
## 15 AM WEST on time San Francisco 320
## 16 AM WEST delayed San Francisco 129
## 17 ALASKA on time Seattle 1841
## 18 ALASKA delayed Seattle 305
## 19 AM WEST on time Seattle 201
## 20 AM WEST delayed Seattle 61
I renamed X1 and X2 to Airline and Airline_Status to give them a proper column name that fits the data. The rest of the column names were all different locations, so to tidy the data I took all of them and put them into a single column that I named Location. Each row then had a different number to show the flights that were either on time or delayed. To tidy the data even more I took all the numbers and put them into a single column that I named Flight_Count.
Airlines_tidy %>%
group_by(Airline, Airline_Status) %>%
summarise(Total = sum(Flight_Count))
## # A tibble: 4 x 3
## # Groups: Airline [2]
## Airline Airline_Status Total
## <fct> <fct> <int>
## 1 ALASKA delayed 501
## 2 ALASKA on time 3274
## 3 AM WEST delayed 787
## 4 AM WEST on time 6438
We can see the number of delayed flights for both airlines. Alaska Airlines has less delays than AM West Airlines but AM West Airlines has way more flights. Let’s find the ratio of delayed flights for both airlines to get a better analysis.
Airlines_ratio <- Airlines_tidy %>%
group_by(Airline, Airline_Status) %>%
summarise(Total = sum(Flight_Count)) %>%
mutate(Ratio = Total / sum(Total))
Airlines_ratio
## # A tibble: 4 x 4
## # Groups: Airline [2]
## Airline Airline_Status Total Ratio
## <fct> <fct> <int> <dbl>
## 1 ALASKA delayed 501 0.133
## 2 ALASKA on time 3274 0.867
## 3 AM WEST delayed 787 0.109
## 4 AM WEST on time 6438 0.891
As you can see, the ratio of delayed flights is lower for Alaska Airlines compared to AM West Airlines. Alaska Airline has a delayed ratio of 13.27% and AM West has a delayed ratio of 10.90%.
Let’s look at the ratios by location.
Airlines_delay <- Airlines_tidy %>%
group_by(Airline, Airline_Status, Location) %>%
summarise(Total = sum(Flight_Count)) %>%
mutate(Ratio = Total / sum(Total))
Airlines_delay
## # A tibble: 20 x 5
## # Groups: Airline, Airline_Status [4]
## Airline Airline_Status Location Total Ratio
## <fct> <fct> <chr> <int> <dbl>
## 1 ALASKA delayed Los Angeles 62 0.124
## 2 ALASKA delayed Phoenix 12 0.0240
## 3 ALASKA delayed San Diego 20 0.0399
## 4 ALASKA delayed San Francisco 102 0.204
## 5 ALASKA delayed Seattle 305 0.609
## 6 ALASKA on time Los Angeles 497 0.152
## 7 ALASKA on time Phoenix 221 0.0675
## 8 ALASKA on time San Diego 212 0.0648
## 9 ALASKA on time San Francisco 503 0.154
## 10 ALASKA on time Seattle 1841 0.562
## 11 AM WEST delayed Los Angeles 117 0.149
## 12 AM WEST delayed Phoenix 415 0.527
## 13 AM WEST delayed San Diego 65 0.0826
## 14 AM WEST delayed San Francisco 129 0.164
## 15 AM WEST delayed Seattle 61 0.0775
## 16 AM WEST on time Los Angeles 694 0.108
## 17 AM WEST on time Phoenix 4840 0.752
## 18 AM WEST on time San Diego 383 0.0595
## 19 AM WEST on time San Francisco 320 0.0497
## 20 AM WEST on time Seattle 201 0.0312
Based on the data analysis, the number of delays vary by the number of flights the airline does and the ratio of delayed flights. Alaska Airlines has less delays throughout every city except for Seattle. But the ratio for Seattle is higher for AM West Airlines than Alaska Airlines. The total ratio for delays is higher for Alaska Airlines than AM West Airlines, despite Alaska Airlines having less total delays. In conclusion, it is safe to say that the more flights you have, the more delays you should expect to see as shown in the data. AM West Airlines has way more flights than Alaska Airlines and you can see that it has more delays as well.