untidy_data <- "https://raw.githubusercontent.com/Kingtilon1/DATA607/main/assignment5/flights.csv"
untidy <- data.frame(read.csv(untidy_data, header=TRUE, sep="\t"))
untidy
## X X.1 Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1 ALASKA on time 497 221 212 503 1841
## 2 delayed 62 12 20 102 305
## 3 NA NA NA NA NA
## 4 AM WEST on time 694 4840 383 320 201
## 5 delayed 117 415 65 129 61
I am going to give the table a name
colnames(untidy)[1] <- "Airline"
colnames(untidy)[2] <-"Tardiness"
I will now drop the row that has the na values
untidy <- drop_na(untidy)
fill the empty rows with NA
untidy[untidy==""] <-NA
tidy <- untidy %>% fill(Airline)
tidy
## Airline Tardiness Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1 ALASKA on time 497 221 212 503 1841
## 2 ALASKA delayed 62 12 20 102 305
## 3 AM WEST on time 694 4840 383 320 201
## 4 AM WEST delayed 117 415 65 129 61
using the pivot_longe function to reshape the data frame to a long format so that the columns from 3 to 7 are all organized under one column, as is their respective values
tidy <- pivot_longer(tidy, cols = 3:7, names_to = "city", values_to = "Frequency")
replacing all the . in between the cities
tidy$city <- str_replace_all(tidy$city, "\\.", " ")
tidy
## # A tibble: 20 × 4
## Airline Tardiness city Frequency
## <chr> <chr> <chr> <int>
## 1 ALASKA on time Los Angeles 497
## 2 ALASKA on time Phoenix 221
## 3 ALASKA on time San Diego 212
## 4 ALASKA on time San Francisco 503
## 5 ALASKA on time Seattle 1841
## 6 ALASKA delayed Los Angeles 62
## 7 ALASKA delayed Phoenix 12
## 8 ALASKA delayed San Diego 20
## 9 ALASKA delayed San Francisco 102
## 10 ALASKA delayed Seattle 305
## 11 AM WEST on time Los Angeles 694
## 12 AM WEST on time Phoenix 4840
## 13 AM WEST on time San Diego 383
## 14 AM WEST on time San Francisco 320
## 15 AM WEST on time Seattle 201
## 16 AM WEST delayed Los Angeles 117
## 17 AM WEST delayed Phoenix 415
## 18 AM WEST delayed San Diego 65
## 19 AM WEST delayed San Francisco 129
## 20 AM WEST delayed Seattle 61
By calclating the total amount of flights and the total amount of delayed, and ontime flight, I can calculate the on time rate and the delayed rate for each airline
airline_percentage <-
tidy %>%
group_by(Airline) %>%
summarise(
num_flights = sum(Frequency),
num_delays = sum(ifelse(Tardiness=='delayed', Frequency, 0))
)
airline_percentage <-
airline_percentage %>%
mutate(
num_on_time = num_flights - num_delays,
delay_rate = num_delays / num_flights,
on_time_rate = (num_flights - num_delays) / num_flights
) %>%
arrange(delay_rate)
airline_percentage
## # A tibble: 2 × 6
## Airline num_flights num_delays num_on_time delay_rate on_time_rate
## <chr> <int> <dbl> <dbl> <dbl> <dbl>
## 1 AM WEST 7225 787 6438 0.109 0.891
## 2 ALASKA 3775 501 3274 0.133 0.867
Alaska Airlines on time rate: 86.7 percent, delayed rate 13.2% Based off of this data, I can conclude, that on average, AM WEST Airlines is on time 3% more than Alaska airlines for a total of 89%.As well as the fact that AM WEST Airlines has a lower delay rate than Alaska Airlines which is also lower by 2 percent for a total of 11 percent if you round up.