library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.2 --
## v ggplot2 3.4.0 v purrr 1.0.1
## v tibble 3.1.6 v dplyr 1.0.10
## v tidyr 1.2.0 v stringr 1.5.0
## v readr 2.1.2 v forcats 0.5.2
## Warning: package 'tibble' was built under R version 4.0.5
## Warning: package 'tidyr' was built under R version 4.0.5
## Warning: package 'readr' was built under R version 4.0.5
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
Given a messy data, the goal is to tidy and transform data to suit the need for data analysis and to answer the question: Which airline provide a better service in term of flight quality?
readfile <- data.frame(read.csv("https://raw.githubusercontent.com/eddiexunyc/607_assignment_5/main/tidy%20data.csv", header =TRUE))
head(readfile)
## ï.. X Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1 ALASKA on time 497 221 212 503 1841
## 2 delayed 62 12 20 102 305
## 3 NA NA NA NA NA
## 4 AM WEST on time 694 4840 383 320 301
## 5 delayed 117 415 65 129 61
Since the csv file is untidy and does not follow the principle of tidy data where:
The data will be cleansed to reflect that.
##split the data based on the row number
alaska_df <- readfile[1:2,]
amwest_df <- readfile[4:5,]
After splitting the data by airline, each airline data will be cleaned up.
##rename the column
colnames(alaska_df) <- c("Airline", "Arrival Status", "Los Angeles", "Phoenix", "San Diego","San Francisco", "Seattle")
##fill in the null value with new value for Alaska
alaska_df$Airline[alaska_df$Airline == ""] <-c("ALASKA")
##pivot longer the city and value
alaska_tidy_data <- alaska_df %>%
pivot_longer(cols = c("Los Angeles", "Phoenix", "San Diego","San Francisco", "Seattle"), names_to = "City", values_to = "Count")
knitr::kable(alaska_tidy_data)
| Airline | Arrival Status | City | Count |
|---|---|---|---|
| ALASKA | on time | Los Angeles | 497 |
| ALASKA | on time | Phoenix | 221 |
| ALASKA | on time | San Diego | 212 |
| ALASKA | on time | San Francisco | 503 |
| ALASKA | on time | Seattle | 1841 |
| ALASKA | delayed | Los Angeles | 62 |
| ALASKA | delayed | Phoenix | 12 |
| ALASKA | delayed | San Diego | 20 |
| ALASKA | delayed | San Francisco | 102 |
| ALASKA | delayed | Seattle | 305 |
##rename the column
colnames(amwest_df) <- c("Airline", "Arrival Status", "Los Angeles", "Phoenix", "San Diego","San Francisco", "Seattle")
##fill in the null value with new value
amwest_df$Airline[amwest_df$Airline == ""] <-c("AM WEST")
##pivot longer the city and value
amwest_tidy_data <- amwest_df %>%
pivot_longer(cols = c("Los Angeles", "Phoenix", "San Diego","San Francisco", "Seattle"), names_to = "City", values_to = "Count")
knitr::kable(amwest_tidy_data)
| Airline | Arrival Status | City | Count |
|---|---|---|---|
| AM WEST | on time | Los Angeles | 694 |
| AM WEST | on time | Phoenix | 4840 |
| AM WEST | on time | San Diego | 383 |
| AM WEST | on time | San Francisco | 320 |
| AM WEST | on time | Seattle | 301 |
| AM WEST | delayed | Los Angeles | 117 |
| AM WEST | delayed | Phoenix | 415 |
| AM WEST | delayed | San Diego | 65 |
| AM WEST | delayed | San Francisco | 129 |
| AM WEST | delayed | Seattle | 61 |
##combine both amwest and alaska dataframe together
airline_tidy_data <- rbind(alaska_tidy_data, amwest_tidy_data)
knitr::kable(airline_tidy_data)
| Airline | Arrival Status | City | Count |
|---|---|---|---|
| ALASKA | on time | Los Angeles | 497 |
| ALASKA | on time | Phoenix | 221 |
| ALASKA | on time | San Diego | 212 |
| ALASKA | on time | San Francisco | 503 |
| ALASKA | on time | Seattle | 1841 |
| ALASKA | delayed | Los Angeles | 62 |
| ALASKA | delayed | Phoenix | 12 |
| ALASKA | delayed | San Diego | 20 |
| ALASKA | delayed | San Francisco | 102 |
| ALASKA | delayed | Seattle | 305 |
| AM WEST | on time | Los Angeles | 694 |
| AM WEST | on time | Phoenix | 4840 |
| AM WEST | on time | San Diego | 383 |
| AM WEST | on time | San Francisco | 320 |
| AM WEST | on time | Seattle | 301 |
| AM WEST | delayed | Los Angeles | 117 |
| AM WEST | delayed | Phoenix | 415 |
| AM WEST | delayed | San Diego | 65 |
| AM WEST | delayed | San Francisco | 129 |
| AM WEST | delayed | Seattle | 61 |
Now that the data is in a tidy format, the data analysis can be done on the airline data.
Let’s see the arrival status in a bar graph
#bar graph based on the arrival status by city.
ggplot(airline_tidy_data, aes(x = City, y= Count, fill = `Arrival Status`)) +
geom_bar(stat = "identity", position = "dodge")
As shown, Phoenix have the highest count for “On-time” arrival status, compared to other cities.
#summarize the data
airline_summary <- airline_tidy_data %>% group_by(`Airline`, `Arrival Status`) %>%
summarize_at('Count', mean)
head(airline_summary)
## # A tibble: 4 x 3
## # Groups: Airline [2]
## Airline `Arrival Status` Count
## <chr> <chr> <dbl>
## 1 ALASKA delayed 100.
## 2 ALASKA on time 655.
## 3 AM WEST delayed 157.
## 4 AM WEST on time 1308.
Comparison from a bar graph
ggplot(airline_summary, aes(x = `Airline`, y = `Count`, fill = `Arrival Status`)) +
geom_bar(stat = "identity", position = "dodge")
Based on the graph, it shows that the AM WEST airline have more “on-time” arrival status than the ALASKA airline. Of course, this is not a definitive proof that the AM WEST airline is better than ALASKA airline.
To see how well each airline performs, the ratio of “on-time” and “delayed” will be calculated to determine their performance in every cities.
#change the data format to include "on-time' and "delayed" column through pivot wider
airline_tidy_data2 <- airline_tidy_data %>%
pivot_wider(names_from = "Arrival Status", values_from = "Count")
airline_tidy_data2["Arrival Status Ratio"] <- round(airline_tidy_data2$`on time`/airline_tidy_data2$delayed, 2)
knitr::kable(airline_tidy_data2)
| Airline | City | on time | delayed | Arrival Status Ratio |
|---|---|---|---|---|
| ALASKA | Los Angeles | 497 | 62 | 8.02 |
| ALASKA | Phoenix | 221 | 12 | 18.42 |
| ALASKA | San Diego | 212 | 20 | 10.60 |
| ALASKA | San Francisco | 503 | 102 | 4.93 |
| ALASKA | Seattle | 1841 | 305 | 6.04 |
| AM WEST | Los Angeles | 694 | 117 | 5.93 |
| AM WEST | Phoenix | 4840 | 415 | 11.66 |
| AM WEST | San Diego | 383 | 65 | 5.89 |
| AM WEST | San Francisco | 320 | 129 | 2.48 |
| AM WEST | Seattle | 301 | 61 | 4.93 |
Chart view of the ratio
ggplot(airline_tidy_data2, aes(x = `City`, y = `Arrival Status Ratio`, color = `Airline`)) +
geom_point()
In conclusion, despite the AM WEST airline have more “on-time” arrival flight than the ALASKA airline, the latter airline performs better due to on-time/delayed ratio.