library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.2 --
## v ggplot2 3.4.0      v purrr   1.0.1 
## v tibble  3.1.6      v dplyr   1.0.10
## v tidyr   1.2.0      v stringr 1.5.0 
## v readr   2.1.2      v forcats 0.5.2
## Warning: package 'tibble' was built under R version 4.0.5
## Warning: package 'tidyr' was built under R version 4.0.5
## Warning: package 'readr' was built under R version 4.0.5
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

Data 607 Assignment 5

Given a messy data, the goal is to tidy and transform data to suit the need for data analysis and to answer the question: Which airline provide a better service in term of flight quality?

Read CSV

readfile <- data.frame(read.csv("https://raw.githubusercontent.com/eddiexunyc/607_assignment_5/main/tidy%20data.csv", header =TRUE))

head(readfile)
##       ï..       X Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1  ALASKA on time         497     221       212           503    1841
## 2         delayed          62      12        20           102     305
## 3                          NA      NA        NA            NA      NA
## 4 AM WEST on time         694    4840       383           320     301
## 5         delayed         117     415        65           129      61

Data Cleaning

Since the csv file is untidy and does not follow the principle of tidy data where:

  1. Every column is a variable
  2. Every row is an observation
  3. Every cell is a single value

The data will be cleansed to reflect that.

##split the data based on the row number
alaska_df <- readfile[1:2,]
amwest_df <- readfile[4:5,]

After splitting the data by airline, each airline data will be cleaned up.

##rename the column 
colnames(alaska_df) <- c("Airline", "Arrival Status", "Los Angeles", "Phoenix", "San Diego","San Francisco", "Seattle")

##fill in the null value with new value for Alaska
alaska_df$Airline[alaska_df$Airline == ""] <-c("ALASKA")

##pivot longer the city and value
alaska_tidy_data <- alaska_df %>%
  pivot_longer(cols = c("Los Angeles", "Phoenix", "San Diego","San Francisco", "Seattle"), names_to = "City", values_to = "Count")

knitr::kable(alaska_tidy_data)
Airline Arrival Status City Count
ALASKA on time Los Angeles 497
ALASKA on time Phoenix 221
ALASKA on time San Diego 212
ALASKA on time San Francisco 503
ALASKA on time Seattle 1841
ALASKA delayed Los Angeles 62
ALASKA delayed Phoenix 12
ALASKA delayed San Diego 20
ALASKA delayed San Francisco 102
ALASKA delayed Seattle 305
##rename the column
colnames(amwest_df) <- c("Airline", "Arrival Status", "Los Angeles", "Phoenix", "San Diego","San Francisco", "Seattle")

##fill in the null value with new value
amwest_df$Airline[amwest_df$Airline == ""] <-c("AM WEST")

##pivot longer the city and value
amwest_tidy_data <- amwest_df %>%
  pivot_longer(cols = c("Los Angeles", "Phoenix", "San Diego","San Francisco", "Seattle"), names_to = "City", values_to = "Count")

knitr::kable(amwest_tidy_data)
Airline Arrival Status City Count
AM WEST on time Los Angeles 694
AM WEST on time Phoenix 4840
AM WEST on time San Diego 383
AM WEST on time San Francisco 320
AM WEST on time Seattle 301
AM WEST delayed Los Angeles 117
AM WEST delayed Phoenix 415
AM WEST delayed San Diego 65
AM WEST delayed San Francisco 129
AM WEST delayed Seattle 61
##combine both amwest and alaska dataframe together
airline_tidy_data <- rbind(alaska_tidy_data, amwest_tidy_data)

knitr::kable(airline_tidy_data)
Airline Arrival Status City Count
ALASKA on time Los Angeles 497
ALASKA on time Phoenix 221
ALASKA on time San Diego 212
ALASKA on time San Francisco 503
ALASKA on time Seattle 1841
ALASKA delayed Los Angeles 62
ALASKA delayed Phoenix 12
ALASKA delayed San Diego 20
ALASKA delayed San Francisco 102
ALASKA delayed Seattle 305
AM WEST on time Los Angeles 694
AM WEST on time Phoenix 4840
AM WEST on time San Diego 383
AM WEST on time San Francisco 320
AM WEST on time Seattle 301
AM WEST delayed Los Angeles 117
AM WEST delayed Phoenix 415
AM WEST delayed San Diego 65
AM WEST delayed San Francisco 129
AM WEST delayed Seattle 61

Data Analysis

Now that the data is in a tidy format, the data analysis can be done on the airline data.

Let’s see the arrival status in a bar graph

#bar graph based on the arrival status by city.
ggplot(airline_tidy_data, aes(x = City, y= Count, fill = `Arrival Status`)) +
  geom_bar(stat = "identity", position = "dodge")

As shown, Phoenix have the highest count for “On-time” arrival status, compared to other cities.

#summarize the data
airline_summary <- airline_tidy_data %>% group_by(`Airline`, `Arrival Status`) %>% 
  summarize_at('Count', mean)

head(airline_summary)
## # A tibble: 4 x 3
## # Groups:   Airline [2]
##   Airline `Arrival Status` Count
##   <chr>   <chr>            <dbl>
## 1 ALASKA  delayed           100.
## 2 ALASKA  on time           655.
## 3 AM WEST delayed           157.
## 4 AM WEST on time          1308.

Comparison from a bar graph

ggplot(airline_summary, aes(x = `Airline`, y = `Count`, fill = `Arrival Status`)) +
  geom_bar(stat = "identity", position = "dodge")

Based on the graph, it shows that the AM WEST airline have more “on-time” arrival status than the ALASKA airline. Of course, this is not a definitive proof that the AM WEST airline is better than ALASKA airline.

To see how well each airline performs, the ratio of “on-time” and “delayed” will be calculated to determine their performance in every cities.

#change the data format to include "on-time' and "delayed" column through pivot wider
airline_tidy_data2 <- airline_tidy_data %>%
  pivot_wider(names_from = "Arrival Status", values_from = "Count")

airline_tidy_data2["Arrival Status Ratio"] <- round(airline_tidy_data2$`on time`/airline_tidy_data2$delayed, 2)

knitr::kable(airline_tidy_data2)
Airline City on time delayed Arrival Status Ratio
ALASKA Los Angeles 497 62 8.02
ALASKA Phoenix 221 12 18.42
ALASKA San Diego 212 20 10.60
ALASKA San Francisco 503 102 4.93
ALASKA Seattle 1841 305 6.04
AM WEST Los Angeles 694 117 5.93
AM WEST Phoenix 4840 415 11.66
AM WEST San Diego 383 65 5.89
AM WEST San Francisco 320 129 2.48
AM WEST Seattle 301 61 4.93

Chart view of the ratio

ggplot(airline_tidy_data2, aes(x = `City`, y = `Arrival Status Ratio`, color = `Airline`)) +
  geom_point()

Conclusion

In conclusion, despite the AM WEST airline have more “on-time” arrival flight than the ALASKA airline, the latter airline performs better due to on-time/delayed ratio.